# NLP Text Generation

*Course: Machine Learning Projects with TensorFlow 2.0 by Vlad Sebastian Ionescu*

*Data: https://www.kaggle.com/kazanova/sentiment140*

In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

## 1. Load data

In [2]:
# Load the raw tweets CSV. The file is read without a header row, so columns
# are addressed by position (0..5).
SEED = 42
data = pd.read_csv('data/tweets.csv', encoding='latin-1', header=None)
# Shuffle every row. A fixed random_state makes the shuffle, and therefore the
# positional train/test split performed later, reproducible across
# "Restart Kernel -> Run All".
data = data.sample(frac=1, random_state=SEED)
data.head()

Unnamed: 0,0,1,2,3,4,5
351005,0,2018151132,Wed Jun 03 09:31:03 PDT 2009,NO_QUERY,loahriot,it gets harder each time he leaves. I haven't...
906944,4,1695583037,Mon May 04 06:29:11 PDT 2009,NO_QUERY,JaneHungOz,"@1critic Yep, ah, damn, I don't wanna leave my..."
1326113,4,2015220665,Wed Jun 03 04:03:25 PDT 2009,NO_QUERY,tenyentegebs,be my fellawor.
1590265,4,2191371977,Tue Jun 16 05:08:08 PDT 2009,NO_QUERY,Missbabyp,"omg the test went great! i'm happy, i'm very h..."
1086331,4,1969298360,Fri May 29 23:36:06 PDT 2009,NO_QUERY,rachelteamICO,Testing that my laptop works from home Yes it...


## 2. Data Processing

In [3]:
# Positional 70/30 split of the already-shuffled frame.
train_size = int(0.7*len(data))
features = data[5]
target = data[0]
X_train, X_test = features.values[:train_size], features.values[train_size:]

# Work on a private copy of the label column. `target.values` can share memory
# with `data`, so writing through it would silently rewrite data[0] (making
# this cell non-idempotent on re-run) or fail outright when pandas hands back
# a read-only array.
labels = target.values.copy()

# Remap the raw codes 2 -> 1 and 4 -> 2 (0 stays 0) so the labels are the
# contiguous class ids 0, 1, 2. Both masks are taken from the untouched codes
# before any write, so the two assignments cannot interfere.
is_code_2 = labels == 2
is_code_4 = labels == 4
labels[is_code_2] = 1
labels[is_code_4] = 2

y_train, y_test = labels[:train_size], labels[train_size:]

In [4]:
# Bag-of-words encoding with the vocabulary capped at 100 terms.
# The vocabulary is learned from the training tweets only and then reused,
# unchanged, to encode the test tweets.
count_vectorizer = CountVectorizer(max_features=100)
train_counts_sparse = count_vectorizer.fit_transform(X_train)
test_counts_sparse = count_vectorizer.transform(X_test)

# Densify for the Keras model below.
X_train_num = train_counts_sparse.toarray()
X_test_num = test_counts_sparse.toarray()

In [5]:
# Show one raw training tweet next to its bag-of-words count vector.
instance = 14
print(X_train[instance], X_train_num[instance], sep='\n')

@ajam247 ty! I haven't been there since we went in '97! I'll ride our ride, Vortex, for ya! 
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


## 3. Text Classifier

In [6]:
# Small feed-forward classifier over the bag-of-words counts.
classifier_layers = [
    tf.keras.layers.Dense(256, activation=tf.nn.leaky_relu),
    tf.keras.layers.Dropout(0.5),
    # Three raw logits (no activation); the loss below is told to expect logits.
    tf.keras.layers.Dense(3),
]
model = tf.keras.models.Sequential(classifier_layers)

adam_optimizer = tf.keras.optimizers.Adam()
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam_optimizer, loss=logit_loss, metrics=['accuracy'])

print(y_train)

# The last 10% of the training arrays is held out for validation.
fit_options = dict(batch_size=64, epochs=2, validation_split=0.1, verbose=2)
model.fit(X_train_num, y_train, **fit_options)

[0 2 0 ... 0 2 0]
Epoch 1/2
1575/1575 - 3s - loss: 0.6390 - accuracy: 0.6417 - val_loss: 0.6189 - val_accuracy: 0.6590
Epoch 2/2
1575/1575 - 3s - loss: 0.6184 - accuracy: 0.6546 - val_loss: 0.6164 - val_accuracy: 0.6619


<tensorflow.python.keras.callbacks.History at 0x7fe55ec76450>

## 4. Text Generation

### 4.1 Data Processing

In [3]:
# Glue every tweet (column 5) into one long corpus, separated by single spaces.
tweets = data[5]
text = ' '.join(tweets)
text[:300]

" it gets harder each time he leaves. I haven't been able to get a goodnight sleep since then. i miss him @1critic Yep, ah, damn, I don't wanna leave my warm doona to get a hot beverage...I guess I'll just have to make do w/o it  be my fellawor.  omg the test went great! i'm happy, i'm very happy!! a"

In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

# Two-way lookup: id -> character (array) and character -> id (dict)
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Encode the whole corpus as an array of character ids
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
text_as_int

193 unique characters


array([ 1, 71, 82, ..., 19, 19, 19])

In [5]:
# Number of input characters in one training sequence
seq_length = 64
# Each example consumes seq_length + 1 characters of the corpus
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset that yields the corpus one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five characters, decoded back from their ids
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

 
i
t
 
g


In [6]:
# Group the character stream into fixed-size chunks of seq_length + 1 ids;
# a trailing partial chunk is dropped.
chunk_size = seq_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

# Decode and show the first five chunks
for chunk in sequences.take(5):
    chunk_text = ''.join(idx2char[chunk.numpy()])
    print(repr(chunk_text))

" it gets harder each time he leaves. I haven't been able to get a"
' goodnight sleep since then. i miss him @1critic Yep, ah, damn, I'
" don't wanna leave my warm doona to get a hot beverage...I guess "
"I'll just have to make do w/o it  be my fellawor.  omg the test w"
"ent great! i'm happy, i'm very happy!! and i finally have wireles"


In [7]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[k] is the element that
    directly follows input[k] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

# Decode the first (input, target) pair back to text
for input_example, target_example in dataset.take(1):
    decoded_input = ''.join(idx2char[input_example.numpy()])
    decoded_target = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(decoded_input))
    print('Target data: ', repr(decoded_target))

# Walk through the first five time steps of that pair: at each step the model
# sees one character id and is expected to produce the next one.
first_steps = zip(input_example[:5], target_example[:5])
for i, (input_idx, target_idx) in enumerate(first_steps):
    print(f'Step {i:4d}')
    print(f' input: {input_idx} ({idx2char[input_idx]!r:s})')
    print(f' expected_output: {target_idx} ({idx2char[target_idx]!r:s})')

Input data:  " it gets harder each time he leaves. I haven't been able to get "
Target data:  "it gets harder each time he leaves. I haven't been able to get a"
Step    0
 input: 1 (' ')
 expected_output: 71 ('i')
Step    1
 input: 71 ('i')
 expected_output: 82 ('t')
Step    2
 input: 82 ('t')
 expected_output: 1 (' ')
Step    3
 input: 1 (' ')
 expected_output: 69 ('g')
Step    4
 input: 69 ('g')
 expected_output: 67 ('e')


### 4.2 Training

In [8]:
BATCH_SIZE = 64

# Buffer size to shuffle the dataset in memory
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` instead of re-wrapping `dataset`, so
# re-running this cell yields the same (BATCH_SIZE, seq_length) batches rather
# than batching an already-batched dataset a second time.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 64), (64, 64)), types: (tf.int64, tf.int64)>

In [9]:
def get_model(batch_size, vocab, embedding_dim=256, rnn_units=512):
    """Build the character-level network: Embedding -> LSTM -> Dense.

    Args:
        batch_size: Batch dimension fixed into the model's input shape.
        vocab: Collection of characters; only its length is used, as both
            the embedding input size and the width of the output layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.

    Returns:
        An uncompiled tf.keras.Sequential model that emits one output value
        per vocabulary entry at every time step (no output activation).
    """
    vocab_size = len(vocab)

    char_embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        # Batch size is fixed; the sequence length is left open.
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    logits_head = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([char_embedding, recurrent, logits_head])


model = get_model(BATCH_SIZE, vocab)

In [10]:
# Run the not-yet-trained model on a single batch to inspect its raw output
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)

# Sample one character id per time step from the first sequence of the batch
first_sequence_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_sequence_logits, num_samples=1),
    axis=1,
).numpy()
sampled_indices

array([ 99,  58, 153,  35,  38, 177,  80,   7,  13, 115,  58,  89, 177,
       109, 184,  34, 172, 126,   5, 144, 156, 164,  48,   8, 177, 147,
        60,  31,  24, 192,  43,  24,  73,  14,  68,  76, 105,  89,  70,
       190,  31,   2,  74, 147, 124, 158, 153,  18, 137,  87,   9,  31,
       103, 114, 152, 172, 141, 113, 159, 115,  89,  24,   3, 111])

In [11]:
# Decode the first input sequence and the ids sampled for it back to text
decoded_input = ''.join(idx2char[input_example_batch[0]])
decoded_prediction = ''.join(idx2char[sampled_indices])

print('Input: \n', repr(decoded_input))
print()
print('Next char prediction: \n', repr(decoded_prediction))

Input: 
 "cent! lol jk I'll accept you as a sister =P what else are you? b"

Next char prediction: 
 "\x86\\ÂEHàr'-\x99\\{à\x93çDØ¥%·ÅÍR(àº^A8ïM8k.fn\x8c{híA!lº£ÇÂ2°y)A\x8a\x98¿Ø´\x97È\x99{8#\x95"


In [None]:
EPOCHS = 1

# Weights-only checkpoints written under checkpoint_dir; the '{epoch}'
# placeholder in the prefix is filled in by the callback.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

# The model's final Dense layer has no activation, hence from_logits=True.
next_char_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=next_char_loss)

model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback], verbose=2)

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Sample text from the model one character at a time, seeded by start_string.

    Relies on the module-level lookup tables `char2idx` (character -> id)
    and `idx2char` (id -> character).

    Args:
        model: Model called as model(ids) on a single sequence of ids; its
            recurrent state is reset once before generation starts.
            NOTE(review): expected to be built with batch size 1 - confirm
            against the caller.
        start_string: Non-empty seed text. Every character must be a key of
            `char2idx`.
        num_generate: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the model output before
            sampling. Low temperature -> more predictable text,
            high temperature -> more surprising text.

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: If start_string is empty or temperature is not positive.
        KeyError: If start_string contains a character missing from char2idx.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Converting the start_string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    # Add a leading batch dimension of size 1
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension again
        predictions = tf.squeeze(predictions, 0)

        # Using a categorical distribution to predict the char returned by the model;
        # only the sample drawn for the last time step is kept.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # We pass the predicted char to the model along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [None]:
# Same architecture as the training model, rebuilt with a batch size of 1 so
# it can be fed one sequence at a time during generation.
generating_model = get_model(1, vocab)

latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir='./training_checkpoints')
if latest_checkpoint is None:
    # latest_checkpoint returns None when nothing has been saved yet; fail
    # here with a clear message instead of an obscure error in load_weights.
    raise FileNotFoundError(
        "No checkpoint found in './training_checkpoints'; run the training cell first."
    )

generating_model.load_weights(latest_checkpoint)
generating_model.build(tf.TensorShape([1, None]))

In [None]:
# Generate text seeded with "Well, " (the closing quote was missing, which made this cell a SyntaxError).
print(generate_text(generating_model, start_string=u'Well, '))