# Predicting the Next Token in Tweets Using LSTM and TensorFlow



In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('preprocessed_tweets.csv')

In [3]:
df

Unnamed: 0,tweet,tokens
0,awww that is a bummer you shoulda got davi...,"['awww', 'that', 'is', 'a', 'bummer', 'you', '..."
1,is upset that he cannot update his facebook by...,"['is', 'upset', 'that', 'he', 'cannot', 'updat..."
2,i dived many times for the ball managed to sa...,"['i', 'dived', 'many', 'times', 'for', 'the', ..."
3,my whole body feels itchy and like its on fire,"['my', 'whole', 'body', 'feels', 'itchy', 'and..."
4,no it is not behaving at all i am mad why am ...,"['no', 'it', 'is', 'not', 'behaving', 'at', 'a..."
...,...,...
1599995,just woke up having no school is the best feel...,"['just', 'woke', 'up', 'having', 'no', 'school..."
1599996,thewdbcom very cool to hear old walt intervie...,"['thewdbcom', 'very', 'cool', 'to', 'hear', 'o..."
1599997,are you ready for your mojo makeover ask me fo...,"['are', 'you', 'ready', 'for', 'your', 'mojo',..."
1599998,happy th birthday to my boo of alll time tupac...,"['happy', 'th', 'birthday', 'to', 'my', 'boo',..."


## 2. Load and Prepare Data

Next, we load the preprocessed tweets and prepare them for training. This involves tokenizing the text data and creating sequences that will be used as input to our LSTM model.


In [4]:
df

Unnamed: 0,tweet,tokens
0,awww that is a bummer you shoulda got davi...,"['awww', 'that', 'is', 'a', 'bummer', 'you', '..."
1,is upset that he cannot update his facebook by...,"['is', 'upset', 'that', 'he', 'cannot', 'updat..."
2,i dived many times for the ball managed to sa...,"['i', 'dived', 'many', 'times', 'for', 'the', ..."
3,my whole body feels itchy and like its on fire,"['my', 'whole', 'body', 'feels', 'itchy', 'and..."
4,no it is not behaving at all i am mad why am ...,"['no', 'it', 'is', 'not', 'behaving', 'at', 'a..."
...,...,...
1599995,just woke up having no school is the best feel...,"['just', 'woke', 'up', 'having', 'no', 'school..."
1599996,thewdbcom very cool to hear old walt intervie...,"['thewdbcom', 'very', 'cool', 'to', 'hear', 'o..."
1599997,are you ready for your mojo makeover ask me fo...,"['are', 'you', 'ready', 'for', 'your', 'mojo',..."
1599998,happy th birthday to my boo of alll time tupac...,"['happy', 'th', 'birthday', 'to', 'my', 'boo',..."


In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['tweet'])
total_words = len(tokenizer.word_index) + 1

In [6]:
print(f"There are a total of {total_words} unique words")

There are a total of 417963 unique words


In [7]:
input_sequences = []
for line in df['tweet']:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [8]:
input_sequences

[[432, 16],
 [432, 16, 4],
 [432, 16, 4, 5],
 [432, 16, 4, 5, 1155],
 [432, 16, 4, 5, 1155, 6],
 [432, 16, 4, 5, 1155, 6, 3356],
 [432, 16, 4, 5, 1155, 6, 3356, 41],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819, 9591],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819, 9591, 14],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819, 9591, 14, 1782],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819, 9591, 14, 1782, 34],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819, 9591, 14, 1782, 34, 2],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819, 9591, 14, 1782, 34, 2, 20],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819, 9591, 14, 1782, 34, 2, 20, 8],
 [432, 16, 4, 5, 1155, 6, 3356, 41, 819, 9591, 14, 1782, 34, 2, 20, 8, 365],
 [4, 733],
 [4, 733, 16],
 [4, 733, 16, 81],
 [4, 733, 16, 81, 43],
 [4, 733, 16, 81, 43, 510],
 [4, 733, 16, 81, 43, 510, 171],
 [4, 733, 16, 81, 43, 510, 171, 516],
 [4, 733, 16, 81, 43, 510, 171, 516, 103],
 [4, 733, 16, 81, 43, 510, 171, 516, 103, 1942],
 [4, 733, 16, 81, 43, 510, 171

In [9]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [10]:
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

In [None]:
label = tf.keras.utils.to_categorical(label, num_classes=total_words)

## 3. Build the LSTM Model

With our data prepared, we can now build the LSTM model. We use an Embedding layer to learn token embeddings, followed by an LSTM layer and a Dense layer for prediction.


In [None]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.summary()


## 4. Compile the Model

We compile the model using the 'adam' optimizer and 'categorical_crossentropy' as the loss function, suitable for multi-class classification tasks.


In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


## 5. Train the Model

It's time to train our model. Note that this process can be time-consuming, depending on the size of your data and the complexity of the model.


In [None]:
history = model.fit(predictors, label, epochs=100, verbose=1)


## 6. Evaluate the Model

After training, we can evaluate our model's performance and plot the training history to visualize the learning process.


In [None]:
import matplotlib.pyplot as plt

# Plot accuracy
plt.plot(history.history['accuracy'])
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

# Plot loss
plt.plot(history.history['loss'])
plt.title('Model Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()


## 7. Test the Model

Finally, let's test our model with a custom input to predict the next token in a sequence.


In [None]:
def predict_next_token(seed_text):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict_classes(token_list, verbose=0)
    return tokenizer.index_word[predicted[0]]

seed_text = "I feel"
next_token = predict_next_token(seed_text)
print(f"Next token after '{seed_text}': {next_token}")


## 8. Save the Model

To deploy the model, we first need to save it. TensorFlow provides a simple API to save models in the SavedModel format, which can be easily served in different environments.


In [None]:
model_save_path = 'saved_model/next_token_predictor'
model.save(model_save_path)


## 9. Containerize the Model Using Docker

To prepare our model for deployment on AWS SageMaker, we'll containerize it using Docker. This process involves creating a Dockerfile, building a Docker image, and testing it locally.


In [None]:


FROM tensorflow/serving

COPY ${model_save_path} /models/next_token_predictor/1

ENV MODEL_NAME=next_token_predictor


In [None]:
docker build -t next-token-predictor:latest .


In [None]:
docker run -p 8501:8501 --name=my_model_container next-token-predictor:latest


## 10. Upload the Model to Amazon ECR

For deploying our model with AWS SageMaker, we need to upload our Docker container to Amazon Elastic Container Registry (ECR). This section outlines the steps to create a repository in ECR, authenticate Docker to push images to ECR, and finally, push the image.


In [None]:
import boto3

aws_region = 'us-west-2'
ecr_repository_name = 'next-token-predictor'

ecr_client = boto3.client('ecr', region_name=aws_region)

response = ecr_client.create_repository(repositoryName=ecr_repository_name)
repository_uri = response['repository']['repositoryUri']

print(f"Repository URI: {repository_uri}")


In [None]:
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin <repository_uri>


In [None]:
docker tag next-token-predictor:latest <repository_uri>:latest


In [None]:
docker push <repository_uri>:latest
