# MSML651 Project: Predicting Sentiment of Tweets Using the Sentiment140 Dataset

- Name: Aditya Patkar
- UID: 119390818

# LSTM

In [None]:
#install required libraries
!pip install awscli boto3 wandb

## Imports and config

In [None]:
#Set up required secrets
from google.colab import userdata

# Fetch every credential from Colab's `userdata` in one pass. The order of the
# secret names below matches the order of the variables being assigned.
access_key, secret_key, wandb_key, huggingface_key = (
    userdata.get(secret_name)
    for secret_name in (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'WANDB_KEY',
        'HUGGINGFACE_KEY',
    )
)

In [None]:
#WandB Config
import wandb
from wandb.keras import WandbMetricsLogger, WandbModelCheckpoint

# Log in with the key read from the Colab secrets (wandb_key).
wandb.login(relogin=True, key=wandb_key)

# Start the run. No hyperparameters are passed here: the previous
# config={"bs": 12} contradicted the batch size actually used for training.
# The real settings are attached by wandb.config.update(config) once the
# config dict has been defined.
wandb.init(project="msml651-sentiment-analysis", entity="apatkar", name="LSTM")

In [None]:
#AWS config
!aws configure set aws_access_key_id $access_key
!aws configure set aws_secret_access_key $secret_key
!aws configure set default.region us-east-1

In [None]:
#Necessary imports
import pandas as pd
import nltk
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import ReduceLROnPlateau

In [None]:
#download required files from AWS
# Each object is copied from the msml651 bucket into the current working
# directory ("."). Later cells read these files from /content/...
# NOTE(review): this assumes the notebook's working directory is /content — confirm.
# 1) cleaned tweets, read with pd.read_parquet
!aws s3 cp s3://msml651/sentiment140_clean_no_stopwords.parquet .
# 2) pickled tokenizer, loaded with pickle.load
!aws s3 cp s3://msml651/lstm_tokenizer.pickle .
# 3) GloVe text vectors, parsed by load_embedding
!aws s3 cp s3://msml651/glove.twitter.27B.200d.txt .


In [None]:
#model and data configs
config = dict(
    train_size=0.8,        # not referenced by the cells in this notebook
    test_size=0.2,         # held-out fraction passed to train_test_split
    embedding_dim=200,     # width of the embedding vectors
    dropout=0.2,           # rate of the Dropout layer after the embedding
    batch_size=1024,       # becomes BATCH_SIZE for fit/evaluate
    epochs=25,             # becomes EPOCHS for fit
    patience_lr=2,         # ReduceLROnPlateau patience
    factor_lr=0.1,         # ReduceLROnPlateau factor
    min_lr=0.0001,         # ReduceLROnPlateau min_lr
)
# record the settings on the active wandb run
wandb.config.update(config)

## Data Preprocessing

In [None]:
#read data
# Load the downloaded parquet file into a DataFrame, then preview its first rows.
df = pd.read_parquet(path="/content/sentiment140_clean_no_stopwords.parquet")
df.head(n=5)

In [None]:
#train_test_split
# Stratify on the label column so both splits keep the same class balance;
# the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=config['test_size'],
    random_state=20,
    stratify=df['target'],
)

In [None]:
#encode target
# Fit the encoder on the training labels only, then apply it to both splits.
encoder = LabelEncoder().fit(train_data['target'].to_list())

# reshape(-1, 1) turns each label vector into a single-column 2-D array
y_train = encoder.transform(train_data['target'].to_list()).reshape(-1, 1)
y_test = encoder.transform(test_data['target'].to_list()).reshape(-1, 1)

In [None]:
#load tokenizer
# Restore the tokenizer object from the pickle file downloaded from S3.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if the bucket contents are trusted.
with open('/content/lstm_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# Longest training tweet, counted in whitespace-separated tokens; every
# sequence in both splits is padded (or truncated) to this length.
max_length = max(
    len(tweet.split()) for tweet in train_data['tweet_without_stopwords']
)

# training split: text -> integer sequences -> fixed-length matrix
x_train = pad_sequences(
    sequences=tokenizer.texts_to_sequences(train_data['tweet_without_stopwords']),
    maxlen=max_length,
)
# test split: same tokenizer and same length as the training split
x_test = pad_sequences(
    sequences=tokenizer.texts_to_sequences(test_data['tweet_without_stopwords']),
    maxlen=max_length,
)

In [None]:
#code courtesy: MachineLearningMastery (NOT MY CODE)

# load embedding as a dict
def load_embedding(filename):
    """Load a text embedding file into a dict of word -> vector.

    Every non-blank line is split on whitespace: the first field is used as
    the word and the remaining fields are converted to a float32 numpy array.
    No header line is skipped. Blank lines are ignored.

    Args:
        filename: path of the UTF-8 encoded embedding text file.

    Returns:
        dict mapping each word (str) to its vector (numpy float32 array).
    """
    embedding = dict()
    # Stream the file line by line instead of readlines(): the full text is
    # never held in memory, and the context manager closes the handle even if
    # a line fails to parse.
    with open(filename, 'r', encoding="utf-8") as file:
        for line in file:
            parts = line.split()
            if not parts:
                # blank line: nothing to index (previously an IndexError)
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding

# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, embedding_dim=None):
    """Build the weight matrix for an Embedding layer from loaded vectors.

    Row ``i`` of the result holds the vector of the word that ``vocab`` maps
    to index ``i``. Words missing from ``embedding`` keep an all-zero row, and
    so does any index that ``vocab`` never assigns.

    Args:
        embedding: dict mapping word -> 1-D vector.
        vocab: dict mapping word -> integer row index (the Tokenizer's
            ``word_index`` at the call site).
        embedding_dim: number of columns. When omitted it is taken from the
            length of the first vector in ``embedding``, so the function no
            longer depends on a module-level variable of the same name.

    Returns:
        numpy array of shape ``(len(vocab) + 1, embedding_dim)``.

    Raises:
        ValueError: if ``embedding`` is empty and ``embedding_dim`` is not given.
    """
    if embedding_dim is None:
        first_vector = next(iter(embedding.values()), None)
        if first_vector is None:
            raise ValueError(
                "embedding is empty; pass embedding_dim explicitly")
        embedding_dim = len(first_vector)

    # one extra row because the matrix is indexed directly by the vocab's
    # integer ids, and the highest id equals len(vocab) when ids start at 1
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, embedding_dim))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

#get embedding weights
# parse the downloaded GloVe file into a word -> vector dict
raw_embedding = load_embedding('/content/glove.twitter.27B.200d.txt')

# the tokenizer's word -> index mapping; +1 leaves room for index 0
vocab = tokenizer.word_index
vocab_size = len(vocab) + 1

# vector width shared by the weight matrix and the Embedding layer
embedding_dim = config['embedding_dim']
embedding_matrix = get_weight_matrix(raw_embedding, vocab)

## Model

In [None]:
#create an embedding layer using the weights
# trainable=False keeps the pre-computed vectors fixed during training
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)

In [None]:
#build the lstm
# Stack: frozen embedding -> dropout -> LSTM -> dense -> sigmoid output
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(config['dropout']))
# NOTE(review): the LSTM dropout is hard-coded and not tied to config['dropout'];
# confirm whether the two are meant to be tuned together.
model.add(LSTM(200, dropout = 0.2))
model.add(Dense(64, activation='leaky_relu'))
# single sigmoid unit: the output is one score per tweet for the binary label
model.add(Dense(1, activation = "sigmoid"))

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model.summary()

model.compile(optimizer = "adam", loss = 'binary_crossentropy', metrics = ['accuracy'])

## Training

In [None]:
BATCH_SIZE, EPOCHS = config['batch_size'], config['epochs']

#LR Scheduler
# multiplies the learning rate by `factor` after `patience` epochs without a
# val_loss improvement, never going below `min_lr`
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=config['factor_lr'],
    patience=config['patience_lr'],
    min_lr=config['min_lr'],
)

#train
# 10% of the training data is held back for validation; the wandb callbacks
# log metrics and write checkpoints to "models"
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1,
    callbacks=[
        reduce_lr,
        WandbMetricsLogger(),
        WandbModelCheckpoint("models"),
    ],
)

## Evaluation

In [None]:
#evaluate the model
# score[0] is the loss and score[1] the accuracy metric set in model.compile
score = model.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE)
print("Test loss:", score[0])
print("Test accuracy:", score[1])

In [None]:
#get predictions
# use the configured batch size instead of repeating the literal 1024
y_pred = model.predict(x_test, batch_size=BATCH_SIZE, verbose=1)

# Vectorised form of the original rule (score below 0.5 -> 0, anything else -> 1).
# The name is kept for compatibility, but it now holds a numpy array with the
# same shape as y_pred rather than a list of one-element lists.
y_pred_lst = np.where(y_pred < 0.5, 0, 1)

precision = precision_score(y_test, y_pred_lst, average='binary')
recall = recall_score(y_test, y_pred_lst, average='binary')
f1 = f1_score(y_test, y_pred_lst, average='binary')

print(precision, recall, f1)

In [None]:
def test_thresholds(y_test, y_pred_og, thresholds=[0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65]):
  """
  This function tests different thresholds to find the best F1 score
  """
  best_threshold = thresholds[0]
  best_f1 = 0
  best_y_pred = [[0] if i < thresholds[0] else [1] for i in y_pred_og]
  for threshold in thresholds:
    y_pred = [[0] if i < threshold else [1] for i in y_pred_og]
    f1 = f1_score(y_test, y_pred, average='binary')
    if f1 > best_f1:
      best_f1 = f1
      best_threshold = threshold
      best_y_pred = y_pred
  return best_threshold, best_f1, best_y_pred





In [None]:
#Find best threshold and calculate metrics
best_threshold, best_f1, best_y_pred = test_thresholds(y_test=y_test, y_pred_og=y_pred)

print(f"Best threshold:{best_threshold}")
print(f"Best f1:{best_f1}")

# recompute every metric on the predictions made at the chosen threshold
precision = precision_score(y_true=y_test, y_pred=best_y_pred, average='binary')
recall = recall_score(y_true=y_test, y_pred=best_y_pred, average='binary')
f1 = f1_score(y_true=y_test, y_pred=best_y_pred, average='binary')
accuracy = accuracy_score(y_true=y_test, y_pred=best_y_pred)
print(precision, recall,f1, accuracy)

## Save Model and Log Metrics

In [None]:
wandb.log({'accuracy': accuracy, 'loss':score[0], 'precision':precision, 'recall':recall, 'f1':f1, 'threshold':best_threshold})
model.save('lstm_sentiment.h5')
!aws s3 cp /content/lstm_sentiment.h5 s3://msml651

## Visualization

In [None]:
#visualize accuracy
# per-epoch curves recorded by model.fit
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# 1-based epoch numbers for the x axis
epochs = range(1, len(acc) + 1)

# Draw on an explicit figure instead of pyplot's implicit "current" one, so
# this plot cannot mix with another and the exact figure is what gets logged.
fig, ax = plt.subplots()
ax.plot(epochs, acc, label = 'Training acc', color='red')
ax.plot(epochs, val_acc, label = 'Validation acc', color='blue')
ax.set_title('LSTM: Training and validation accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
image = fig
wandb.log({"LSTM Accuracy": wandb.Image(image)})

In [None]:
#visualize loss
# A dedicated figure keeps these curves from being drawn on top of any
# figure that is still open, and is the exact object sent to wandb.
fig, ax = plt.subplots()
ax.plot(epochs, loss, label = 'Training loss', color='red')
ax.plot(epochs, val_loss, label = 'Validation loss', color='blue')
ax.set_title('LSTM: Training and validation loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
image = fig
wandb.log({"LSTM Loss": wandb.Image(image)})

In [None]:
#finish run
# Close the wandb run that was started with wandb.init near the top of the notebook.
wandb.finish()