In [None]:
# Preprocessing
import csv
from nltk.tokenize.casual import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk

dir = '/content/drive/My Drive/Colab Notebooks/'

# Data source: https://www.kaggle.com/datasets/kazanova/sentiment140
# Each row of the raw CSV contains the following 6 fields:
#   0. target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
#   1. ids:    the id of the tweet (2087)
#   2. date:   the date of the tweet (Sat May 16 23:58:44 UTC 2009)
#   3. flag:   the query (lyx). If there is no query, then this value is NO_QUERY.
#   4. user:   the user that tweeted (robotickilldozr)
#   5. text:   the text of the tweet (Lyx is cool)
import re

# Raw string so that `\S` reaches the regex engine as written instead of being
# an invalid string escape. The pattern matches user mentions (@...), links
# (http:/https:...) and any run of non-alphanumeric characters.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
cleaning_pattern = re.compile(TEXT_CLEANING_RE)

# One-time setup, done before any file is opened.
nltk.download('stopwords')
# Named `stop_words` so the imported `stopwords` corpus reader is not rebound to a set.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
tk = TweetTokenizer(strip_handles=True, preserve_case=False)

# Number of rows written to the cleaned file so far.
count = 0

# newline='' is what the csv module expects for both reading and writing; it
# leaves line-ending handling to csv itself.
with open(dir + 'labeled_tweets.csv', mode='r', encoding='utf-8', errors='replace', newline='') as src, \
     open(dir + 'cleaned_tweets.csv', mode='w', encoding='utf-8', errors='replace', newline='') as dst:
  reader = csv.reader(src)
  writer = csv.writer(dst)
  for row in reader:
    # Blank or truncated lines have no text column; skip them instead of
    # failing on row[5].
    if len(row) < 6:
      continue
    # Lower-case, replace mentions/links/punctuation with spaces, then tokenize.
    text = cleaning_pattern.sub(' ', str(row[5]).lower()).strip()
    # Drop stopwords and stem what remains.
    tokens = [stemmer.stem(token) for token in tk.tokenize(text)
              if token not in stop_words]
    # Tweets that end up with no tokens carry no signal and are not stored.
    if not tokens:
      continue
    # Binary label: target 0 -> '0', any positive target -> '1'.
    # NOTE(review): this also maps a neutral target (2) to '1' — confirm the
    # input file contains only 0 and 4.
    writer.writerow(['1' if int(row[0]) > 0 else '0', ' '.join(tokens)])
    count += 1
    if count % 50000 == 0:
      print(f"Finished count: {count}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Finished count: 10000
Finished count: 20000
Finished count: 30000
Finished count: 40000
Finished count: 50000
Finished count: 60000
Finished count: 70000
Finished count: 80000
Finished count: 90000
Finished count: 100000
Finished count: 110000
Finished count: 120000
Finished count: 130000
Finished count: 140000
Finished count: 150000
Finished count: 160000
Finished count: 170000
Finished count: 180000
Finished count: 190000
Finished count: 200000
Finished count: 210000
Finished count: 220000
Finished count: 230000
Finished count: 240000
Finished count: 250000
Finished count: 260000
Finished count: 270000
Finished count: 280000
Finished count: 290000
Finished count: 300000
Finished count: 310000
Finished count: 320000
Finished count: 330000
Finished count: 340000
Finished count: 350000
Finished count: 360000
Finished count: 370000
Finished count: 380000
Finished count: 390000
Finished count: 400000
Finished count: 410000
Finished count: 420000
Finished count: 430000
Finished count: 4400

In [None]:
# Tokenization
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

dir = '/content/drive/My Drive/Colab Notebooks/'
np.random.seed(0)

# Load the header-less two-column CSV (label, cleaned text).
# keep_default_na=False stops pandas from turning a tweet whose entire cleaned
# text is a word such as "nan", "null" or "na" into a float NaN, which would
# break the later `.split()` calls on the text column.
df = pd.read_csv(
    dir + "cleaned_tweets.csv",
    header=None,
    names=['label', 'text'],
    dtype={'label': 'int64', 'text': str},
    keep_default_na=False,
)
X = df['text']
y = df['label']

In [None]:
# Word vectors
import gensim

# Hyperparameter: dimensionality of every learned word vector.
EMBEDDING_DIM = 300

# One token list per tweet; each text is split on whitespace.
documents = [tweet.split() for tweet in X]

# NOTE: despite the `glove_vectors` name, no pre-trained GloVe file is loaded
# here. A gensim Word2Vec model is created and trained on `documents`.
w2v_options = dict(vector_size=EMBEDDING_DIM, window=7, min_count=10, workers=8)
glove_vectors = gensim.models.word2vec.Word2Vec(**w2v_options)
glove_vectors.build_vocab(documents)

# Report how many entries the model's vocabulary holds.
words = glove_vectors.wv
vocab_size = len(words)
print("Vocab size", vocab_size)

# Train on the full corpus, then persist the whole model for the later cells.
n_documents = len(documents)
glove_vectors.train(documents, total_examples=n_documents, epochs=32)
glove_vectors.save(dir + 'word2vec.model')

In [None]:
# Tokenization and Encoding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Fraction of the samples that goes into the training split.
TRAIN_SIZE = 0.8

# Always start from the DataFrame columns rather than from `X`/`y`: this cell
# rebinds `X` and `y` to arrays below, so reading them here would make the
# cell fail (or silently change) when it is executed a second time.
texts = df['text']
labels = df['label']

# Longest tweet, in tokens; every sequence is padded to this length.
MAX_LENGTH = max(len(s.split()) for s in texts)
print(f"Max Length: {MAX_LENGTH}")

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
# +1 because index 0 is used for padding (see the padded example printed below).
vocab_size = len(word_index) + 1
print(f"Vocabulary size: {vocab_size}")
print(f"example data: {texts[0]}")

# Convert the corpus to integer sequences once and reuse the result for padding.
seq = tokenizer.texts_to_sequences(texts)
print(f"example sequence: {seq[0]}")
X = pad_sequences(seq, maxlen=MAX_LENGTH, padding='post')
print(f"example pad: {X[0]}")

# Encode the labels and reshape them into a single column.
encoder = LabelEncoder()
encoder.fit(labels)
y = encoder.transform(labels).reshape(-1, 1)

print(f"Total: {len(X)} samples")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=TRAIN_SIZE, shuffle=True, random_state=0)
print(f"Training data: {len(X_train)} samples")
print(f"Testing data: {len(X_test)} samples")

Max Length: 50
Vocabulary size: 283761
example data: awww bummer shoulda got david carr third day
example sequence: [368, 1047, 3061, 11, 703, 7476, 1689, 3]
example pad: [ 368 1047 3061   11  703 7476 1689    3    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Total: 1592328 samples
Training data: 1273862 samples
Testing data: 318466 samples


In [None]:
# Embedding layer
from gensim.models import KeyedVectors, Word2Vec
import tensorflow as tf

EMBEDDING_DIM = 300

# The file was written with Word2Vec.save(), so it holds a full Word2Vec model
# (its vectors live under `.wv`). Load it with the matching Word2Vec loader.
glove_vectors = Word2Vec.load(dir + 'word2vec.model')
assert glove_vectors.wv.vector_size == EMBEDDING_DIM, (
    "saved word vectors do not match EMBEDDING_DIM")

# Row i of the matrix holds the vector of the tokenizer word with index i.
# Words that are missing from the Word2Vec vocabulary keep an all-zero row.
words = glove_vectors.wv.key_to_index
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in words:
        embedding_matrix[i] = glove_vectors.wv[word]
print(f"Embedding Matrix shape: {embedding_matrix.shape}")

# Start from the Word2Vec vectors and keep fine-tuning them during training.
# NOTE(review): sequences are zero-padded but the layer does not mask index 0,
# so the LSTM also reads the padding steps — consider mask_zero=True.
embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                            EMBEDDING_DIM,
                                            weights=[embedding_matrix],
                                            trainable=True)

Embedding Matrix shape: (283761, 300)


In [None]:
# Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Layer stack: embeddings -> dropout -> one LSTM -> single sigmoid unit.
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Training callbacks. The two assignments below rebind the imported class
# names to configured instances; the training cell refers to them by these
# names, so the names are kept as they are.
plateau_options = dict(monitor='val_loss', factor=0.1, patience=5, cooldown=0, verbose=1)
ReduceLROnPlateau = ReduceLROnPlateau(**plateau_options)

stopping_options = dict(monitor='val_accuracy', mode='max', min_delta=1e-4, patience=5)
EarlyStopping = EarlyStopping(**stopping_options)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
BATCH_SIZE = 1024
EPOCHS = 15
# Fraction of the training data held out for validation. The callbacks monitor
# val_loss / val_accuracy, so validating on the test split would tune the
# learning-rate schedule and the stopping point on the very data that is later
# reported as test performance.
VALIDATION_SPLIT = 0.1

print("Shape of X_train:", X_train.shape)
print("First sample in X_train:", X_train[0])

# Train with the callbacks configured in the model cell; the test split stays
# untouched until the final evaluation.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
    callbacks=[ReduceLROnPlateau, EarlyStopping],
)
# Persist the trained model next to the other notebook artifacts.
model.save(dir + 'LSTM.keras')

Shape of X_train: (1273862, 50)
First sample in X_train: [ 339  337 1259    3    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Epoch 1/15
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m298s[0m 233ms/step - accuracy: 0.7028 - loss: 0.5643 - val_accuracy: 0.7786 - val_loss: 0.4679 - learning_rate: 0.0010
Epoch 2/15
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 230ms/step - accuracy: 0.7660 - loss: 0.4869 - val_accuracy: 0.7849 - val_loss: 0.4547 - learning_rate: 0.0010
Epoch 3/15
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 230ms/step - accuracy: 0.7821 - loss: 0.4602 - val_accuracy: 0.7906 - val_loss: 0.4458 - learning_rate: 0.0010
Epoch 4/15
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 231ms/step - accuracy: 0.7944 -

In [None]:
# Reload the saved model through tensorflow.keras — the same API that built
# and saved it — so that a separately installed `keras` package of a different
# version cannot be picked up for loading.
from tensorflow.keras.models import load_model

model = load_model(dir + 'LSTM.keras')
# Score the reloaded model on the test split; the result is [loss, accuracy],
# matching the loss and metric passed to compile().
model.evaluate(X_test, y_test)

  saveable.load_own_variables(weights_store.get(inner_path))


[1m9953/9953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m349s[0m 35ms/step - accuracy: 0.7801 - loss: 0.4809


[0.48045244812965393, 0.7805857062339783]