In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
nltk.download('stopwords')
nltk.download('wordnet')

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

import os

from keras import regularizers
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras_preprocessing.sequence import pad_sequences

import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# First we can import the data and setup the dataframe with the correct column labels
# Target column represents sentiment. 0: Negative, 4: Positive
# This is needed if .csv has not been created yet, otherwise ignore

columnLabels = ["target", "ids", "date", "flag", "user", "text"]
dataEncoding = "ISO-8859-1"
rawData = pd.read_csv('twitterKaggleData.csv', encoding=dataEncoding, names=columnLabels)

rawData.head(5)

# Here we can do some inital data processing and remove the columns that are not relevant to the model we are trying to make

data = rawData.drop(columns=["ids", "date", "flag", "user"])
data.head(5)

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [3]:
# As we can see in the text, there are links, usernames, and special characters that we do not want when we are training so we can remove them
# We can then perform the best practice preprocessing which is tokenization, lower casing, stop word removal, and lemmatize
# We chose lemmatization over stemming as stemming can sometimes be inaccurate

def processText(text):
    lemmatizer = WordNetLemmatizer()
    arr = []

    for token in text.split():
        if token not in stopwords.words("english") and token.isalnum():
            word = lemmatizer.lemmatize(token)
            arr.append(word)

    return " ".join(arr)

# If csv file exists, use it, otherwise process text
if os.path.exists('./processedTwitterData.csv'):
    data = pd.read_csv('processedTwitterData.csv')

    # Ensures that all values are of string as pd.read_csv automatically will convert the text into a type such as float or int
    data.text = data.text.astype(str)
else:
    length = len(data.text)
    curr = 0

    for index, text in enumerate(data.text):
        newText = processText(text)
        curr += 1
        data.at[index, 'text'] = newText

        print("Processing Progress: " + str(curr) + "/" + str(length), end="\r")

    # Save the process after it's complete so that the .csv can be instead since the kernel does not save session to session and the processing takes 2-3 hours
    data.to_csv('processedTwitterData.csv', index=False)

data.head(5)

Unnamed: 0,target,text
0,0,You shoulda got David Carr Third Day
1,0,upset update Facebook texting might cry result...
2,0,I dived many time Managed save The rest go bound
3,0,whole body feel itchy like fire
4,0,behaving I see


In [4]:
# Since the original Kaggle Twitter Data only has labels 0 for negative and 4 for positive, we can convert it to a binary 0 for negative 1 for positive so we can use a binary classification loss function

data['target'].replace({4: 1}, inplace=True)

data

Unnamed: 0,target,text
0,0,You shoulda got David Carr Third Day
1,0,upset update Facebook texting might cry result...
2,0,I dived many time Managed save The rest go bound
3,0,whole body feel itchy like fire
4,0,behaving I see
...,...,...
1599995,1,Just woke Having school best feeling ever
1599996,1,Very cool hear old Walt
1599997,1,Are ready MoJo Ask detail
1599998,1,Happy 38th Birthday boo alll Tupac Amaru Shakur


In [5]:
# Now that we have our pre-processed data we can undergo another important step in NLP: word embeddings
# Here we transform our words into vector representations as it allows for our model to be able to more easily figure out relations that words have to each other and the context in which the word is being used

LENGTH = 280

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data.text)

In [None]:
# Run if you want to use the model that we trained

seqModel = load_model("trainedSequentialModel")
seqModel.summary() 

In [6]:
# We can split our data up into training data and testing date, from there we can turn our training and testing text into vector representations based on the Tokenizer we fit earlier

trainingData, testingData, trainingLabels, testingLabels = train_test_split(data.text, data.target, test_size=0.2, train_size=0.1)

# We can pad the sequences to make them the same size so the model can train on it
trainingData = pad_sequences(tokenizer.texts_to_sequences(trainingData), maxlen=LENGTH) # Convert the data to vector form
testingData = pad_sequences(tokenizer.texts_to_sequences(testingData), maxlen=LENGTH) # Convert the data to vector form

In [11]:
# Run if you are preparing to train a new model

BATCH_SIZE = 2048

seqModel = Sequential()
# We can add in an embedding layer that will extract the similar representations of words from their vector form
seqModel.add(layers.Embedding(len(tokenizer.word_index) + 1, 280)) 
# Size of LTSM layer has been selected through experimentation
seqModel.add(layers.LSTM(200, dropout=0.2))
seqModel.add(layers.Flatten())
# Determined ReLU is best from trial and error
# Determine one Dense layer is best from trial and error
seqModel.add(layers.Dense(1, activation='relu'))

seqModel.compile(optimizer='Adam', loss='mse', metrics=['accuracy'])

seqModel.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 280)         58699760  
                                                                 
 lstm_1 (LSTM)               (None, 200)               384800    
                                                                 
 flatten_1 (Flatten)         (None, 200)               0         
                                                                 
 dense_1 (Dense)             (None, 200)               40200     
                                                                 
 dense_2 (Dense)             (None, 1)                 201       
                                                                 
Total params: 59,124,961
Trainable params: 59,124,961
Non-trainable params: 0
_________________________________________________________________


In [8]:
# DO NOT RUN THIS UNLESS YOU YOU WANT TO TRAIN THE MODEL, TAKES A VERY LONG TIME
callback = EarlyStopping(monitor='loss', patience=1)

history = seqModel.fit(trainingData, trainingLabels, epochs=4, batch_size=BATCH_SIZE, callbacks=callback)

# Various attempt results and what we are doing next:
 
# Sigmoid Runs
# 0.05 data split, 4 epochs, 2048 batch size: loss: 0.3991 - accuracy: 0.8196 -> try to increase epochs and training size
# 0.1 data split, 12 epohcs, 2048 batch size: loss: 0.2537 - accuracy: 0.8794 -> try to increase epochs 
# 0.1 data split, 20 epohcs, 2048 batch size: loss: 0.1451 - accuracy: 0.9329 -> good accuracy on testing data but only about 70% accuracy when testing so overfitting. Lower epochs and increasing training data
# 0.2 data split, 10 epochs, 2048 batch size: loss: 0.2831 - accuracy: 0.8662 -> accuracy on testing data is about 73% so generalizing better than before. Increase epochs slightly and increase training data size
# 0.3 data split, 12 epochs, 2048 batch size: loss: 0.2669 - accuracy: 0.8735 338 minutes -> still only 73% accurate despite increased epochs and data size. Lower epochs and data again and also batch size
# 0.2 data split, 10 epochs 512 batch size: loss: 0.2215 - accuracy: 0.8967 268 minutes -> still only 73% accurate, so its probably overfitting. Increasing data and lower epochs
# 0.4 data split, 8 epochs, 512 batch size: 0.2591 - accuracy: 0.8789 327 minutes -> 74% accurate, obvious at this point its not data. Go back down to 0.1 data, 4 epochs, and try to increase LTSM layer from 100 to 200
# 0.1 data split, 4 epochs 512 batch size: loss: 0.3542 - accuracy: 0.8363 -> 74% accurate with way less data and much quicker training time. Increase data amount and see if any improvements
# 0.2 data split, 4 epochs, 512 batch size (encountered errors so raised size and seemed to fix): 0.3647 - accuracy: 0.8307 -> 177 mintes, increase data and see what happens
# 0.4 data split, 4 epochs, 2048 batch size: loss: 0.3974 - accuracy: 0.8139 -> ~75% accuracy when evaluting
# seeing if adding another dense layer or two will improve the accuracy with a 0.1 data split to see if there are improvements. did this last minute because of neural network slides
# 0.1 data split, 4 epochs 512 batch size: 2048 loss:0.4028 - accuracy: 0.8167, so the extra dense layer is worse when comparing it to the previous attempt with 10% of the data

# After doing Sigmoid for a while and not seeing any improvements we did ReLU

# ReLU Runs: 
# 0.1 data split, 4 epochs 512 batch size: 2048: loss: 0.1303 - accuracy: 0.8175, accuracy was about 73% but trained waaaay faster so maybe we can retry with more data and see
# 0.3 data split, 4 epochs 512 batch size: 2048: loss: 0.1328 - accuracy: 0.8115, accuracy is 75% but again trained much faster
# 0.8 data split, 4 epochs, 512 batch size: 2048: loss: 0.1311 - accuracy: 0.8121, accuracy was 76% which is a new record! this can be the final model.

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [9]:
# Evaluate the model here

results = seqModel.evaluate(testingData, testingLabels, batch_size=BATCH_SIZE)
print("test loss, test acc:", results)

test loss, test acc: [0.16414092481136322, 0.7632781267166138]


In [10]:
# Since the model takes a really long time to train we can save it here after its done

seqModel.save('trainedSequentialModel')



INFO:tensorflow:Assets written to: trainedSequentialModel\assets


INFO:tensorflow:Assets written to: trainedSequentialModel\assets
