In [26]:
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import os
import sys
import re

In [27]:
#Loading in the data
data = pd.read_csv('training_data.csv', names = ['Sentiment', 'Id', 'Date', 'Flag', 'User', 'Text'])
#Data was gotten from https://www.kaggle.com/kazanova/sentiment140
del data['Id']
del data['Date']
del data['Flag']
del data['User']

In [28]:
data.head()

Unnamed: 0,Sentiment,Text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [29]:
##Loading in Embedding dims  dimensional
embeddings = {}
dims = 100
with open("glove.twitter.27B/glove.twitter.27B.100d.txt", encoding = "utf8")  as file:
# Word Embeddings gotten from the twitter pre-trained vector at https://nlp.stanford.edu/projects/glove/
    for line in file:
        word, coefficients = line.split(maxsplit = 1)
        coefficients = coefficients.split(" ")
        coefficients = np.array(coefficients, dtype=np.float32)
        embeddings[word] = coefficients

In [30]:
#pre-processing function
def preprocessTweet(X):
    #lowercase
    X = X.lower()
    # separate @ and user
    X = X.replace("@", " @ ")
    # replace urls with "url"
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', X)
    for url in urls:
        X = X.replace(url, "url")
    #replace multiple punctuation with single
    repeatpunctuations = re.findall('[.,!?]{2,}',X)
    for repeatpunctuation in repeatpunctuations:
        X = X.replace(repeatpunctuation, repeatpunctuation[0])
    #Emoji handling
    smile = re.findall('[8;:=]['"`""\\"'-][)d]',X)
    lolface = re.findall('[8;:=]['"`""\\"'-][p]',X)
    sadface = re.findall('[8;:=]['"`""\\"'-][(|/]',X)
    neutralface = re.findall('[8;:=]['"`""\\"'-][\1]',X)
    heart = re.findall('[<][3]',X)
    for i in smile:
        X = X.replace(i, " smile ")
    for i in lolface:
        X = X.replace(i, " lolface ")
    for i in sadface:
        X = X.replace(i, " sadface ")
    for i in neutralface:
        X = X.replace(i, " neutralface ")
    for i in heart:
        X = X.replace(i, " heart ")
    # number handling
    numbers = re.findall('[0-9]{1,}',X)
    for i in numbers:
        X = X.replace(i, " number ")
    #remove contractions
    contractions = re.findall("[']",X)
    for i in contractions:
        X  = X.replace(i,"")
    # add spaces between last word and punctuation
    puncs = re.findall('[.!?,]',X)
    for i in puncs:
        X = X.replace(i," "+i[0]+" ")
    # remove extended words ie 'wayyyyy' NEED to MAKE this part
    # later find a way to determine whether the ending letters should be 1 or 2 letters ie hellll -> hell not hel
    #extendedWords = re.findall('[a-z]{3,}',X)
    #for i in extendedWords:
        #X = X.replace(i, i[0])
    #remove double spaces
    X = re.sub("\s\s+" , " ", X)
    return X

In [31]:
#Preprocess the data
data['Text'] = data['Text'].apply(preprocessTweet)

In [32]:
#shuffle the data
shuffled_data = data.reindex(np.random.RandomState(seed=2020).permutation(data.index))
shuffled_data.head()

Unnamed: 0,Sentiment,Text
1432532,4,in a dark stadium &quot;painting&quot; with jo...
1291931,4,bird out though . now i must check all the dis...
606800,0,the guitar is still stuck in morsdorf . i am g...
623532,0,@ bronxbebe number lol . nawww that was yeste...
296942,0,very sad to read about a number .


In [33]:
#split into training, dev, and test
X_data = shuffled_data['Text'].to_numpy()
Y_data = shuffled_data['Sentiment'].to_numpy()
#Convert Y to one-hot
Y_data = Y_data/2
Y_data = to_categorical(Y_data)
#training: first 1.4 mil
X_training = X_data[0:1400000]
Y_training = Y_data[0:1400000]
#dev: next 100 k
X_dev = X_data[1400001:1500000]
Y_dev = Y_data[1400001:1500000]
#test:last 100 k
X_test = X_data[150001:1600000]
Y_test = Y_data[150001:1600000]

In [34]:
# Tokenize the input
#creates tokenizer
tokenizer = Tokenizer()
#fits the input to the text, ie most common words being closer to 0 and more obscure being father away
tokenizer.fit_on_texts(X_data) 
#converts the input to token indices
X_training_tokens = tokenizer.texts_to_sequences(X_training)
X_dev_tokens = tokenizer.texts_to_sequences(X_dev)
X_test_tokens = tokenizer.texts_to_sequences(X_training)
#get largest list of words
maxLen = max([len(s.split()) for s in X_data])
#padding so all inputs are the same size
X_train_pad = pad_sequences(X_training_tokens, maxlen = maxLen)
X_dev_pad = pad_sequences(X_dev_tokens, maxlen = maxLen)
X_train_pad = pad_sequences(X_test_tokens, maxlen = maxLen)

In [35]:
#time to make the embedding matrix
#instantiate embedding matrix of zeroes
embedding_matrix = np.zeros((len(tokenizer.word_index)+1, dims))
#go through each word in the token list
for word, i in tokenizer.word_index.items():
    #get the corresponding embedding vector (if it exists)
    embedding_vector = embeddings.get(word)
    #check if its not none
    if embedding_vector is not None:
        #add that to the embedding matrix
        embedding_matrix[i] = embedding_vector

In [52]:
#Make the model
Model = Sequential()
Model.add(
    Embedding(
        input_dim = len(tokenizer.word_index) + 1,
        output_dim = dims,
        weights = [embedding_matrix],
        input_length = maxLen,
        trainable = False
    )
)
Model.add(
    LSTM(
        units = maxLen,
        return_sequences = True
        #possibly add dropout
    )
)
Model.add(
    LSTM(#
        units = maxLen,
        return_sequences = False
    )
)
Model.add(
    Dense(
        maxLen,
        activation = 'relu'
    )
)
Model.add(
    Dense(
        3,
        activation = 'softmax'
    )
)

In [53]:
Model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 76, 100)           57388900  
_________________________________________________________________
lstm_18 (LSTM)               (None, 76, 76)            53808     
_________________________________________________________________
lstm_19 (LSTM)               (None, 76)                46512     
_________________________________________________________________
dense_7 (Dense)              (None, 76)                5852      
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 231       
Total params: 57,495,303
Trainable params: 106,403
Non-trainable params: 57,388,900
_________________________________________________________________


In [54]:
Model.compile(
    optimizer = 'Adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

In [55]:
Training_Loss = Model.fit(
    x = X_train_pad,
    y = Y_training,
    batch_size = 2048,
    epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


TypeError: cannot unpack non-iterable History object

In [56]:
Dev_Loss, Dev_Accuracy = Model.evaluate(
    x = X_dev_pad,
    y = Y_dev,
    batch_size = 2048
)



In [57]:
print("Dev Loss : " + str(Dev_Loss))
print("Dev Accuracy : " + str(Dev_Accuracy))

Dev Loss : 0.3789831615569364
Dev Accuracy : 0.8293383121490479
