In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM, Bidirectional
from keras import optimizers
from keras import losses
from keras import metrics
from keras import callbacks

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import io
import json

Please note that the datasets used to train the following models exceed the limit of 100mb set by github so they have not been uploaded to our online repository. All 

## Create the dataset

In [16]:
def augment_datasets(proportion=0.5):
    data = pd.read_csv('./data/trainingandtestdata/train.csv', encoding='utf-8')
    air_reviews = pd.read_csv('./data/twitter-airline-sentiment/Tweets.csv')
    
    data = data.rename(columns={'0': 'sentiment', '@switchfoot http://twitpic.com/2y1zl - Awww, that\'s a bummer.  You shoulda got David Carr of Third Day to do it. ;D': 'tweet'})
    data_proc = data.sample(frac=proportion, replace=False)
    data_proc = data_proc.loc[:, ['tweet', 'sentiment']]
    data_proc.loc[data_proc['sentiment'] == 4, 'sentiment'] = 1
    
    air_reviews = air_reviews.rename(columns={'airline_sentiment': 'sentiment', 'text': 'tweet'})
    air_reviews_proc = air_reviews.loc[air_reviews['sentiment'] != 'neutral', ['tweet', 'sentiment']]
    air_reviews_proc.loc[air_reviews['sentiment'] == 'positive', 'sentiment'] = 1
    air_reviews_proc.loc[air_reviews['sentiment'] == 'negative', 'sentiment'] = 0
    
    data_concat = pd.concat([data_proc, air_reviews_proc], ignore_index=True)
    data_concat = data_concat.sample(frac=1, replace=False).reset_index(drop=True)
    return data_concat

In [17]:
augmented_df = augment_datasets(proportion = 1)

In [18]:
print(augmented_df.shape)
augmented_df.head()

(1611540, 2)


Unnamed: 0,tweet,sentiment
0,Wow! Congrats @charlene29 very nice,1
1,@dadiaperbank ventral sounds more serious! goo...,0
2,It's too early,0
3,THE THING THAT SUCKS MOST ABOUT BRACES IS THAT...,0
4,Is sleepy sleepy time...last full day is tomor...,0


## Remove hashtags, mentions and links

In [19]:
#function to remove hashtags and mentions
def preprocess(tweet):
    tweet_words = tweet.split()
    for word in tweet_words:
        if word.startswith('@') or word.startswith('#') or word.startswith('http'):
            tweet_words.remove(word)
    return ' '.join(tweet_words)

In [20]:
aug_df_proc = augmented_df.copy(deep=True)

In [21]:
aug_df_proc.iloc[:,0] = aug_df_proc.iloc[:,0].map(preprocess)

In [22]:
aug_df_proc.head()

Unnamed: 0,tweet,sentiment
0,Wow! Congrats very nice,1
1,ventral sounds more serious! good luck w/festi...,0
2,It's too early,0
3,THE THING THAT SUCKS MOST ABOUT BRACES IS THAT...,0
4,Is sleepy sleepy time...last full day is tomorrow,0


## Tokenize and build training, validation and test sets

In [23]:
#number of words to consider in the dataset
max_words = 20000
tokenizer = Tokenizer(num_words = 20000)
texts = list(aug_df_proc['tweet'].values)
#create the token index based on tweets
tokenizer.fit_on_texts(texts)

In [17]:
print('Found %s unique tokens.' % len(tokenizer.word_index))

Found 336434 unique tokens.


In [10]:
#transform the tweets to sequences
sequences = tokenizer.texts_to_sequences(texts)

In [13]:
#set the maximum length of each tweet based on dataset
lens = [len(x) for x in sequences]
max_length = max(lens)

In [15]:
padded_seq = pad_sequences(sequences, maxlen=max_length)
labels = augmented_df['sentiment'].values

train_proportion = 0.6
val_proportion = 0.2


x_train = padded_seq[:int(train_proportion*len(padded_seq))]
y_train = labels[:int(train_proportion*len(padded_seq))]

x_val = padded_seq[int(train_proportion*len(padded_seq)):int(train_proportion*len(padded_seq))+int(val_proportion*len(padded_seq))]
y_val = labels[int(train_proportion*len(padded_seq)):int(train_proportion*len(padded_seq))+int(val_proportion*len(padded_seq))]

x_test = padded_seq[int(train_proportion*len(padded_seq))+int(val_proportion*len(padded_seq)):]
y_test = labels[int(train_proportion*len(padded_seq))+int(val_proportion*len(padded_seq)):]


print(len(x_train))
print(len(y_train))
print(len(x_val))
print(len(y_val))
print(len(x_test))
print(len(y_test))





966924
966924
322308
322308
322308
322308


## Fit a baseline neural network with one hidden Dense layer on top of Embedding layer

In [68]:
#set the dimensions of the embedding layer, each word now is a vector in
#embedding_dim-dimensional space
embedding_dim = 100 
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_length))
model.add(Flatten())
#one hidden layer with 32 neurons
model.add(Dense(32, activation='relu'))
#output layer
model.add(Dense(1, activation='sigmoid'))
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                160032    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 2,160,065
Trainable params: 2,160,065
Non-trainable params: 0
_________________________________________________________________


In [69]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

In [73]:
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))

Instructions for updating:
Use tf.cast instead.
Train on 966924 samples, validate on 322308 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [74]:
model.save_weights('simple_model.h5')

In [97]:
print(f'Model 1 accuracy on test set: {model.evaluate(x_test, y_test)[1]}')


Model 1 accuracy on test set: 0.7900145053863525


## Use LSTM hidden layers 

In [128]:
callbacks_LSTM1 = [
        callbacks.EarlyStopping(
            monitor='acc',
            patience=1, #stop training if accuracy has not improved for 2 epochs
        ),
        callbacks.ModelCheckpoint(
            filepath='LSTM_model1.h5',
            monitor='val_loss',
            save_best_only=True,
        )
]

In [134]:
embedding_dim = 100 
lstm_model = Sequential()
lstm_model.add(Embedding(max_words, embedding_dim, input_length=max_length))
lstm_model.add(LSTM(64, return_sequences=True))
lstm_model.add(LSTM(32))
lstm_model.add(Dense(32, activation='relu'))
#output layer
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
lstm_9 (LSTM)                (None, 50, 64)            42240     
_________________________________________________________________
lstm_10 (LSTM)               (None, 32)                12416     
_________________________________________________________________
dense_6 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 2,055,745
Trainable params: 2,055,745
Non-trainable params: 0
_________________________________________________________________


In [135]:
lstm_model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = lstm_model.fit(x_train, y_train, 
                    epochs=10, 
                    batch_size=32, 
                    callbacks=callbacks_LSTM1,
                    validation_data=(x_val, y_val))

Train on 966924 samples, validate on 322308 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [136]:
print(f'Model 1 accuracy on test set: {lstm_model.evaluate(x_test, y_test)[1]}')


Model 1 accuracy on test set: 0.8211400508880615


## Use Bidirectional LSTM layers 

In [172]:
callbacks_LSTM2 = [
        callbacks.EarlyStopping(
            monitor='acc',
            patience=1, #stop training if accuracy has not improved for 2 epochs
        ),
        callbacks.ModelCheckpoint(
            filepath='LSTM_model2.h5',
            monitor='val_loss',
            save_best_only=True,
        )
]

In [182]:
embedding_dim = 100 
lstm_model2 = Sequential()
lstm_model2.add(Embedding(max_words, embedding_dim, input_length=max_length))
lstm_model2.add(Bidirectional(LSTM(64, return_sequences=True)))
lstm_model2.add(Bidirectional(LSTM(32)))
lstm_model2.add(Dense(32, activation='relu'))
#output layer
lstm_model2.add(Dense(1, activation='sigmoid'))
lstm_model2.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 50, 100)           2000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 128)           84480     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_14 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 33        
Total params: 2,127,809
Trainable params: 2,127,809
Non-trainable params: 0
_________________________________________________________________


In [183]:
lstm_model2.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = lstm_model2.fit(x_train, y_train, 
                    epochs=10, 
                    batch_size=32, 
                    callbacks=callbacks_LSTM2,
                    validation_data=(x_val, y_val))

Train on 966924 samples, validate on 322308 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Use Pretrained GloVe Embedding

In [24]:
embeddings_index = {}
f = open('./glove.twitter.27B/glove.twitter.27B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [25]:
max_words = 200000
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [22]:
callbacks_LSTM_pre = [
        callbacks.EarlyStopping(
            monitor='acc',
            patience=1, #stop training if accuracy has not improved for 2 epochs
        ),
        callbacks.ModelCheckpoint(
            filepath='LSTM_model3_pre.h5',
            monitor='val_loss',
            save_best_only=True,
        )
]

lstm_model3 = Sequential()
lstm_model3.add(Embedding(max_words, embedding_dim, input_length=max_length))
lstm_model3.add(LSTM(64, return_sequences=True))
lstm_model3.add(LSTM(32))
lstm_model3.add(Dense(80, activation='relu'))
#output layer
lstm_model3.add(Dense(1, activation='sigmoid'))
lstm_model3.summary()

lstm_model3.layers[0].set_weights([embedding_matrix])
lstm_model3.layers[0].trainable = False

lstm_model3.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
history = lstm_model3.fit(x_train, y_train, 
                    epochs=10, 
                    batch_size=32, 
                    callbacks=callbacks_LSTM_pre,
                    validation_data=(x_val, y_val))


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 100)           10000000  
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 64)            42240     
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_3 (Dense)              (None, 80)                2640      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 81        
Total params: 10,057,377
Trainable params: 10,057,377
Non-trainable params: 0
_________________________________________________________________
Train on 966924 samples, validate on 322308 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
E

In [161]:
print(f'Model 3 accuracy on test set: {lstm_model3.evaluate(x_test, y_test)[1]}')


Model 3 accuracy on test set: 0.8208855986595154


In [66]:
political_tweets = ["You will come in 4th in New Hampshire in February and withdraw in shame. But don't worry... you can use Waze to get back to Massachusetts. You remember Mass., don't you",
                   "You Can NEVER EARN My Vote ! ",
                   "’ll donate 0. Go home.",
                   "You are not a true progressive.",
                   "That’s like throwing cash on the floor and pissing on it!",
                   "'m a continuing contributor and have been since the day she announced. Every Plan that comes out: showing how it is needed, how it is paid for and how it will increase economic growth in a Green Economy - makes me proud to support her all the way to the White House!",
                   "Your toes? Yuck, no thanks",
                   "God bless him . @PeteButtigieg by far the most honest and innovative candidate with real solutions to our Problems  !!!!",
                   "Go Warren I'm proud of you",
                   "You are bad"]
political_tweets_proc = list(map(preprocess, political_tweets))
pol_seqs = tokenizer.texts_to_sequences(political_tweets_proc)
pol_seqs_padded = pad_sequences(pol_seqs, maxlen=max_length)
lstm_model3.predict(pol_seqs_padded)

array([[0.6185163 ],
       [0.41497543],
       [0.44421557],
       [0.869727  ],
       [0.2883256 ],
       [0.8102244 ],
       [0.79652214],
       [0.71278673],
       [0.9978855 ],
       [0.3832898 ]], dtype=float32)

## Add some more related labelled data: 
### Tweets from the GOP debate in 2016

In [4]:
def augment_datasets2(proportion=0.5):
    data = pd.read_csv('./data/trainingandtestdata/train.csv', encoding='utf-8')
    gop_debate = pd.read_csv("./data/trainingandtestdata/Sentiment.csv")
    
    data = data.rename(columns={'0': 'sentiment', '@switchfoot http://twitpic.com/2y1zl - Awww, that\'s a bummer.  You shoulda got David Carr of Third Day to do it. ;D': 'tweet'})
    data_proc = data.sample(frac=proportion, replace=False)
    data_proc = data_proc.loc[:, ['tweet', 'sentiment']]
    data_proc.loc[data_proc['sentiment'] == 4, 'sentiment'] = 1
    
    gop_debate = gop_debate.rename(columns={'text': 'tweet'})
    gop_debate_proc = gop_debate.loc[gop_debate['sentiment'] != 'Neutral', ['tweet', 'sentiment']]
    gop_debate_proc.loc[gop_debate_proc['sentiment'] == 'Positive', 'sentiment'] = 1
    gop_debate_proc.loc[gop_debate_proc['sentiment'] == 'Negative', 'sentiment'] = 0
    
    data_concat = pd.concat([data_proc, gop_debate_proc], ignore_index=True)
    data_concat = data_concat.sample(frac=1, replace=False).reset_index(drop=True)
    return data_concat

In [5]:
augmented_df2 = augment_datasets2(proportion = 1)

In [6]:
print(augmented_df2.shape)
augmented_df2.head()

(1610728, 2)


Unnamed: 0,tweet,sentiment
0,is watchin friends,1
1,I can't stand this heat roll on winter,0
2,@mariancall I'm just glad that you didn't thin...,1
3,@Leelian972 Its not found yet n its extremely ...,0
4,Misses matty poo! he'll get a dutch rudder wh...,0


## Remove stopwords, emoticons, hashtags and mentions

In [8]:
stopwords = []
with open("./data/stopwords.txt") as f:
    lines = f.readlines()
for i in range(1,len(lines)):
    stopwords.append(lines[i].strip())
stopwords

['me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'over',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'only',
 'own',
 'same',
 'so',
 'too',
 'can',
 'will',
 'just',
 'should',
 'now']

In [9]:
emoticons = []
with open("./data/emoticons.txt") as f:
    lines = f.readlines()
for i in range(1,len(lines)):
    emoticons.append(lines[i].strip())
emoticons

[':-@',
 '>:o',
 '>:0',
 'D:<',
 'D:',
 'D8',
 'D;',
 'D=',
 'Dx',
 '>.<',
 '>_<',
 'd:<',
 'd:',
 'd8',
 'd;',
 'd=',
 'dx',
 'v.v',
 ':/',
 ':\\',
 '=/',
 '=\\',
 '>:/',
 '>:\\',
 ':-/',
 ':-\\',
 ':)',
 '(:',
 ';)',
 ';(',
 '(;',
 ');',
 ':-)',
 ':3',
 ':d',
 ':D',
 'xd',
 ":')",
 '^_^',
 '^.^',
 ':]',
 ':}',
 ':p',
 ':b',
 '=p',
 '=b',
 ':-p',
 ':-b',
 '=)',
 ':(',
 '):',
 ":'(",
 ':c',
 ':-(',
 '</3',
 ':[',
 ':{',
 'T.T',
 'o_o',
 'O_O',
 '0_o',
 'o_0',
 '0_O',
 'O_0',
 'o.o',
 'O.O',
 '0.o',
 'o.0',
 ':o',
 ':-o',
 '<3',
 ':p',
 ':b',
 '=p',
 '=b',
 ':-p',
 ':-b',
 ':$']

In [10]:
#function to remove hashtags and mentions
#remove stopwords and emoticons
#trasform everything to lowercase
def preprocess_tweet(tweet):
    tweet_lower = tweet.lower()
    tweet_words = tweet_lower.split()
    toberemoved = []
    for word in tweet_words:
        if word.startswith('@') or word.startswith('#') or word.startswith('http'):
            toberemoved.append(word)
        elif word in stopwords or word in emoticons:
            toberemoved.append(word)
    for word in toberemoved:
        tweet_words.remove(word)

    return ' '.join(tweet_words)


In [10]:
augmented_df2.iloc[:,0] = augmented_df2.iloc[:,0].map(preprocess_tweet)
augmented_df2.head()

Unnamed: 0,tweet,sentiment
0,i kno! u would quit disappearin!!!,1
1,danny i one follower cause i don't let anyone ...,1
2,lmfao. ily guys know. i live ontario. it's bor...,0
3,alright? x,0
4,really never ended special,0


In [11]:
augmented_df2['sentiment'].value_counts()

0    808492
1    802236
Name: sentiment, dtype: int64

In [192]:
#number of words to consider in the dataset
max_words = 300000
tokenizer = Tokenizer(num_words = max_words)
texts = list(augmented_df2['tweet'].values)
#create the token index based on tweets
tokenizer.fit_on_texts(texts)

In [193]:
print('Found %s unique tokens.' % len(tokenizer.word_index))

Found 312520 unique tokens.


In [194]:
sequences1 = tokenizer.texts_to_sequences(texts)
#set the maximum length of each tweet based on dataset
lens1 = [len(x) for x in sequences1]
max_length1 = max(lens1)

padded_seq1 = pad_sequences(sequences1, maxlen=max_length1)
labels1 = augmented_df2['sentiment'].values

train_proportion = 0.6
val_proportion = 0.2


x_train1 = padded_seq1[:int(train_proportion*len(padded_seq1))]
y_train1 = labels1[:int(train_proportion*len(padded_seq1))]

x_val1 = padded_seq1[int(train_proportion*len(padded_seq1)):int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1))]
y_val1 = labels1[int(train_proportion*len(padded_seq1)):int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1))]

x_test1 = padded_seq1[int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1)):]
y_test1 = labels1[int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1)):]


print(len(x_train1))
print(len(y_train1))
print(len(x_val1))
print(len(y_val1))
print(len(x_test1))
print(len(y_test1))


966436
966436
322145
322145
322147
322147


In [195]:
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

            
callbacks_LSTM_pre2 = [
        callbacks.EarlyStopping(
            monitor='acc',
            patience=1, #stop training if accuracy has not improved for 2 epochs
        ),
        callbacks.ModelCheckpoint(
            filepath='LSTM_model4_nostop.h5',
            monitor='val_loss',
            save_best_only=True,
        )
]

lstm_model4 = Sequential()
lstm_model4.add(Embedding(max_words, embedding_dim, input_length=max_length))
lstm_model4.add(LSTM(64, return_sequences=True))
lstm_model4.add(LSTM(32))
lstm_model4.add(Dense(32, activation='relu'))
#output layer
lstm_model4.add(Dense(1, activation='sigmoid'))
lstm_model4.summary()

lstm_model4.layers[0].set_weights([embedding_matrix])
lstm_model4.layers[0].trainable = False

lstm_model4.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
history = lstm_model4.fit(x_train1, y_train1, 
                    epochs=10, 
                    batch_size=32, 
                    callbacks=callbacks_LSTM_pre2,
                    validation_data=(x_val1, y_val1))


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 100)           30000000  
_________________________________________________________________
lstm_9 (LSTM)                (None, 50, 64)            42240     
_________________________________________________________________
lstm_10 (LSTM)               (None, 32)                12416     
_________________________________________________________________
dense_9 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 33        
Total params: 30,055,745
Trainable params: 30,055,745
Non-trainable params: 0
_________________________________________________________________
Train on 966436 samples, validate on 322145 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
E

## Test the behaviour of the classifier on some custom data which resemble the political tweets that we want to predict the sentiment of.

In [229]:
political_tweets = ["You will come in 4th in New Hampshire in February and withdraw in shame. But don't worry... you can use Waze to get back to Massachusetts. You remember Mass., don't you",
                   "You Can NEVER EARN My Vote ! ",
                   "’ll donate 0. Go home.",
                   "You are not a true progressive.",
                   "That’s like throwing cash on the floor and pissing on it!",
                   "'m a continuing contributor and have been since the day she announced. Every Plan that comes out: showing how it is needed, how it is paid for and how it will increase economic growth in a Green Economy - makes me proud to support her all the way to the White House!",
                   "Your toes? Yuck, no thanks",
                   "God bless him . @PeteButtigieg by far the most honest and innovative candidate with real solutions to our Problems  !!!!",
                   "Go Warren I'm proud of you",
                   "You are bad",
                   "you are a true fighter",
                   "you are a liar!",
                   "I do not support you",
                   "I support you!",
                   "Get out there and reach the people!! Out of all the democratic candidates I believe that your message can resonate with everyone. I trust you. Your logic and calm level headed approach reminds me of how politics should be. I want you to succeed.",
                   "It’s not your time Pete. Drop out and run for Governor you’re doing more harm to the party than good.",
                    "Hell No!",
                    "You're amazing. You're qualified, and you're ready",
                    "by far the most honest and innovative candidate with real solutions",
                    "HE IS PROGRESSIVE",
                    "by far the most honest and innovative candidate with real solutions",
                    "Yeah it's called lying she does it very well if you don't remember correctly last year she was a native American off with her head she is the scum of the Earth",
                    "0 votes for you during those elections"
                    "Yes, somehow you war criminals manage to get away with it every time.",
                    "Get 'em Joe."
                   ]
political_tweets_proc = list(map(preprocess_tweet, political_tweets))
pol_seqs = tokenizer.texts_to_sequences(political_tweets_proc)
pol_seqs_padded = pad_sequences(pol_seqs, maxlen=max_length)
lstm_model4.predict(pol_seqs_padded)

array([[0.3400808 ],
       [0.03073052],
       [0.6530683 ],
       [0.3990408 ],
       [0.63873905],
       [0.9362867 ],
       [0.21128505],
       [0.68271136],
       [0.9712548 ],
       [0.12916476],
       [0.8177171 ],
       [0.29179707],
       [0.02000201],
       [0.92986953],
       [0.7166861 ],
       [0.10063127],
       [0.12219265],
       [0.9922502 ],
       [0.93887365],
       [0.6897486 ],
       [0.93887365],
       [0.3357073 ],
       [0.5074039 ],
       [0.9185116 ]], dtype=float32)

In [213]:
print(f'Model 4 accuracy on test set: {lstm_model4.evaluate(x_test1, y_test1)[1]}')


Model 4 accuracy on test set: 0.8087487816810608


In [234]:
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

## Reduce the model parameters by decreasing the max_words of the tokenizer to 200k

In [12]:
#number of words to consider in the dataset
max_words = 200000
tokenizer = Tokenizer(num_words = max_words)
texts = list(augmented_df2['tweet'].values)
#create the token index based on tweets
tokenizer.fit_on_texts(texts)

In [13]:
sequences1 = tokenizer.texts_to_sequences(texts)
#set the maximum length of each tweet based on dataset
lens1 = [len(x) for x in sequences1]
max_length1 = max(lens1)

padded_seq1 = pad_sequences(sequences1, maxlen=max_length1)
labels1 = augmented_df2['sentiment'].values

train_proportion = 0.6
val_proportion = 0.2


x_train1 = padded_seq1[:int(train_proportion*len(padded_seq1))]
y_train1 = labels1[:int(train_proportion*len(padded_seq1))]

x_val1 = padded_seq1[int(train_proportion*len(padded_seq1)):int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1))]
y_val1 = labels1[int(train_proportion*len(padded_seq1)):int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1))]

x_test1 = padded_seq1[int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1)):]
y_test1 = labels1[int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1)):]


print(len(x_train1))
print(len(y_train1))
print(len(x_val1))
print(len(y_val1))
print(len(x_test1))
print(len(y_test1))

966436
966436
322145
322145
322147
322147


## Fit a unidirectional LSTM netwotk with pretrained embedding layer

In [16]:
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

            
callbacks_LSTM_pre3 = [
        callbacks.EarlyStopping(
            monitor='acc',
            patience=1, #stop training if accuracy has not improved for 2 epochs
        ),
        callbacks.ModelCheckpoint(
            filepath='LSTM_model5_nostop.h5',
            monitor='val_loss',
            save_best_only=True,
        )
]

lstm_model5 = Sequential()
lstm_model5.add(Embedding(max_words, embedding_dim, input_length=max_length1))
lstm_model5.add(LSTM(64, return_sequences=True))
lstm_model5.add(LSTM(32))
lstm_model5.add(Dense(32, activation='relu'))
#output layer
lstm_model5.add(Dense(1, activation='sigmoid'))
lstm_model5.summary()

lstm_model5.layers[0].set_weights([embedding_matrix])
lstm_model5.layers[0].trainable = False

lstm_model5.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
history = lstm_model5.fit(x_train1, y_train1, 
                    epochs=10, 
                    batch_size=32, 
                    callbacks=callbacks_LSTM_pre3,
                    validation_data=(x_val1, y_val1))

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           20000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 64)            42240     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 20,055,745
Trainable params: 20,055,745
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.ca

In [18]:
political_tweets = ["You will come in 4th in New Hampshire in February and withdraw in shame. But don't worry... you can use Waze to get back to Massachusetts. You remember Mass., don't you",
                   "You Can NEVER EARN My Vote ! ",
                   "’ll donate 0. Go home.",
                   "You are not a true progressive.",
                   "That’s like throwing cash on the floor and pissing on it!",
                   "'m a continuing contributor and have been since the day she announced. Every Plan that comes out: showing how it is needed, how it is paid for and how it will increase economic growth in a Green Economy - makes me proud to support her all the way to the White House!",
                   "Your toes? Yuck, no thanks",
                   "God bless him . @PeteButtigieg by far the most honest and innovative candidate with real solutions to our Problems  !!!!",
                   "Go Warren I'm proud of you",
                   "You are bad",
                   "you are a true fighter",
                   "you are a liar!",
                   "I do not support you",
                   "I support you!",
                   "Get out there and reach the people!! Out of all the democratic candidates I believe that your message can resonate with everyone. I trust you. Your logic and calm level headed approach reminds me of how politics should be. I want you to succeed.",
                   "It’s not your time Pete. Drop out and run for Governor you’re doing more harm to the party than good.",
                    "Hell No!",
                    "You're amazing. You're qualified, and you're ready",
                    "by far the most honest and innovative candidate with real solutions",
                    "HE IS PROGRESSIVE",
                    "by far the most honest and innovative candidate with real solutions",
                    "Yeah it's called lying she does it very well if you don't remember correctly last year she was a native American off with her head she is the scum of the Earth",
                    "0 votes for you during those elections"
                    "Yes, somehow you war criminals manage to get away with it every time.",
                    "Get 'em Joe."
                   ]
political_tweets_proc = list(map(preprocess_tweet, political_tweets))
pol_seqs = tokenizer.texts_to_sequences(political_tweets_proc)
pol_seqs_padded = pad_sequences(pol_seqs, maxlen=max_length1)
lstm_model5.predict(pol_seqs_padded)

array([[0.36327988],
       [0.01246384],
       [0.6152877 ],
       [0.38871288],
       [0.10964805],
       [0.92565966],
       [0.08628792],
       [0.9773206 ],
       [0.99119425],
       [0.14640832],
       [0.92075986],
       [0.3329754 ],
       [0.07705918],
       [0.986609  ],
       [0.8022307 ],
       [0.43946102],
       [0.14107105],
       [0.96037275],
       [0.9534604 ],
       [0.86061513],
       [0.9534604 ],
       [0.51057696],
       [0.35316393],
       [0.8803505 ]], dtype=float32)

In [20]:
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer_200k.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))