In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM, Bidirectional
from keras import optimizers
from keras import losses
from keras import metrics
from keras import callbacks

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import tokenizer_from_json


import io
import json

In [13]:
import sentiment as sent

## Load the Data

In [2]:
def augment_datasets2(proportion=0.5):
    data = pd.read_csv('./data/trainingandtestdata/train.csv', encoding='utf-8')
    gop_debate = pd.read_csv("./data/trainingandtestdata/Sentiment.csv")
    
    data = data.rename(columns={'0': 'sentiment', '@switchfoot http://twitpic.com/2y1zl - Awww, that\'s a bummer.  You shoulda got David Carr of Third Day to do it. ;D': 'tweet'})
    data_proc = data.sample(frac=proportion, replace=False)
    data_proc = data_proc.loc[:, ['tweet', 'sentiment']]
    data_proc.loc[data_proc['sentiment'] == 4, 'sentiment'] = 1
    
    gop_debate = gop_debate.rename(columns={'text': 'tweet'})
    gop_debate_proc = gop_debate.loc[gop_debate['sentiment'] != 'Neutral', ['tweet', 'sentiment']]
    gop_debate_proc.loc[gop_debate_proc['sentiment'] == 'Positive', 'sentiment'] = 1
    gop_debate_proc.loc[gop_debate_proc['sentiment'] == 'Negative', 'sentiment'] = 0
    
    data_concat = pd.concat([data_proc, gop_debate_proc], ignore_index=True)
    data_concat = data_concat.sample(frac=1, replace=False).reset_index(drop=True)
    return data_concat

In [3]:
augmented_df2 = augment_datasets2(proportion = 1)

In [4]:
print(augmented_df2.shape)
augmented_df2.head()

(1610728, 2)


Unnamed: 0,tweet,sentiment
0,I dont know what i did,0
1,@MajaPiraja are you looking for a home-based j...,1
2,@legallove Yay! Bring your dancing shoes - it'...,1
3,"@AndrewMoriarty LOL well, I'm pro life, yet I'...",1
4,@shweri we are not mad frank and i both test ...,1


## Remove stopwords, emoticons, hashtags and mentions

In [6]:
stopwords = []
with open("./data/stopwords.txt") as f:
    lines = f.readlines()
for i in range(1,len(lines)):
    stopwords.append(lines[i].strip())
stopwords

['me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'over',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'only',
 'own',
 'same',
 'so',
 'too',
 'can',
 'will',
 'just',
 'should',
 'now']

In [7]:
emoticons = []
with open("./data/emoticons.txt") as f:
    lines = f.readlines()
for i in range(1,len(lines)):
    emoticons.append(lines[i].strip())
emoticons

[':-@',
 '>:o',
 '>:0',
 'D:<',
 'D:',
 'D8',
 'D;',
 'D=',
 'Dx',
 '>.<',
 '>_<',
 'd:<',
 'd:',
 'd8',
 'd;',
 'd=',
 'dx',
 'v.v',
 ':/',
 ':\\',
 '=/',
 '=\\',
 '>:/',
 '>:\\',
 ':-/',
 ':-\\',
 ':)',
 '(:',
 ';)',
 ';(',
 '(;',
 ');',
 ':-)',
 ':3',
 ':d',
 ':D',
 'xd',
 ":')",
 '^_^',
 '^.^',
 ':]',
 ':}',
 ':p',
 ':b',
 '=p',
 '=b',
 ':-p',
 ':-b',
 '=)',
 ':(',
 '):',
 ":'(",
 ':c',
 ':-(',
 '</3',
 ':[',
 ':{',
 'T.T',
 'o_o',
 'O_O',
 '0_o',
 'o_0',
 '0_O',
 'O_0',
 'o.o',
 'O.O',
 '0.o',
 'o.0',
 ':o',
 ':-o',
 '<3',
 ':p',
 ':b',
 '=p',
 '=b',
 ':-p',
 ':-b',
 ':$']

In [8]:
#function to remove hashtags and mentions
#remove stopwords and emoticons
#trasform everything to lowercase
def preprocess_tweet(tweet):
    tweet_lower = tweet.lower()
    tweet_words = tweet_lower.split()
    toberemoved = []
    for word in tweet_words:
        if word.startswith('@') or word.startswith('#') or word.startswith('http'):
            toberemoved.append(word)
        elif word in stopwords or word in emoticons:
            toberemoved.append(word)
    for word in toberemoved:
        tweet_words.remove(word)

    return ' '.join(tweet_words)

In [9]:
augmented_df2.iloc[:,0] = augmented_df2.iloc[:,0].map(preprocess_tweet)
augmented_df2.head()

Unnamed: 0,tweet,sentiment
0,i dont know i,0
1,looking home-based job? would like offer servi...,1
2,yay! bring dancing shoes - it's gone hardcore ...,1
3,"lol well, i'm pro life, yet i'm christian, roo...",1
4,"not mad frank i test new ui recently, very pre...",1


In [10]:
augmented_df2['sentiment'].value_counts()

0    808492
1    802236
Name: sentiment, dtype: int64

## Tokenize the tweets and create training/test sets

In [16]:
#load tokenizer
with open('./data/tokenizer_200k.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

In [19]:
max_words = 200000
max_length = 50

sequences1 = tokenizer.texts_to_sequences(texts)
#set the maximum length of each tweet based on dataset

padded_seq1 = pad_sequences(sequences1, maxlen=max_length)
labels1 = augmented_df2['sentiment'].values

train_proportion = 0.6
val_proportion = 0.2


x_train1 = padded_seq1[:int(train_proportion*len(padded_seq1))]
y_train1 = labels1[:int(train_proportion*len(padded_seq1))]

x_val1 = padded_seq1[int(train_proportion*len(padded_seq1)):int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1))]
y_val1 = labels1[int(train_proportion*len(padded_seq1)):int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1))]

x_test1 = padded_seq1[int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1)):]
y_test1 = labels1[int(train_proportion*len(padded_seq1))+int(val_proportion*len(padded_seq1)):]


print(len(x_train1))
print(len(y_train1))
print(len(x_val1))
print(len(y_val1))
print(len(x_test1))
print(len(y_test1))

966436
966436
322145
322145
322147
322147


## Compile the model and load pre-trained neural network

In [20]:
embedding_dim = 100

lstm_model4 = Sequential()
lstm_model4.add(Embedding(max_words, embedding_dim, input_length=max_length))
lstm_model4.add(LSTM(64, return_sequences=True))
lstm_model4.add(LSTM(32))
lstm_model4.add(Dense(32, activation='relu'))
#output layer
lstm_model4.add(Dense(1, activation='sigmoid'))
lstm_model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

lstm_model4.load_weights('LSTM_model5_nostop.h5')

## Test on training, validation and test set 

In [21]:
train_score = lstm_model4.evaluate(x_train1, y_train1)



In [23]:
print(f"Train accuracy = {train_score[1]}")

Train accuracy = 0.8201515674591064


In [24]:
val_score = lstm_model4.evaluate(x_val1, y_val1)



In [25]:
print(f"Validation accuracy = {val_score[1]}")

Validation accuracy = 0.8209471106529236


In [26]:
test_score = lstm_model4.evaluate(x_test1, y_test1)



In [27]:
print(f"Test accuracy = {test_score[1]}")

Test accuracy = 0.8212027549743652
