In [2]:
import numpy as np
import pandas as pd
import re
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout
from sklearn.model_selection import train_test_split
import keras
import keras_metrics as km

Using TensorFlow backend.


In [3]:
dataset = pd.read_csv('Sentiment.csv')

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13871 entries, 0 to 13870
Data columns (total 21 columns):
id                           13871 non-null int64
candidate                    13775 non-null object
candidate_confidence         13871 non-null float64
relevant_yn                  13871 non-null object
relevant_yn_confidence       13871 non-null float64
sentiment                    13871 non-null object
sentiment_confidence         13871 non-null float64
subject_matter               13545 non-null object
subject_matter_confidence    13871 non-null float64
candidate_gold               28 non-null object
name                         13871 non-null object
relevant_yn_gold             32 non-null object
retweet_count                13871 non-null int64
sentiment_gold               15 non-null object
subject_matter_gold          18 non-null object
text                         13871 non-null object
tweet_coord                  21 non-null object
tweet_created                13871 no

In [4]:
dataset = dataset[['text','sentiment']]
dataset = dataset[dataset.sentiment != "Neutral"]

In [5]:
dataset.head()

Unnamed: 0,text,sentiment
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
5,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",Positive
6,RT @warriorwoman91: I liked her and was happy ...,Negative


In [6]:
dataset.shape

(10729, 2)

In [7]:
dataset['text'] = dataset['text'].apply(lambda x: x.lower())
dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'[\W_]+', ' ', x))
dataset['text'] = dataset['text'].apply(lambda x: re.sub('rt','',x))
dataset['text'] = dataset['text'].apply(lambda x: x.lstrip())

In [8]:
dataset['sentiment'] = dataset['sentiment'].apply(lambda x: re.sub('Positive','1', x))
dataset['sentiment'] = dataset['sentiment'].apply(lambda x: re.sub('Negative','0', x))

In [9]:
dataset.head()

Unnamed: 0,text,sentiment
1,scottwalker didn t catch the full gopdebate la...,1
3,robgeorge that carly fiorina is trending hours...,1
4,danscavino gopdebate w realdonaldtrump deliver...,1
5,gregabbott tx tedcruz on my first day i will r...,1
6,warriorwoman91 i liked her and was happy when ...,0


In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [11]:
max_words = 2000

In [12]:
tokenizer = Tokenizer(num_words = max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' ')

In [13]:
tokenizer.fit_on_texts(dataset['text'].values)

In [14]:
x_tokenizer = tokenizer.texts_to_sequences(dataset['text'].values)

In [15]:
x_tokenizer = pad_sequences(x_tokenizer)

In [16]:
x_tokenizer

array([[   0,    0,    0, ...,   13,    4,   11],
       [   0,    0,    0, ...,  154,   14,   22],
       [   0,    0,    0, ...,   13,    4,   11],
       ...,
       [   0,    0,    0, ...,   77,   71,    3],
       [   0,    0,    0, ..., 1015, 1320,   79],
       [   0,    0,    0, ...,    4,   11,  716]])

In [17]:
y = np.array(dataset['sentiment'])

In [18]:
y

array(['1', '1', '1', ..., '1', '0', '1'], dtype=object)

# Data preprocessing is complete

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x_tokenizer, y, test_size = 0.33, random_state = 0)

In [20]:
x_train.shape

(7188, 30)

# Dataset has been split in training set and test set

# Time to build a NN using RNN and LSTM

In [81]:
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

In [82]:
filepath="weights.best.hdf5"
MC = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
ES = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='min')
callbacks_list = [MC, ES]

In [83]:
model = Sequential()
model.add(Embedding(input_dim = max_words, output_dim = 128, embeddings_initializer = 'uniform',input_length = x_train.shape[1]))

In [84]:
model.add(Dropout(rate = 0.2))

In [85]:
model.add(LSTM(units = 200, return_sequences=False, dropout = 0.2, recurrent_dropout = 0.2))

In [86]:
##model.add(LSTM(units = 64, return_sequences=False))
##model.add(Dropout(rate = 0.3))

In [87]:
model.add(Dense(units = 10, activation = 'relu'))

In [88]:
model.add(Dense(units = 1, activation = 'sigmoid'))

In [89]:
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [90]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 30, 128)           256000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 30, 128)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 200)               263200    
_________________________________________________________________
dense_11 (Dense)             (None, 10)                2010      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 11        
Total params: 521,221
Trainable params: 521,221
Non-trainable params: 0
_________________________________________________________________


In [91]:
model.fit(x_train, y_train, epochs = 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x22a60df7d30>

In [92]:
model.evaluate(x_test, y_test)



[1.002558620789148, 0.82942671563389]