In [40]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models, Model
from keras import layers
from keras import regularizers
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Input
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints

In [41]:
NB_WORDS = 50000  # Parameter indicating the number of words we'll put in the dictionary
NB_START_EPOCHS = 32  # Number of epochs we usually start to train with
BATCH_SIZE = 128  #

In [42]:
df = pd.read_csv('Tweets.csv')
df = df.reindex(np.random.permutation(df.index))  
df = df[['text', 'airline_sentiment']]
df.head()

Unnamed: 0,text,airline_sentiment
668,@united 1k and had problem getting out of FLL ...,negative
319,@virginamerica may start service to Hawaii fro...,neutral
718,@united LHR arrival lounge #fail. Waited 20 mi...,negative
12694,@AmericanAir of delays and trapped on planes w...,negative
12918,@AmericanAir you should really explain custome...,negative


In [43]:
def remove_stopwords(input_text):
        stopwords_list = stopwords.words('english')
        # Some words which might indicate a certain sentiment are kept via a whitelist
        whitelist = ["n't", "not", "no"]
        words = input_text.split() 
        clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
        return " ".join(clean_words) 

In [44]:
def remove_mentions(input_text):
        return re.sub(r'@\w+', '', input_text)

In [45]:
df.text = df.text.apply(remove_stopwords).apply(remove_mentions)
df.head()

Unnamed: 0,text,airline_sentiment
668,1k problem getting FLL IAH sent DM making con...,negative
319,may start service Hawaii #SanFrancisco year h...,neutral
718,LHR arrival lounge #fail. Waited 20 mins show...,negative
12694,delays trapped planes no water air. Never eve...,negative
12918,really explain customer service gate agent 11...,negative


In [46]:
X_train, X_test, y_train, y_test = train_test_split(df.text, df.airline_sentiment, test_size=0.1, random_state=37)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# Train data samples: 13176
# Test data samples: 1464


In [47]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [48]:
X_test = np.array(X_test)
y_test = np.array(y_test)

In [49]:
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ", 
               char_level=False)
tk.fit_on_texts(X_train)

In [50]:
tokenizer = TreebankWordTokenizer()
training = []
for i in range(len(X_train)):
    training.append(tokenizer.tokenize(X_train[i]))
for i in range(len(training)):
    training[i] = [x.lower() for x in training[i]]

In [51]:
tokenizer = TreebankWordTokenizer()
test = []
for i in range(len(X_test)):
    test.append(tokenizer.tokenize(X_test[i]))
for i in range(len(test)):
    test[i] = [x.lower() for x in test[i]]

In [13]:
embeddings_index = {}
with open('glove.6B.300d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        
        coefs = [float(i) for i in values[1:]]
        embeddings_index[word] = coefs

embeddings_index['<PAD>'] = [0] * 300
embeddings_index['<UNK>'] = [1] * 300

In [52]:
punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
         '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
train_X =[]
for i in range(len(training)):
    sentence = []
    for j in range(len(training[i])):
        if training[i][j] in punct:
            pass
        else:
            sentence.append(training[i][j])
    train_X.append(sentence)

In [53]:
test_X =[]
for i in range(len(test)):
    sentence = []
    for j in range(len(test[i])):
        if test[i][j] in punct:
            pass
        else:
            sentence.append(test[i][j])
    test_X.append(sentence)

In [54]:
MAX_SEQ = 20
for s in range(len(train_X)):
    n = MAX_SEQ - len(train_X[s])
    if n < 0:
        train_X[s] = train_X[s][:MAX_SEQ]
    else:
        for i in range(n):
            train_X[s].append('<PAD>')
    for v in range(len(train_X[s])):
        if train_X[s][v] not in embeddings_index:
            train_X[s][v] = embeddings_index['<UNK>']
        else:
            train_X[s][v] = embeddings_index[train_X[s][v]]

In [55]:
MAX_SEQ = 20
for s in range(len(test_X)):
    n = MAX_SEQ - len(test_X[s])
    if n < 0:
        test_X[s] = test_X[s][:MAX_SEQ]
    else:
        for i in range(n):
            test_X[s].append('<PAD>')
    for v in range(len(test_X[s])):
        if test_X[s][v] not in embeddings_index:
            test_X[s][v] = embeddings_index['<UNK>']
        else:
            test_X[s][v] = embeddings_index[test_X[s][v]]

In [56]:
reverse_word_map = dict(map(reversed, tk.word_index.items()))

In [57]:
X_train[1520]

" I've line half hour trying see representative, might even miss next flight too, unacceptable"

In [58]:
train_X = np.array(train_X)
train_X.shape

(13176, 20, 300)

In [59]:
test_X = np.array(test_X)
test_X.shape

(1464, 20, 300)

In [60]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

print('"{}" is converted into {}'.format(y_train[2], y_train_le[1]))
print('"{}" is converted into {}'.format(y_train_le[1], y_train_oh[1]))

"negative" is converted into 1
"1" is converted into [0. 1. 0.]


In [61]:
input_layer = Input( batch_shape = (None, MAX_SEQ, 300))
lstm_layer = Bidirectional(LSTM(units=MAX_SEQ, dropout = 0.25, recurrent_dropout=0.25))(input_layer)
x = Dropout(0.25)(lstm_layer)
merged = Dense(units=20, activation='relu')(x)
merged = Dropout(0.25)(merged)
merged = BatchNormalization()(merged)
output_layer = Dense(3, activation="softmax")(merged)

model = Model(inputs=input_layer, outputs=output_layer)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 20, 300)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 40)                51360     
_________________________________________________________________
dropout_5 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 20)                820       
_________________________________________________________________
dropout_6 (Dropout)          (None, 20)                0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 20)                80        
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 63        
Total para

In [62]:
model.compile(optimizer='adam'
              , loss='categorical_crossentropy'
              , metrics=['accuracy'])

In [63]:
# model.load_weights('weight_twitter_embedding.32.hdf5')

In [64]:
checkpoint = ModelCheckpoint('weight_twitter_embedding.{epoch:02d}.hdf5', monitor='val_loss', save_best_only=True)
tb = TensorBoard(log_dir='./Graph', histogram_freq=0,  
          write_graph=True, write_images=True)
es = EarlyStopping(monitor='val_loss', 
                   min_delta=0.01,
                   patience=3,
                   verbose=0,
                   mode='auto')
callbacks_list = [checkpoint, tb]

In [65]:
X_valid = train_X[:10*BATCH_SIZE]
Y_valid = y_train_oh[:10*BATCH_SIZE]
train_X = train_X[10*BATCH_SIZE:]
y_train_oh = y_train_oh[10*BATCH_SIZE:]

In [77]:
history = model.fit(train_X, y_train_oh, validation_data=(X_valid, Y_valid), callbacks=callbacks_list, epochs=NB_START_EPOCHS
                       , batch_size=BATCH_SIZE)

Train on 11896 samples, validate on 1280 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


In [78]:
max(history.history['val_acc'])

0.809375

In [86]:
model.load_weights('weight_twitter_embedding.26.hdf5')

In [87]:
model.evaluate(test_X, y_test_oh)



[0.5831101175214424, 0.8005464484131402]

In [88]:
def prep_data(inp):
    punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
         '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
#     neg = negate_sequence(inp)
    for i in punct:
        inp.replace(i, '')
    tokenizer = TreebankWordTokenizer()
    inp = tokenizer.tokenize(inp)
    MAX_SEQ = 20
    n = MAX_SEQ - len(inp)
    if n < 0:
        inp = inp[:MAX_SEQ]
    else:
        for i in range(n):
            inp.append('<PAD>')
    for v in range(len(inp)):
        if inp[v] not in embeddings_index:
            inp[v] = embeddings_index['<UNK>']
        else:
            inp[v] = embeddings_index[inp[v]]
    return np.reshape(np.array(inp) , (1 , 20 , 300))

In [159]:
# def negate_sequence(text):
#     negation = False
#     tk = TreebankWordTokenizer()
#     negs = ["not", "n't", "no"]
#     words = tk.tokenize(text)
#     for word in words:
#         word = word.lower()
#         if word in negs:
#             negation = not negation
#     return negation

In [160]:
# def classify(sent):
#     neg = negate_sequence(sent)
#     prep = prep_data(sent)
#     arr = model.predict(prep)
#     print(arr)
#     argmax = arr.argmax()
#     if argmax == 0 and not neg:
#         print("negative")
#     elif argmax == 1:
#         print("neutral")
#     elif argmax == 2:
#         print("positive")
#     elif argmax == 0 and neg:
#         tk = TreebankWordTokenizer()
#         words = tk.tokenize(sent)
#         negs = ["not", "n't", "no"]
#         sent2 = []
#         for word in words:
#             if word not in negs:
#                 sent2.append(word)
#         new = ' '.join(sent2)
#         new_prep = prep_data(new)
#         new_argmax = model.predict(new_prep).argmax()
#         if new_argmax == argmax:
#             print('positive')
#         else:
#             print('negative')

In [173]:
sent = "our airplane's door was not working and we were stucked in there for hours"
model.predict(prep_data(sent))

array([[0.7832205 , 0.04439801, 0.17238142]], dtype=float32)