In [22]:
import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [23]:
def save_tokenizer(tokenizer):
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_tokenizer():
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
        return tokenizer

In [14]:
data = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')
data = data[['target', 'text']]

In [15]:
data

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [16]:
accepted_chars = 'abcdefghijklmnopqrstuvwxyz '

def normalize(line):
    new_chars = [c.lower() for c in line if c.lower() in accepted_chars]
    return ''.join(new_chars)

In [17]:
for idx,row in data.iterrows():
    # print(data.at[idx, 'text'])
    data.at[idx, 'text'] = normalize(data.at[idx, 'text'].replace('rt', ' '))
    ## row[1] = normalize(row[1].replace('rt',' '))

In [18]:
data

Unnamed: 0,target,text
0,0,switchfoot httptwitpiccomyzl awww thats a bum...
1,0,is upset that he cant update his facebook by t...
2,0,kenichan i dived many times for the ball manag...
3,0,my whole body feels itchy and like its on fire
4,0,nationwideclass no its not behaving at all im ...
...,...,...
1599995,4,just woke up having no school is the best feel...
1599996,4,thewdbcom very cool to hear old walt intervie...
1599997,4,are you ready for your mojo makeover ask me fo...
1599998,4,happy th bi hday to my boo of alll time tupac ...


In [19]:
max_features = 2000

In [20]:
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [11]:
X

array([[   0,    0,    0, ...,   40,    9,  386],
       [   0,    0,    0, ...,   42,  264, 1203],
       [   0,    0,    0, ...,   38,   34,   12],
       ...,
       [   0,    0,    0, ...,  612,   15,   10],
       [   0,    0,    0, ...,  503,   12,   50],
       [   0,    0,    0, ...,    0,    0,  117]], dtype=int32)

In [8]:
batch_size = 32
embed_dim = 128
lstm_out = 196
max_features = 2000

In [14]:
model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# model = Sequential()
# model.add(Embedding(max_features, 128, input_length = X.shape[1]))
# model.add(LSTM(128))
# model.add(Dense(2, activation='softmax'))

# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 40, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
Y = pd.get_dummies(data['target']).values

In [16]:
Y

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]], dtype=uint8)

In [17]:
Y = pd.get_dummies(data['target']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 22)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(1072000, 40) (1072000, 2)
(528000, 40) (528000, 2)


In [18]:
model.fit(X_train, Y_train, epochs=10, batch_size=batch_size, verbose = 2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
1072000/1072000 - 1346s - loss: 0.4499 - acc: 0.7879
Epoch 2/10
1072000/1072000 - 1305s - loss: 0.4263 - acc: 0.8020
Epoch 3/10
1072000/1072000 - 1304s - loss: 0.4198 - acc: 0.8056
Epoch 4/10
1072000/1072000 - 1304s - loss: 0.4164 - acc: 0.8075
Epoch 5/10
1072000/1072000 - 1304s - loss: 0.4146 - acc: 0.8089
Epoch 6/10
1072000/1072000 - 1304s - loss: 0.4137 - acc: 0.8092
Epoch 7/10
1072000/1072000 - 1507s - loss: 0.4131 - acc: 0.8095
Epoch 8/10
1072000/1072000 - 1472s - loss: 0.4127 - acc: 0.8096
Epoch 9/10
1072000/1072000 - 1538s - loss: 0.4126 - acc: 0.8099
Epoch 10/10
1072000/1072000 - 1564s - loss: 0.4125 - acc: 0.8100


<tensorflow.python.keras.callbacks.History at 0x7fe87192dc10>

In [19]:
model.save('trained_model.h5')

In [5]:
model = load_model('trained_model.h5')

In [15]:
validation_size = 500000

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

28000/1 - 5s - loss: 0.3611 - accuracy: 0.8103
score: 0.41
acc: 0.81


In [21]:
test_val = tokenizer.texts_to_sequences(['I love Bernie Sanders'])
# test_val
test_val = pad_sequences(test_val, maxlen=40)
test_val
prediction = model.predict(test_val)
prediction

array([[0.08736066, 0.9126394 ]], dtype=float32)

In [24]:
save_tokenizer(tokenizer)