In [66]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, CuDNNLSTM, CuDNNGRU, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [73]:
data = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')
data = data[['target', 'text']]

In [74]:
data

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [75]:
accepted_chars = 'abcdefghijklmnopqrstuvwxyz '

def normalize(line):
    new_chars = [c.lower() for c in line if c.lower() in accepted_chars]
    return ''.join(new_chars)

In [76]:
for idx,row in data.iterrows():
    # print(data.at[idx, 'text'])
    data.at[idx, 'text'] = normalize(data.at[idx, 'text'].replace('rt', ' '))
    ## row[1] = normalize(row[1].replace('rt',' '))

In [77]:
data

Unnamed: 0,target,text
0,0,switchfoot httptwitpiccomyzl awww thats a bum...
1,0,is upset that he cant update his facebook by t...
2,0,kenichan i dived many times for the ball manag...
3,0,my whole body feels itchy and like its on fire
4,0,nationwideclass no its not behaving at all im ...
...,...,...
1599995,4,just woke up having no school is the best feel...
1599996,4,thewdbcom very cool to hear old walt intervie...
1599997,4,are you ready for your mojo makeover ask me fo...
1599998,4,happy th bi hday to my boo of alll time tupac ...


In [78]:
max_features = 2000

In [79]:
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [80]:
X

array([[   0,    0,    0, ...,   40,    9,  386],
       [   0,    0,    0, ...,   42,  264, 1203],
       [   0,    0,    0, ...,   38,   34,   12],
       ...,
       [   0,    0,    0, ...,  612,   15,   10],
       [   0,    0,    0, ...,  503,   12,   50],
       [   0,    0,    0, ...,    0,    0,  117]], dtype=int32)

In [82]:
model = Sequential()
model.add(Embedding(max_features, 128, input_length = X.shape[1]))
model.add(CuDNNLSTM(128))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 40, 128)           256000    
_________________________________________________________________
cu_dnnlstm_11 (CuDNNLSTM)    (None, 128)               132096    
_________________________________________________________________
dense_13 (Dense)             (None, 2)                 258       
Total params: 388,354
Trainable params: 388,354
Non-trainable params: 0
_________________________________________________________________
None


In [83]:
Y = pd.get_dummies(data['target']).values

In [84]:
Y

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]], dtype=uint8)

In [130]:
Y = pd.get_dummies(data['target']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.90, random_state = 22)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(160000, 40) (160000, 2)
(1440000, 40) (1440000, 2)


In [86]:
batch_size = 1000
model.fit(X_train, Y_train, epochs=100, batch_size=batch_size, verbose = 2)

Epoch 1/100
1072000/1072000 - 27s - loss: 0.4612 - acc: 0.7804
Epoch 2/100
1072000/1072000 - 25s - loss: 0.4291 - acc: 0.7996
Epoch 3/100
1072000/1072000 - 27s - loss: 0.4196 - acc: 0.8049
Epoch 4/100
1072000/1072000 - 29s - loss: 0.4116 - acc: 0.8093
Epoch 5/100
1072000/1072000 - 29s - loss: 0.4044 - acc: 0.8131
Epoch 6/100
1072000/1072000 - 29s - loss: 0.3976 - acc: 0.8170
Epoch 7/100
1072000/1072000 - 29s - loss: 0.3910 - acc: 0.8206
Epoch 8/100
1072000/1072000 - 29s - loss: 0.3848 - acc: 0.8241
Epoch 9/100
1072000/1072000 - 29s - loss: 0.3787 - acc: 0.8273
Epoch 10/100
1072000/1072000 - 29s - loss: 0.3725 - acc: 0.8308
Epoch 11/100
1072000/1072000 - 29s - loss: 0.3663 - acc: 0.8341
Epoch 12/100
1072000/1072000 - 29s - loss: 0.3598 - acc: 0.8373
Epoch 13/100
1072000/1072000 - 29s - loss: 0.3531 - acc: 0.8406
Epoch 14/100
1072000/1072000 - 29s - loss: 0.3458 - acc: 0.8444
Epoch 15/100
1072000/1072000 - 29s - loss: 0.3385 - acc: 0.8483
Epoch 16/100
1072000/1072000 - 29s - loss: 0.3306

<tensorflow.python.keras.callbacks.History at 0x7eff27587ed0>

In [131]:
validation_size = 800000

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

640000/640000 - 6s - loss: 0.4869 - acc: 0.8964
score: 0.49
acc: 0.90
