In [1]:
import pandas as pd

# Load the tweet data; nrows caps the read at the first 50,000 data rows of the CSV.
data = pd.read_csv(
    '../sentiment140-subset_doc.csv',
    nrows=50000,
)

In [2]:
# List the column labels of the loaded frame.
data.columns


Index(['polarity', 'text'], dtype='object')

In [13]:
# Preview the first few rows of the loaded frame.
data.head()

Unnamed: 0,polarity,text
0,0,@kconsidder You never tweet
1,0,Sick today coding from the couch.
2,1,"@ChargerJenn Thx for answering so quick,I was ..."
3,1,Wii fit says I've lost 10 pounds since last ti...
4,0,@MrKinetik Not a thing!!! I don't really have...


In [3]:
import re
import tensorflow as tf

# Vocabulary cap shared by the Tokenizer (as num_words) and the Embedding
# layer (as its input dimension), so the two must stay in sync.
max_features = 4_000


In [4]:
# Fit a word-level tokenizer on every tweet, with num_words limited to max_features.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=max_features,
    split=' ',
)
tokenizer.fit_on_texts(data['text'].values)

# Encode each tweet as a list of integer token ids, then pad the lists into one
# rectangular integer matrix X (pad_sequences is called with its defaults).
X = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(data['text'].values)
)

In [5]:
# Dimensions of the padded sequence matrix: (number of tweets, padded length).
X.shape

(50000, 35)

In [6]:
# Inspect one padded, integer-encoded tweet (the row at index 49999).
X[49999]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,  297,  326,   20, 2994,   45,  970,
        124,  129])

In [7]:
# Model hyper-parameters: embedding width and LSTM units per direction.
embed_dim = 256
lstm_out = 196

# Embedding -> spatial dropout -> bidirectional LSTM -> 2-way softmax.
# The Embedding is sized from the tokenizer's vocabulary cap (max_features) and
# the padded sequence length (X.shape[1]); the two output units correspond to
# the two one-hot label columns the model is trained against.
# NOTE(review): per the Keras LSTM docs, recurrent_dropout > 0 appears to rule
# out the fast cuDNN kernel, which can make GPU training slow — confirm this
# trade-off is intended.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features, embed_dim, input_length=X.shape[1]),
    tf.keras.layers.SpatialDropout1D(0.4),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_out, dropout=0.05, recurrent_dropout=0.2)
    ),
    tf.keras.layers.Dense(2, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 35, 256)           1024000   
                                                                 
 spatial_dropout1d (Spatial  (None, 35, 256)           0         
 Dropout1D)                                                      
                                                                 
 bidirectional (Bidirection  (None, 392)               710304    
 al)                                                             
                                                                 
 dense (Dense)               (None, 2)                 786       
                                                                 
Total params: 1735090 (6.62 MB)
Trainable params: 1735090 (6.62 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [8]:
from sklearn.model_selection import train_test_split
import numpy as np
# One-hot encode the polarity labels: one indicator column per distinct value.
Y = pd.get_dummies(data=data['polarity'])
Y.head()

Unnamed: 0,0,1
0,True,False
1,True,False
2,False,True
3,False,True
4,True,False


In [9]:
# Human-readable class name for each polarity value. The keys must be the
# integer column labels of the one-hot frame Y (0 and 1), because they are
# looked up directly from Y.columns below.
result_dict = {0: 'Negative', 1: 'Positive'}

# Class names in the same order as Y's columns, i.e. the order of the model's
# output units: array(['Negative', 'Positive']).
y_arr = np.vectorize(result_dict.get)(Y.columns)


In [10]:
# Convert the one-hot label frame to a plain NumPy array for Keras/scikit-learn.
# np.asarray yields the same array as `.values` on the first run and is a no-op
# if Y is already an ndarray, so re-running this cell does not raise
# AttributeError the way `Y = Y.values` would.
Y = np.asarray(Y)
Y

array([[ True, False],
       [ True, False],
       [False,  True],
       ...,
       [ True, False],
       [False,  True],
       [False,  True]])

In [11]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

# Sanity-check that features and labels have matching row counts in each split.
print(f"{X_train.shape} {Y_train.shape}\n{X_test.shape} {Y_test.shape}")

(33500, 35) (33500, 2)
(16500, 35) (16500, 2)


In [12]:
# Train on the training split only; the held-out test split is not passed to
# fit, so no validation metrics are reported during training.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=128,
    epochs=20,
    verbose=2,
)

Epoch 1/20


KeyboardInterrupt: 