In [4]:
import numpy as np 
import pandas as pd 
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from random import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
# Load the labelled training phrases from the tab-separated file.
data = pd.read_csv('train.tsv', delimiter='\t')

In [6]:
# Preview the first 20 rows of the training frame.
data.head(20)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [7]:
# Shuffle the rows (original index labels are kept, only their order changes).
# A fixed seed makes the resulting row order reproducible across fresh runs.
data = data.sample(frac=1, random_state=42)

In [8]:
# Peek at the first rows after shuffling.
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
115596,115597,6160,mostly wordless ethnographic extras,1
45825,45826,2229,The film boasts at least a few good ideas and ...,3
111326,111327,5903,compelling than the circumstances of its making,2
11537,11538,497,About Schmidt,2
153562,153563,8386,the plot grinds itself out in increasingly inc...,1


In [9]:
# Number of phrases per sentiment label.
data['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [10]:
# One-hot encode the integer sentiment labels into 5 columns.
Sentiments = to_categorical(data['Sentiment'].values, num_classes=5)

In [11]:
# Spot-check the one-hot vector of a single row.
Sentiments[11]

array([1., 0., 0., 0., 0.], dtype=float32)

In [12]:
# Turn each phrase into a fixed-length sequence of integer word ids.
n_most_common_words = 20000  # vocabulary cap; also the Embedding input size
max_len = 49                 # sequence length shared by training and inference padding
# The backslash is escaped explicitly: a bare '\]' is an invalid escape sequence
# that newer Python versions warn about. The resulting filter characters are unchanged.
tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['Phrase'].values)
sequences = tokenizer.texts_to_sequences(data['Phrase'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# Pad to max_len explicitly so the training matrix always has the same width
# that the prediction cells use (they call pad_sequences(..., maxlen=max_len)).
X = pad_sequences(sequences, maxlen=max_len)
print(X)

Found 15288 unique tokens.


[[    0     0     0 ... 10117 10118 10119]
 [    0     0     0 ...    65   709   152]
 [    0     0     0 ...     3    14   228]
 ...
 [    0     0     0 ...     0  1040  1709]
 [    0     0     0 ...     0     0  5219]
 [    0     0     0 ...   801     1    15]]


In [100]:
# Sanity check: length of one padded row.
len(X[1290])

49

In [101]:
# Word ids of one phrase before padding.
sequences[9]

[206, 282]

In [13]:
# Hold out 25% of the padded sequences (and their one-hot labels) for testing;
# the fixed random_state keeps the split stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Sentiments,
    test_size=0.25,
    random_state=42,
)

In [14]:
# One-hot label of the first held-out example.
y_test[0]

array([0., 0., 1., 0., 0.], dtype=float32)

In [15]:
# Training hyper-parameters.
batch_size = 64  # samples per gradient update
emb_dim = 64     # width of the learned word embeddings
epochs = 2       # intended number of passes over the training data

In [16]:
# Embedding -> per-timestep Dense -> LSTM -> softmax over the 5 sentiment classes.
model = Sequential()
model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
# Optional regularisation against overfitting, currently disabled:
#model.add(SpatialDropout1D(0.2))
model.add(Dense(64, activation='relu'))    # applied to every timestep of the embedded sequence
model.add(LSTM(64))                        # collapses the sequence into a single 64-d vector
model.add(Dense(5, activation='softmax'))  # one probability per sentiment class
# Adam adapts the per-parameter step size during training; categorical
# cross-entropy matches the one-hot encoded labels.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 49, 64)            1280000   
_________________________________________________________________
dense_1 (Dense)              (None, 49, 64)            4160      
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 325       
Total params: 1,317,509
Trainable params: 1,317,509
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
# Train using the `epochs` / `batch_size` hyper-parameters defined earlier instead of
# a duplicated literal; 20% of the training data is held out for validation.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.


Train on 93636 samples, validate on 23409 samples
Epoch 1/2


   64/93636 [..............................] - ETA: 30:42 - loss: 1.6116 - acc: 0.0938

  128/93636 [..............................] - ETA: 17:50 - loss: 1.6089 - acc: 0.2578

  192/93636 [..............................] - ETA: 13:33 - loss: 1.6042 - acc: 0.3385

  256/93636 [..............................] - ETA: 11:46 - loss: 1.6001 - acc: 0.3828

  320/93636 [..............................] - ETA: 10:28 - loss: 1.5943 - acc: 0.4281

  384/93636 [..............................] - ETA: 9:34 - loss: 1.5916 - acc: 0.4219 

  448/93636 [..............................] - ETA: 8:53 - loss: 1.5878 - acc: 0.4241

  512/93636 [..............................] - ETA: 8:21 - loss: 1.5822 - acc: 0.4375

  576/93636 [..............................] - ETA: 7:58 - loss: 1.5773 - acc: 0.4358

  640/93636 [..............................] - ETA: 7:40 - loss: 1.5718 - acc: 0.4406

  704/93636 [..............................] - ETA: 7:34 - loss: 1.5644 - acc: 0.4531

  768/93636 [..............................] - ETA: 7:21 - loss: 1.5572 - acc: 0.4570

  832/93636 [..............................] - ETA: 7:10 - loss: 1.5484 - acc: 0.4615

  896/93636 [..............................] - ETA: 6:58 - loss: 1.5393 - acc: 0.4632

  960/93636 [..............................] - ETA: 6:49 - loss: 1.5259 - acc: 0.4677

 1024/93636 [..............................] - ETA: 6:43 - loss: 1.5141 - acc: 0.4707

 1088/93636 [..............................] - ETA: 6:36 - loss: 1.4978 - acc: 0.4761

 1152/93636 [..............................] - ETA: 6:31 - loss: 1.4856 - acc: 0.4800

 1216/93636 [..............................] - ETA: 6:30 - loss: 1.4723 - acc: 0.4844

 1280/93636 [..............................] - ETA: 6:26 - loss: 1.4645 - acc: 0.4859

 1344/93636 [..............................] - ETA: 6:21 - loss: 1.4588 - acc: 0.4859

 1408/93636 [..............................] - ETA: 6:16 - loss: 1.4432 - acc: 0.4901

 1472/93636 [..............................] - ETA: 6:13 - loss: 1.4345 - acc: 0.4918

 1536/93636 [..............................] - ETA: 6:09 - loss: 1.4211 - acc: 0.4967

 1600/93636 [..............................] - ETA: 6:10 - loss: 1.4223 - acc: 0.4938

 1664/93636 [..............................] - ETA: 6:07 - loss: 1.4128 - acc: 0.4958

 1728/93636 [..............................] - ETA: 6:04 - loss: 1.4071 - acc: 0.4965

 1792/93636 [..............................] - ETA: 6:02 - loss: 1.4020 - acc: 0.4967

 1856/93636 [..............................] - ETA: 6:03 - loss: 1.3997 - acc: 0.4952

 1920/93636 [..............................] - ETA: 6:00 - loss: 1.3955 - acc: 0.4958

 1984/93636 [..............................] - ETA: 5:57 - loss: 1.3926 - acc: 0.4955

 2048/93636 [..............................] - ETA: 5:55 - loss: 1.3895 - acc: 0.4961

 2112/93636 [..............................] - ETA: 5:53 - loss: 1.3807 - acc: 0.5005

 2176/93636 [..............................] - ETA: 5:55 - loss: 1.3792 - acc: 0.4995

 2240/93636 [..............................] - ETA: 5:52 - loss: 1.3735 - acc: 0.5004

 2304/93636 [..............................] - ETA: 5:50 - loss: 1.3643 - acc: 0.5061

 2368/93636 [..............................] - ETA: 5:49 - loss: 1.3623 - acc: 0.5059

 2432/93636 [..............................] - ETA: 5:47 - loss: 1.3550 - acc: 0.5090

 2496/93636 [..............................] - ETA: 5:45 - loss: 1.3551 - acc: 0.5068

 2560/93636 [..............................] - ETA: 5:43 - loss: 1.3531 - acc: 0.5055

 2624/93636 [..............................] - ETA: 5:41 - loss: 1.3492 - acc: 0.5072

 2688/93636 [..............................] - ETA: 5:39 - loss: 1.3492 - acc: 0.5067

 2752/93636 [..............................] - ETA: 5:38 - loss: 1.3497 - acc: 0.5055

 2816/93636 [..............................] - ETA: 5:36 - loss: 1.3489 - acc: 0.5046

 2880/93636 [..............................] - ETA: 5:33 - loss: 1.3440 - acc: 0.5066

 2944/93636 [..............................] - ETA: 5:31 - loss: 1.3412 - acc: 0.5065

 3008/93636 [..............................] - ETA: 5:30 - loss: 1.3381 - acc: 0.5070

 3072/93636 [..............................] - ETA: 5:31 - loss: 1.3365 - acc: 0.5065

 3136/93636 [>.............................] - ETA: 5:30 - loss: 1.3320 - acc: 0.5086

 3200/93636 [>.............................] - ETA: 5:29 - loss: 1.3297 - acc: 0.5088

 3264/93636 [>.............................] - ETA: 5:27 - loss: 1.3272 - acc: 0.5089

 3328/93636 [>.............................] - ETA: 5:26 - loss: 1.3272 - acc: 0.5075

 3392/93636 [>.............................] - ETA: 5:25 - loss: 1.3263 - acc: 0.5071

 3456/93636 [>.............................] - ETA: 5:24 - loss: 1.3237 - acc: 0.5075

 3520/93636 [>.............................] - ETA: 5:23 - loss: 1.3227 - acc: 0.5065

 3584/93636 [>.............................] - ETA: 5:22 - loss: 1.3214 - acc: 0.5059

 3648/93636 [>.............................] - ETA: 5:19 - loss: 1.3203 - acc: 0.5063

 3712/93636 [>.............................] - ETA: 5:19 - loss: 1.3194 - acc: 0.5067

 3776/93636 [>.............................] - ETA: 5:18 - loss: 1.3180 - acc: 0.5072

 3840/93636 [>.............................] - ETA: 5:17 - loss: 1.3140 - acc: 0.5089

 3904/93636 [>.............................] - ETA: 5:18 - loss: 1.3134 - acc: 0.5085

 3968/93636 [>.............................] - ETA: 5:15 - loss: 1.3123 - acc: 0.5078

 4032/93636 [>.............................] - ETA: 5:14 - loss: 1.3107 - acc: 0.5087

 4096/93636 [>.............................] - ETA: 5:13 - loss: 1.3085 - acc: 0.5098

 4160/93636 [>.............................] - ETA: 5:13 - loss: 1.3085 - acc: 0.5091

 4224/93636 [>.............................] - ETA: 5:14 - loss: 1.3104 - acc: 0.5083

 4288/93636 [>.............................] - ETA: 5:13 - loss: 1.3103 - acc: 0.5072

 4352/93636 [>.............................] - ETA: 5:12 - loss: 1.3097 - acc: 0.5078

 4416/93636 [>.............................] - ETA: 5:11 - loss: 1.3090 - acc: 0.5070



In [18]:
# Score the model on the held-out split; the result is indexed as
# [loss, accuracy] by the reporting cell below.
accr = model.evaluate(x=X_test, y=y_test)

   32/39015 [..............................] - ETA: 1:05

   96/39015 [..............................] - ETA: 59s 

  192/39015 [..............................] - ETA: 49s

  288/39015 [..............................] - ETA: 46s

  384/39015 [..............................] - ETA: 44s

  480/39015 [..............................] - ETA: 43s

  576/39015 [..............................] - ETA: 42s

  704/39015 [..............................] - ETA: 40s

  768/39015 [..............................] - ETA: 41s

  864/39015 [..............................] - ETA: 41s

  960/39015 [..............................] - ETA: 41s

 1088/39015 [..............................] - ETA: 39s

 1184/39015 [..............................] - ETA: 39s

 1280/39015 [..............................] - ETA: 39s

 1376/39015 [>.............................] - ETA: 39s

 1472/39015 [>.............................] - ETA: 39s



 1568/39015 [>.............................] - ETA: 39s

 1664/39015 [>.............................] - ETA: 39s

 1760/39015 [>.............................] - ETA: 38s

 1888/39015 [>.............................] - ETA: 38s

 1984/39015 [>.............................] - ETA: 38s

 2080/39015 [>.............................] - ETA: 38s

 2176/39015 [>.............................] - ETA: 38s

 2272/39015 [>.............................] - ETA: 37s

 2368/39015 [>.............................] - ETA: 37s

 2464/39015 [>.............................] - ETA: 37s

 2592/39015 [>.............................] - ETA: 37s

 2656/39015 [=>............................] - ETA: 37s

 2752/39015 [=>............................] - ETA: 37s

 2848/39015 [=>............................] - ETA: 37s

 2880/39015 [=>............................] - ETA: 38s

 2976/39015 [=>............................] - ETA: 38s

 3072/39015 [=>............................] - ETA: 37s

 3168/39015 [=>............................] - ETA: 37s

 3264/39015 [=>............................] - ETA: 37s

 3360/39015 [=>............................] - ETA: 37s

 3456/39015 [=>............................] - ETA: 37s

 3552/39015 [=>............................] - ETA: 37s

 3648/39015 [=>............................] - ETA: 37s

 3744/39015 [=>............................] - ETA: 37s

 3840/39015 [=>............................] - ETA: 37s

 3904/39015 [==>...........................] - ETA: 37s

 3968/39015 [==>...........................] - ETA: 37s

 4064/39015 [==>...........................] - ETA: 37s

 4128/39015 [==>...........................] - ETA: 37s

 4192/39015 [==>...........................] - ETA: 37s

 4288/39015 [==>...........................] - ETA: 37s

 4384/39015 [==>...........................] - ETA: 37s

 4480/39015 [==>...........................] - ETA: 37s

 4576/39015 [==>...........................] - ETA: 37s

 4640/39015 [==>...........................] - ETA: 37s

 4736/39015 [==>...........................] - ETA: 37s

 4832/39015 [==>...........................] - ETA: 37s

 4928/39015 [==>...........................] - ETA: 36s

 5024/39015 [==>...........................] - ETA: 36s

 5120/39015 [==>...........................] - ETA: 36s

 5216/39015 [===>..........................] - ETA: 36s

 5312/39015 [===>..........................] - ETA: 36s

 5408/39015 [===>..........................] - ETA: 36s

 5504/39015 [===>..........................] - ETA: 36s

 5600/39015 [===>..........................] - ETA: 36s

 5728/39015 [===>..........................] - ETA: 35s

 5824/39015 [===>..........................] - ETA: 35s

 5920/39015 [===>..........................] - ETA: 35s

 6016/39015 [===>..........................] - ETA: 35s

 6112/39015 [===>..........................] - ETA: 35s

 6240/39015 [===>..........................] - ETA: 34s

 6368/39015 [===>..........................] - ETA: 34s

 6464/39015 [===>..........................] - ETA: 34s

 6592/39015 [====>.........................] - ETA: 34s

 6720/39015 [====>.........................] - ETA: 33s

 6816/39015 [====>.........................] - ETA: 33s

 6944/39015 [====>.........................] - ETA: 33s

 7040/39015 [====>.........................] - ETA: 33s

 7136/39015 [====>.........................] - ETA: 33s

 7264/39015 [====>.........................] - ETA: 33s

 7360/39015 [====>.........................] - ETA: 32s

 7456/39015 [====>.........................] - ETA: 32s

 7584/39015 [====>.........................] - ETA: 32s

 7680/39015 [====>.........................] - ETA: 32s



 7808/39015 [=====>........................] - ETA: 32s

 7872/39015 [=====>........................] - ETA: 32s

 7968/39015 [=====>........................] - ETA: 32s

 8032/39015 [=====>........................] - ETA: 32s

 8096/39015 [=====>........................] - ETA: 32s

 8192/39015 [=====>........................] - ETA: 32s

 8320/39015 [=====>........................] - ETA: 31s

 8416/39015 [=====>........................] - ETA: 31s

 8512/39015 [=====>........................] - ETA: 31s

 8608/39015 [=====>........................] - ETA: 31s

 8704/39015 [=====>........................] - ETA: 31s

 8736/39015 [=====>........................] - ETA: 31s

 8800/39015 [=====>........................] - ETA: 31s

 8896/39015 [=====>........................] - ETA: 31s

 8992/39015 [=====>........................] - ETA: 31s

 9056/39015 [=====>........................] - ETA: 31s

















In [19]:
# Report the held-out loss and accuracy returned by model.evaluate.
test_loss, test_acc = accr[0], accr[1]
print('Test set\n  Loss: {:f}\n  Accuracy: {:f}'.format(test_loss, test_acc))

Test set
  Loss: 0.829854
  Accuracy: 0.662668


In [23]:
# Smoke-test the trained model on one hand-written phrase.
sentiment_list = list(range(5))
txt = ["this is very,amazing,excellent, good"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)
# Only one row is predicted, so the flat argmax is the class index.
best_class = np.argmax(pred)
print(best_class)
print(pred, sentiment_list[best_class])
print(pred)

4
[[6.4033520e-05 1.0741857e-04 2.3329575e-03 1.5843785e-01 8.3905774e-01]] 4
[[6.4033520e-05 1.0741857e-04 2.3329575e-03 1.5843785e-01 8.3905774e-01]]


In [24]:
# Load the unlabelled test phrases from the tab-separated file.
test_data = pd.read_csv('test.tsv', delimiter='\t')

In [25]:
# Preview the first rows of the test frame.
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [26]:
# Predict every test phrase in one batched call instead of invoking
# model.predict once per row, which is far slower and yields the same labels.
sentiment_list = [0,1,2,3,4]
seq = tokenizer.texts_to_sequences(test_data['Phrase'].values)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)  # one row of class probabilities per phrase
# Map each row's highest-probability column to its sentiment label.
sent_list = [sentiment_list[class_idx] for class_idx in np.argmax(pred, axis=1)]

In [50]:
# Start from an empty frame; its columns are assigned in the following cell.
final_dataframe = pd.DataFrame()

In [51]:

# Assemble ids, predicted labels and phrases side by side.
# The id column of test_data is spelled 'PhraseId' (lowercase 'd');
# looking up 'PhraseID' raises a KeyError.
final_dataframe['PhraseId'] = test_data['PhraseId']
final_dataframe['Sentiment'] = sent_list
final_dataframe['Phrase'] = test_data['Phrase']

In [52]:
# Export is currently commented out; uncomment to write final_dataframe to submission.csv.
#final_dataframe.to_csv('submission.csv')

In [53]:

# Reference snippet (not executed): predict all test phrases in one batched call.
# It was previously stored in a string literal, which the cell evaluated and echoed
# as output; plain comments keep the note without producing output.
#
# sent_list = []
# sentiment_list = [0,1,2,3,4]
#
# seq=tokenizer.texts_to_sequences(test_data["Phrase"].values)
# padded = pad_sequences(seq, maxlen=max_len)
# pred = model.predict(padded)
# for i in pred:
#     sent_list.append(sentiment_list[np.argmax(i)])




In [54]:
# Build the review frame in one constructor call: phrase id, predicted
# sentiment and the phrase text, in that column order.
f_df = pd.DataFrame({
    'PhraseId': test_data['PhraseId'],
    'sentiment': sent_list,
    'Phrase': test_data['Phrase'],
})

In [55]:
# Inspect the first 10 predictions next to their phrases.
f_df.head(10)

Unnamed: 0,PhraseId,sentiment,Phrase
0,156061,2,An intermittently pleasing but mostly routine ...
1,156062,2,An intermittently pleasing but mostly routine ...
2,156063,2,An
3,156064,2,intermittently pleasing but mostly routine effort
4,156065,2,intermittently pleasing but mostly routine
5,156066,2,intermittently pleasing but
6,156067,2,intermittently pleasing
7,156068,2,intermittently
8,156069,3,pleasing
9,156070,2,but


In [56]:
# Number of rows in the prediction frame.
f_df.shape[0]

66292

In [57]:
# Check the type of a single row of the prediction output.
type(pred[0])


numpy.ndarray