In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import keras
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import keras.backend as K
from sklearn.externals import joblib
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# Input files are read relative to the notebook's working directory.
FEATURES_CSV = 'train_data.csv'
RESPONSE_CSV = 'train_reponse.csv'

feat = pd.read_csv(FEATURES_CSV)
targets = pd.read_csv(RESPONSE_CSV)

In [3]:
# Combine features and targets into one frame. With no `on=` given, pandas
# joins on every column name the two frames have in common.
df = pd.merge(feat, targets)

In [4]:
# Preview only the first ten rows instead of asking the notebook to render
# the entire merged frame.
df.head(10)

Unnamed: 0,review_id,text,stars
0,CuAG9zG2VDdd4hPGMip2Xg,"Deeeelicious!\n\nFirst, the upstairs is a pret...",4
1,oipDuz40GWRdhFI_ck0hmQ,Just had a baby!\n\n...about a month ago. Had ...,5
2,k7IdlhwtZ2evNJOkaKmPoQ,Since I was staying in the hotel we had a $25 ...,3
3,xg1qeM_nYE0r0PYBZQedzg,I can't believe that I haven't reviewed this s...,5
4,u3eX3oMz3hC2KfX3mdKBlA,Mmmm so much choice and it's all completely aw...,5
5,weJ-rLGzWKTIzhyDgomLSA,tried to go today at lunchtime: but didn't re...,3
6,6ZCeScwRyLiupX-7iOtX4w,"To me, this is the BEST Sushi place in town an...",5
7,ePrwqwZ0w8RmeMcxym3EpA,"i miss this place so much, it's insanity. \n\n...",5
8,5eGl3SQ_9F6pf7MAZ4JykQ,Love San Tan Village. It's conveniently locate...,4
9,ZBxeVSGhxgS00_8Kyi9u5g,no hanky panky here.. the idea of these places...,5


In [5]:
# Labels: the star rating of each review.
y = df['stars']
# Inputs: the raw review text, kept as a one-column DataFrame named 'text'.
X = df['text'].to_frame()

In [6]:
# TF-IDF vectorizer whose vocabulary is capped at 10000 features; this cap
# is what fixes the width of the matrix fed to the network further down.
vect = TfidfVectorizer(max_features=10000)

In [8]:
# Fit the vectorizer on the review text and replace X with the resulting
# sparse TF-IDF matrix. The text is read from `df` rather than from `X`
# itself so the cell can be re-run safely: once X has been overwritten with
# the sparse matrix, `X['text']` no longer exists.
X = vect.fit_transform(df['text'])

In [23]:
# Persist the fitted vectorizer so the same vocabulary/IDF weights can be
# reloaded later without refitting.
joblib.dump(value=vect, filename='vector.pkl')

['vector.pkl']

In [9]:
# One-hot encode the star ratings into columns named 'stars_<rating>'.
# The encoding is built from `df['stars']` (the same series `y` was taken
# from) and the prefix is applied by get_dummies itself, so re-running this
# cell always yields the same frame instead of re-encoding an already
# encoded `y` and stacking a second 'stars_' prefix onto the column names.
y = pd.get_dummies(df['stars'], prefix='stars')

In [10]:
# Sanity-check the one-hot layout and column names on the first five rows.
y.head(n=5)

Unnamed: 0,stars_1,stars_2,stars_3,stars_4,stars_5
0,0,0,0,1,0
1,0,0,0,0,1
2,0,0,1,0,0
3,0,0,0,0,1
4,0,0,0,0,1


In [11]:
# Hold out a test set using train_test_split's default proportions.
# A fixed random_state makes the split -- and therefore every metric
# reported below -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# Confirm the (rows, features) dimensions of the training matrix.
X_train.shape

(375000, 10000)

In [13]:
# Reset backend state left over from any previously built model.
K.clear_session()

# Empty sequential container; its layers are added in a following cell.
model = Sequential()

# Adam optimizer with the library's default hyperparameters.
opt = Adam()

# Stop training once the validation loss has gone two consecutive epochs
# without improving.
es = EarlyStopping(monitor='val_loss', patience=2)

In [15]:
# Width of the input layer, taken directly from the training matrix. This is
# the same number the vectorizer's vocabulary size would give, but avoids
# materialising the whole feature-name list just to count it and does not
# depend on a vectorizer accessor that newer scikit-learn releases removed.
n_features = X_train.shape[1]

# NOTE: this cell appends to the `model` created in the previous cell, so
# re-run that cell first before re-running this one; otherwise the layers
# below are stacked a second time on top of the existing ones.
model.add(Dense(units=64, activation='tanh', input_dim=n_features))
model.add(Dense(4, activation='relu'))
model.add(Dense(16, activation='relu'))
# Five softmax outputs, one per one-hot star column.
model.add(Dense(5, activation='softmax'))

# No extra metrics are requested, so evaluate() reports only the loss.
model.compile(opt, loss='categorical_crossentropy')

In [14]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                640064    
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 260       
_________________________________________________________________
dense_3 (Dense)              (None, 16)                80        
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 85        
Total params: 640,489
Trainable params: 640,489
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train for at most 10 epochs in mini-batches of 500, holding out 10% of
# the training rows for validation; the early-stopping callback may end
# training before the epoch limit is reached.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=500,
    epochs=10,
    validation_split=0.1,
    callbacks=[es],
)

Train on 337500 samples, validate on 37500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x25280c7d780>

In [15]:
# Score the held-out split. The model was compiled with a loss and no
# additional metrics, so the value shown is the categorical cross-entropy.
model.evaluate(x=X_test, y=y_test)



0.8167321469116211

In [16]:
# Per-class softmax outputs for every held-out review (one column per
# star rating).
predictions = model.predict(x=X_test)

In [17]:
# scikit-learn's metrics module supplies the classification report,
# confusion matrix and accuracy score used in the cells below.
# (The package name is `sklearn`; the previous spelling could not be imported.)
from sklearn import metrics

In [20]:
# NOTE(review): `target` is reassigned from `y_test` in the cell that
# computes `pred`, before anything reads the value set here, so this
# assignment looks like leftover code. It also spans the full dataset
# rather than the test split, so it would not line up with the predictions.
target = df['stars']

In [44]:
# Turn probabilities / one-hot rows back into star ratings. Column
# positions 0..4 correspond to stars 1..5, hence the +1 shift.
pred = np.argmax(predictions, axis=1) + 1
target = np.argmax(y_test.values, axis=1) + 1

In [46]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# plain string, so it is printed to keep its column alignment.
report = metrics.classification_report(y_true=target, y_pred=pred)
print(report)

              precision    recall  f1-score   support

           1       0.71      0.79      0.75     12553
           2       0.51      0.45      0.48     11263
           3       0.53      0.44      0.48     17722
           4       0.57      0.60      0.58     37163
           5       0.74      0.77      0.76     46299

   micro avg       0.64      0.64      0.64    125000
   macro avg       0.61      0.61      0.61    125000
weighted avg       0.64      0.64      0.64    125000



In [47]:
# Rows are the true star ratings, columns the predicted ones.
confusion = metrics.confusion_matrix(y_true=target, y_pred=pred)
print(confusion)

[[ 9885  1758   358   284   268]
 [ 2578  5114  2443   856   272]
 [  667  2440  7797  5892   926]
 [  312   521  3455 22147 10728]
 [  393   145   622  9644 35495]]


In [48]:
# Overall fraction of held-out reviews whose star rating was predicted exactly.
accuracy = metrics.accuracy_score(y_true=target, y_pred=pred)
print(accuracy)

0.643504


In [20]:
# Persist the trained network next to the saved vectorizer.
# NOTE(review): pickling a Keras model through joblib is generally
# discouraged in favour of the model's own save/load API -- confirm what
# the consumers of 'model.pkl' expect before changing the format.
joblib.dump(model, 'model.pkl')

['model.pkl']

In [21]:
# Wrap the raw probability array in a DataFrame whose columns carry the
# same names as the one-hot target columns.
star_columns = ['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']
predictions = pd.DataFrame(predictions, columns=star_columns)

In [22]:
# Inspect the predicted class probabilities for the first five test reviews.
predictions.head(n=5)

Unnamed: 0,stars_1,stars_2,stars_3,stars_4,stars_5
0,0.002619189,0.018879,0.17054,0.323833,0.484129
1,2.313532e-07,1.2e-05,0.000843,0.131217,0.867928
2,0.631966,0.284619,0.059927,0.010904,0.012584
3,5.68764e-06,5.8e-05,0.001648,0.144409,0.853878
4,1.446419e-05,0.000559,0.023755,0.357861,0.617811
