This notebook records our code for the prediction model in goal 2. The procedure can be very time- and memory-consuming, so please run the code with caution.

### Load Packages

In [39]:
import pandas as pd
import json
import string
import math
import spacy
import numpy as np
from math import sqrt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora, models, similarities
from gensim.matutils import sparse2full
from keras.layers.core import Activation, Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

#### Read training data

In [2]:
# Load the training reviews. Each line of the file is parsed on its own with
# json.loads, and the resulting records are collected into a DataFrame.
# The encoding is passed explicitly so decoding does not depend on the platform's
# locale default, and the file is streamed line by line so the raw lines are not
# kept in memory alongside the parsed records.
# assumes the file is UTF-8 encoded (the JSON default) — confirm
with open('./Data_Module2/review_train.json', encoding='utf-8') as f:
    r_train = [json.loads(line) for line in f]

r_train = pd.DataFrame(r_train)
# Review text column; this is the input of the cleaning cell.
review = r_train.loc[:,'text']

#### Data processing

In [4]:
# English stop words, with 'no' and 'not' taken out of the set so that they
# survive the stop-word filter below.
stop_words = set(stopwords.words('english'))
for kept in ('no', 'not'):
    stop_words.remove(kept)

# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)
wnl = WordNetLemmatizer()
trysize = len(review)

# t collects one list of cleaned tokens per review, in the order of `review`.
t = []
for text in tqdm(review, total=trysize):
    # Tokenise, lower-case and lemmatise each token.
    words = (wnl.lemmatize(tok.lower()) for tok in word_tokenize(text))
    # Delete punctuation characters inside every token.
    words = (w.translate(table) for w in words)
    # Drop tokens that are not purely alphabetic (this also drops empty strings).
    words = (w for w in words if w.isalpha())
    # Map the contraction token "n't" to "not".
    # NOTE(review): string.punctuation contains the apostrophe, so by this point
    # such a token has already become "nt" and this comparison looks like it can
    # never match. Kept as is so the cleaned text is unchanged; if the order is
    # ever changed, the test-set cleaning must be changed in the same way.
    words = ('not' if w == 'n\'t' else w for w in words)
    # Finally filter out stop words.
    t.append([w for w in words if w not in stop_words])

# The raw text is not needed once the token lists are built.
del review

100%|██████████| 5364626/5364626 [3:17:22<00:00, 452.98it/s]    


In [5]:
# Turn every token list back into a single space-separated string.
review_train_clean = list(map(" ".join, t))
# The token lists are not needed once they have been joined.
del t

In [9]:
# Save the cleaned training reviews to disk, one review per line.
# UTF-8 is requested explicitly: the cleaning step keeps tokens that pass
# str.isalpha(), which also accepts non-ASCII letters, so a locale-default
# encoding may be unable to write some of them.
with open('review_train.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % item for item in review_train_clean)

*Convert the data into a suitable form, so that every observation has the same length.*

In [16]:
# Upper bound on the vocabulary size, passed to the tokenizer as num_words and
# used as the embedding input dimension when the model is built.
max_features = 30000
# Length every token-id sequence is brought to by pad_sequences.
max_length = 300
# Fit the word index on the cleaned training text, then encode that text as
# integer sequences of equal length.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(review_train_clean)
X = tokenizer.texts_to_sequences(review_train_clean)
X = pad_sequences(X, maxlen=max_length)
# to_categorical treats its input as 0-based class indices. The star ratings
# start at 1, so encoding them directly would give a six-column matrix whose
# column 0 is never set, which does not match the five-unit output layer.
# Shift the ratings down by one before encoding.
Y = to_categorical(r_train.loc[:,'stars'] - 1)

In [34]:
# One-hot encode the star ratings after shifting them down by one, so that the
# lowest rating maps to column 0, then show the first five encoded rows.
zero_based_stars = r_train['stars'] - 1
Y = to_categorical(zero_based_stars)
Y[:5]

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)

In [15]:
# Preview the first five rows of the training frame.
r_train.head(n=5)

Unnamed: 0,business_id,date,stars,text
0,31292,2013-05-07 04:34:36,1.0,Total bill for this horrible service? Over $8G...
1,35344,2017-01-14 21:30:33,5.0,I *adore* Travis at the Hard Rock's new Kelly ...
2,152538,2016-11-09 20:09:03,5.0,I have to say that this office really has it t...
3,71871,2018-01-09 20:56:38,5.0,Went in for a lunch. Steak sandwich was delici...
4,64913,2018-01-30 23:07:38,1.0,Today was my second out of three sessions I ha...


#### Construct model

In [53]:
# Network layout: embedding -> 1-D convolution -> max pooling -> LSTM ->
# dense layer with 5 units -> softmax.
model2 = Sequential()
model2.add(Embedding(max_features, 128, input_length=max_length))
model2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(LSTM(150, dropout=0.2, recurrent_dropout=0.2))
# Five output units, one per column of the one-hot label matrix Y.
model2.add(Dense(5))
model2.add(Activation("softmax"))
model2.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# The cells below call `model.fit` and `model.predict_classes`, but this cell
# only bound the network to `model2`, so a fresh top-to-bottom run failed with
# a NameError. Bind the same object to `model` as well; `model2` stays available.
model = model2
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 300, 128)          3840000   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 300, 64)           24640     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 150, 64)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 150)               129000    
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 755       
_________________________________________________________________
activation_4 (Activation)    (None, 5)                 0         
Total params: 3,994,395
Trainable params: 3,994,395
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Train for one epoch in mini-batches of 32, holding out 20% of the rows for
# validation.
# NOTE(review): relies on a `model` variable already existing in the kernel;
# verify that it refers to the network built in the construction cell.
model.fit(
    X,
    Y,
    batch_size=32,
    epochs=1,
    verbose=1,
    validation_split=0.20,
)

Train on 4291700 samples, validate on 1072926 samples
Epoch 1/1


<keras.callbacks.History at 0x1a8fc8d1d0>

#### Read test data, process it, and predict

In [40]:
# --- Load -------------------------------------------------------------------
# Each line of the test file is parsed on its own with json.loads. The encoding
# is passed explicitly so decoding does not depend on the locale default, and
# the file is streamed so the raw lines are not held next to the parsed records.
# assumes the file is UTF-8 encoded (the JSON default) — confirm
with open('./Data_Module2/review_test.json', encoding='utf-8') as f:
    r_test = [json.loads(line) for line in f]

r_test = pd.DataFrame(r_test)
review = r_test.loc[:,'text']
# Submission ids, written next to the predictions at the end of this cell.
ID = r_test.loc[:,'KaggleID']
del r_test

# --- Clean ------------------------------------------------------------------
# Same steps, in the same order, as the training-set cleaning cell, so that the
# fitted tokenizer receives test text in the form it was fitted on.
stop_words = set(stopwords.words('english'))
stop_words.remove('no')
stop_words.remove('not')
table = str.maketrans('', '', string.punctuation)
wnl = WordNetLemmatizer()
trysize = len(review)
t = []
for i in tqdm(range(trysize)):
    # Split into words
    x = word_tokenize(review[i])
    # Convert to lower case
    x = [w.lower() for w in x]
    # Lemmatization
    x = [wnl.lemmatize(w) for w in x]
    # Remove punctuation characters from every token
    x = [w.translate(table) for w in x]
    # Keep purely alphabetic tokens only
    x = [word for word in x if word.isalpha()]
    # Change n't into not
    # NOTE(review): string.punctuation contains the apostrophe, so by this point
    # such a token has already become "nt" and this comparison looks like it can
    # never match. Kept as is so test text is cleaned exactly like the training
    # text; change both cells together if the order is ever fixed.
    x = ['not' if w=='n\'t' else w for w in x ]
    # Remove stop words
    x = [w for w in x if not w in stop_words]
    t.append(x)

del review
review_test_clean = [" ".join(item) for item in t]
del t

# --- Encode and predict -----------------------------------------------------
max_features = 30000
max_length = 300
# Reuse the tokenizer fitted on the training text; it is not refitted here.
X_test = tokenizer.texts_to_sequences(review_test_clean)
X_test = pad_sequences(X_test,maxlen=max_length)
results = model.predict_classes(X_test)

# --- Export -----------------------------------------------------------------
# The labels were one-hot encoded from `stars - 1`, so the predicted class
# indices are 0-based. Add 1 to report star ratings; previously this cell wrote
# the raw 0-based indices to the submission file.
df = pd.DataFrame({'Expected':results + 1,'ID':ID})
df = df[['ID','Expected']]
df.to_csv('try2.csv',index=False)

100%|██████████| 1321274/1321274 [45:46<00:00, 481.04it/s]  


In [51]:
# Rebuild the submission with the predicted class indices shifted up by one,
# columns ordered ID then Expected, and overwrite try2.csv.
submission_columns = ['ID', 'Expected']
df = pd.DataFrame({'Expected': results + 1, 'ID': ID})[submission_columns]
df.to_csv('try2.csv', index=False)

*Adjust model parameters*

In [52]:
max_features = 30000
max_length = 300

# Same layer stack as the first network, but with a 64-dimensional embedding
# instead of a 128-dimensional one.
model = Sequential([
    Embedding(max_features, 64, input_length=max_length),
    Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(150, dropout=0.2, recurrent_dropout=0.2),
    Dense(5),
    Activation("softmax"),
])
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# One epoch over all of X and Y; no validation split is requested here.
model.fit(X, Y, batch_size=32, epochs=1, verbose=1)

# Predict class indices for the test set, shift them up by one and write the
# submission to try3.csv with columns ID, Expected.
results = model.predict_classes(X_test)
df = pd.DataFrame({'Expected': results + 1, 'ID': ID})[['ID', 'Expected']]
df.to_csv('try3.csv', index=False)

Epoch 1/1


In [61]:
# Store the encoded test inputs, training inputs and training labels as .npy
# files, in that order.
for npy_path, array in (
    ('X_test.npy', X_test),
    ('X_train.npy', X),
    ('Y_train.npy', Y),
):
    np.save(npy_path, array)