## Load Libraries

### Load required libraries

In [1]:
# Load Libraries
import numpy as np
import pandas as pd
import collections
import nltk
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle
import os.path
from keras.models import Sequential,load_model
from keras.layers.core import Activation,Dense,Dropout,SpatialDropout1D
from keras.layers.wrappers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM,GRU
from keras import regularizers
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Both input files are read with the same "~" separator and latin-1 encoding.
CSV_OPTIONS={"sep":"~","encoding":"latin-1"}
train=pd.read_csv("train.csv",**CSV_OPTIONS)
test=pd.read_csv("test.csv",**CSV_OPTIONS)

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,11755,After reading mixed reviews I almost didn't bo...,Google Chrome,Desktop,Good
1,33912,This motor inn is located about - city blocks ...,Firefox,Tablet,Good
2,10143,It was our first time there and surely not our...,Google Chrome,Mobile,Good
3,33114,"Great hotel in an excellent location, just off...",Mozilla,Desktop,Good
4,17464,We stayed at the hotel for - weeks to get away...,Google Chrome,Desktop,Good


In [4]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,9602,A friend and I stayed in this hotel when we we...,Edge,Desktop
1,8749,I enjoy staying here when I have early flights...,Google Chrome,Mobile
2,15500,I stopped off in Seattle during a train tour o...,Chrome,Mobile
3,5495,I have stayed at this hotel - or - times now f...,Mozilla Firefox,Desktop
4,18570,Excellent location with hop on hop off city tr...,Edge,Mobile


## Prepare Data

### Prepare train data

In [5]:
# Accumulators filled by the tokenization loop further down:
# maxlen tracks the largest token count of any single description,
# word_freqs counts how often each token occurs.
maxlen=0
word_freqs=collections.Counter()

In [6]:
# Free-text column of the training frame; this is the model input.
train_description=train["Description"]
train_description.head()

0    After reading mixed reviews I almost didn't bo...
1    This motor inn is located about - city blocks ...
2    It was our first time there and surely not our...
3    Great hotel in an excellent location, just off...
4    We stayed at the hotel for - weeks to get away...
Name: Description, dtype: object

In [7]:
# Free-text column of the test frame.
test_description=test["Description"]
test_description.head()

0    A friend and I stayed in this hotel when we we...
1    I enjoy staying here when I have early flights...
2    I stopped off in Seattle during a train tour o...
3    I have stayed at this hotel - or - times now f...
4    Excellent location with hop on hop off city tr...
Name: Description, dtype: object

In [8]:
# Reset the accumulators inside this cell so that re-running it does not
# double-count token frequencies left over from a previous execution.
maxlen=0
word_freqs=collections.Counter()
for sentence in train_description:
    # Tokens are lower-cased before counting.
    words=nltk.word_tokenize(sentence.lower())
    maxlen=max(maxlen,len(words))
    word_freqs.update(words)

In [9]:
# Largest token count of a single description, and number of distinct tokens counted.
maxlen,len(word_freqs)

(3767, 56413)

In [10]:
# Length passed to pad_sequences when the token sequences are built.
MAX_SENTENCE_LENGTH=500
# Only this many of the most frequent tokens receive their own index.
MAX_FEATURES=30000
# Two extra slots are reserved for the "PAD" (0) and "UNK" (1) entries.
vocab_size=2+min(len(word_freqs),MAX_FEATURES)
# Most frequent token gets index 2, the next one 3, and so on.
word2index={
    token:rank
    for rank,(token,_) in enumerate(word_freqs.most_common(MAX_FEATURES),start=2)
}
word2index["PAD"]=0
word2index["UNK"]=1
# Inverse lookup from index back to token.
index2word=dict(zip(word2index.values(),word2index.keys()))

In [11]:
# Show the vocabulary size (kept tokens plus the two reserved entries).
vocab_size

30002

In [12]:
# Check that the "PAD" entry maps to index 0 in both lookup directions.
word2index["PAD"],index2word[0]

(0, 'PAD')

In [13]:
# Pickle cache files used by load_data() for the prepared arrays and labels.
X_train_filename="X_train.p"
X_test_filename="X_test.p"
y_train_filename="y_train.p"
# File the Keras model is saved to and loaded from.
model_filename="model.h5"

def normalize(train_description):
    """Turn raw text into padded sequences of vocabulary indices.

    Each sentence is lower-cased and tokenized with ``nltk.word_tokenize``.
    Every token is replaced by its ``word2index`` entry; tokens that are not
    in ``word2index`` are replaced by the ``"UNK"`` index.

    Args:
        train_description: iterable of strings (used for both the train and
            the test descriptions despite the parameter name).

    Returns:
        The result of ``sequence.pad_sequences`` called on the index
        sequences with ``maxlen=MAX_SENTENCE_LENGTH``.
    """
    unknown_index=word2index["UNK"]
    encoded=[
        [word2index.get(token,unknown_index) for token in nltk.word_tokenize(sentence.lower())]
        for sentence in train_description
    ]
    return sequence.pad_sequences(encoded,maxlen=MAX_SENTENCE_LENGTH)

def denormalize_response(predictions):
    """Map numeric scores to the text labels ``'Good'`` / ``'Bad'``.

    A score strictly greater than 0.5 becomes ``'Good'``; anything else
    becomes ``'Bad'``.

    Args:
        predictions: iterable of numeric scores.

    Returns:
        List of label strings, one per score, in input order.
    """
    labels=[]
    for score in predictions:
        labels.append('Good' if score > 0.5 else 'Bad')
    return labels

def normalize_response(predictions):
    """Encode text labels as integers: ``'Good'`` -> 1, anything else -> 0.

    Args:
        predictions: iterable of label values.

    Returns:
        List of 0/1 integers, one per label, in input order.
    """
    encoded=[]
    for label in predictions:
        if label == 'Good':
            encoded.append(1)
        else:
            encoded.append(0)
    return encoded

def load_data(force=False):
    """Return the prepared train/test inputs and the encoded training labels.

    The three objects are cached as pickle files. When all three cache files
    exist and ``force`` is false, they are read back from disk. Otherwise they
    are rebuilt from ``train_description``, ``test_description`` and
    ``train["Is_Response"]`` and the cache files are (re)written.

    Args:
        force: when true, ignore existing cache files and rebuild them.

    Returns:
        Tuple ``(X_train, X_test, y_train)``.
    """
    cache_files=(X_train_filename,X_test_filename,y_train_filename)
    if not force and all(os.path.exists(name) for name in cache_files):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        # NOTE(review): the cache is not invalidated when the vocabulary or
        # MAX_SENTENCE_LENGTH changes; call load_data(force=True) after such edits.
        loaded=[]
        for name in cache_files:
            # Context manager closes the handle even if unpickling fails.
            with open(name,"rb") as handle:
                loaded.append(pickle.load(handle))
        X_train,X_test,y_train=loaded
    else:
        X_train=normalize(train_description)
        X_test=normalize(test_description)
        y_train=normalize_response(train["Is_Response"])
        for name,payload in zip(cache_files,(X_train,X_test,y_train)):
            with open(name,"wb") as handle:
                pickle.dump(payload,handle)
    return X_train,X_test,y_train

# Load the prepared data from the pickle cache, or build it when the cache is missing.
X_train,X_test,y_train=load_data()
# Round-trip check: encode the labels to 0/1 and decode them back to text.
denormalize_response(normalize_response(train["Is_Response"]))[:10]

['Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Bad', 'Bad', 'Good', 'Good']

In [14]:
# Hold out 30% of the labelled training data. Note that Xtest/ytest come from
# X_train/y_train and are distinct from X_test (the unlabelled test file).
Xtrain,Xtest,ytrain,ytest=train_test_split(X_train,y_train,test_size=0.3,random_state=40)

## Train Model

### Train model on prepared data

In [15]:
# Hyperparameters used by load_train_model() and model.fit().
EMBEDDING_SIZE=128      # output dimension of the Embedding layer
HIDDEN_LAYER_SIZE=64    # units of the LSTM wrapped by Bidirectional
BATCH_SIZE=32           # batch_size passed to fit()
NUM_EPOCHS=2            # epochs passed to fit()
DROPOUT=0.1             # rate for SpatialDropout1D and both LSTM dropout arguments

def load_train_model(force=False):
    """Return a Keras model, reusing the saved one when it is available.

    When ``model_filename`` exists and ``force`` is false, the saved model is
    loaded and returned. Otherwise (file missing, or ``force`` true) the
    message "Force load model." is printed and a new model is built and
    compiled: Embedding -> SpatialDropout1D -> Bidirectional(LSTM) ->
    Dense(1) -> sigmoid, with binary cross-entropy loss and the rmsprop
    optimizer.

    Args:
        force: when true, build a new model even if a saved one exists.

    Returns:
        The loaded or newly compiled model.
    """
    # Guard clause: reuse the saved model unless a rebuild was requested.
    if not force and os.path.exists(model_filename):
        return load_model(model_filename)

    print("Force load model.")
    network=Sequential()
    network.add(Embedding(vocab_size,EMBEDDING_SIZE,input_length=MAX_SENTENCE_LENGTH))
    network.add(SpatialDropout1D(DROPOUT))
    recurrent_layer=LSTM(HIDDEN_LAYER_SIZE,dropout=DROPOUT,recurrent_dropout=DROPOUT)
    network.add(Bidirectional(recurrent_layer))
    network.add(Dense(1))
    network.add(Activation("sigmoid"))
    network.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=["accuracy"])
    return network

# Checkpoint callback writing to model_filename; with save_best_only=False and
# period=1 the monitored metric is not used to decide whether to save.
checkpoint=ModelCheckpoint(model_filename, monitor='val_acc', verbose=0, save_best_only=False, mode='auto', period=1)
# Loads the saved model when model_filename already exists, so fit() below then
# continues training that model instead of starting from a new one.
model=load_train_model()
# Xtest/ytest are the hold-out split of the training data and act as validation data.
history=model.fit(Xtrain,ytrain,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS,validation_data=(Xtest,ytest),callbacks=[checkpoint])

Train on 21120 samples, validate on 9052 samples
Epoch 1/2
Epoch 2/2


In [18]:
# Reload the model file written by the checkpoint callback.
model=load_model(model_filename)
# NOTE(review): recompiling is probably unnecessary for predict() — confirm before removing.
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=["accuracy"])
# Keep the raw scores under their own name instead of overwriting one variable
# with two different kinds of data; ravel() flattens the Dense(1) output so
# each score is compared against the threshold as a scalar.
predicted_scores=model.predict(X_test).ravel()
predictions=denormalize_response(predicted_scores)
predictions[:10]

['Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Bad', 'Good', 'Good', 'Good']

In [19]:
test_result_filename="Approach_1.csv"
# Attach the labels to the ids positionally. An index-aligned concat would
# silently misalign rows (and introduce NaN) if `test` ever had a non-default
# index; assign() with a list raises on a length mismatch instead.
test_result=test[['User_ID']].assign(Is_Response=predictions)
test_result.to_csv(test_result_filename,sep="~",index=False)
print("File Saved!")
test_result.head()

File Saved!


Unnamed: 0,User_ID,Is_Response
0,9602,Good
1,8749,Good
2,15500,Good
3,5495,Good
4,18570,Good
