In [124]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /Users/vadym/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [125]:
# load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train_size = train.shape[0]
print train_size
train.head()

38932


Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [126]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [127]:
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [128]:
# clean description
# May take some time
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [129]:
# vectorize words
countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)
bagofwords = countvec.fit_transform(alldata['Description'])
bow_df = pd.DataFrame(bagofwords.todense())

In [130]:
# this representation must be familiar to you
bow_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,2,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,3,0,0,0,0,0


In [131]:
# Keras tokenizer
# Encode each review to a sequence of word indexes sorted by occurance frequency
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(alldata['Description']) # we kinda mess with our test set right here, so I'm not sure about it

In [136]:
# transform the inputs
X = tokenizer.texts_to_sequences(alldata['Description'])
y = np.array([1 if x == 'happy' else 0 for x in alldata['Is_Response']])

In [137]:
# the result of the tokenization
X[0]

[1, 332, 13, 319, 395, 414, 262, 3, 296, 7, 392, 3, 47, 65, 20, 42, 111, 198]

In [142]:
# neural nerworks are bad in variable length inputs, so we need to pad the sequences
from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, maxlen=100) # maximum count of words in a review

In [143]:
# now each review looks like below
X[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   1, 332,  13, 319, 395, 414, 262,   3, 296,
         7, 392,   3,  47,  65,  20,  42, 111, 198], dtype=int32)

In [144]:
X_train = X[:train_size]
y_train = y[:train_size]
X_test = X[train_size:] 
# there is no y_test, it's online

In [145]:
# split into train and validation set (test is remote)
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [146]:
# build the model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

# create the model
model = Sequential()
model.add(Embedding(500, 32, input_length=100))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 32)           16000     
_________________________________________________________________
flatten_4 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               800250    
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 251       
Total params: 816,501
Trainable params: 816,501
Non-trainable params: 0
_________________________________________________________________
None


In [147]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=1, batch_size=128)
# Final evaluation of the model
scores = model.evaluate(X_val, y_val, verbose=0)
print scores

Train on 26084 samples, validate on 12848 samples
Epoch 1/1
[0.34269295842531755, 0.85149439601494392]


In [151]:
# predict the labels for the train set
predicted = ['not_happy' if x < .5 else 'happy' for x in model.predict(X_test)]

In [182]:
# show some prediction results 
from IPython.display import display, Markdown
for i in range(0, 5):
    print test['Description'][i]
    print display(Markdown('**{}**'.format(predicted[i])))

Looking for a motel in close proximity to TV taping of a Dr. Phil show, we chose the Dunes on Sunset Blvd in West Hollywood. Although the property displayed the AAA emblem, it certainly left a lot to be desired. There were chips & scrapes on the bottom of the door frame in the bathroom and the lotion containers were half full--apparently not replaced by housekeeping. We needed an early wakeup call, but couldn't use the clock radio alarm as there wasn't a radio in the room. There was no TV channel listing on the remote, or on the TV menu making viewing a chore.The TV remote had to be returned when checking-out. This place served its purpose, but not a place to revisit.


**not_happy**

None
Walking distance to Madison Square Garden and the Penn Subway Station. Also very close to lots of retail stores and restaurants. The hotel offers free internet, a gym and a hot breakfast. Our room was on the top floor facing --th St. The room was clean and the staff very helpful and accommodating. The downside was that the A-C was making a noise and the faucet in the bathroom was loose - things that can be easily fixed. We walked from the hotel to Madison Square Gardens, to Times Square and on to Broadway. Everything considered, I will definitely recommend this hotel to my friends.


**happy**

None
Visited Seattle on business. Spent - nights in the Vintage Park. Great location. Room was nice. Bathroom was nice and clean. Thermostat didn't work-had to open the windows to cool the room off. Upon check-in, they asked if I wanted turn-down service. I said yes. Never happened. Asked front desk for a local Seattle restaurant on my first night. She suggested two cookie-cutter wine bars and a mall restaurant. Not a big deal, but as I became familiar with downtown Seattle, I couldn't believe she didn't recommend The Brooklyn or one of the many other restaurants with true local flavor. Joined the Kimpton loyalty club upon arriving. Was supposed to get the WSJ. Never even received USA Today. But I did get free internet, which saved me some money. No pool. I knew this before I made the reservation, but it was not immediately clear--had to do some searching.


**happy**

None
This hotel location is excellent and the rooms are very clean. Our suite was a one bedroom apartment with a full kitchen but a very small bathroom. The staff is friendly and helpful but the rooms do need to be revised. The linens and bedding was hard and outdated. The air conditioner did not work and the heater was on and could not be turned off or adjusted. Althought it was cold outside we felt we were in a sauna and had to open the windows for air. We were told by the staff that at a certain temperature outside the air would not turn on and the heater would go on automatically. This was not a good thing......we were sweating even with the windows open.......
I would however stay there again if they can guarantee this be corrected because of price, size and location.


**not_happy**

None
This hotel is awesome I love the service Anthony is really a great guy you see at the front desk! It is close to everything and is wonderful for kids I love it. The best hotel ever but wonderful cleanliness and quality great hotel for couples and singles.


**happy**

None


In [159]:
# function for downloading results
from IPython.display import HTML
import base64

def create_download_link(df, filename, cols):  
    csv = df.to_csv(index=False, header = True, columns=cols)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{filename}</a>'
    html = html.format(payload=payload,filename=filename)
    return HTML(html)

In [160]:
# save to file and create download link
submission = pd.DataFrame({'User_ID':test['User_ID'], 'Is_Response':predicted})
create_download_link(submission, "submission.csv", ['User_ID', 'Is_Response'])

## Ideas for improvement:
1. change the size of vocabulary
2. cut off the most common words
3. build another model for browser and device variables
4. use recurrent network for learning more complex features like sentence structure
5. Follow me on github :D https://github.com/bolein