In [1]:
# Load Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

# Reading Data & Pre-Analysis

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


In [5]:
train.describe()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
count,38932,38932,38932,38932,38932
unique,38932,38932,11,3,2
top,id34061,CHECK OUT THIS LINK- http:--www.youtube.com-wa...,Firefox,Desktop,happy
freq,1,1,7367,15026,26521


In [6]:
test.describe()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
count,29404,29404,29404,29404
unique,29404,29404,11,3
top,id100477,The best thing to say about the Sheraton is th...,Firefox,Desktop
freq,1,1,5676,11349


In [7]:
#Mapping {'happy':1,'not happy':0}

train['Is_Response']=train['Is_Response'].map({'happy':1,'not happy':0}).astype(int)

In [8]:
train[['Browser_Used' , 'Is_Response']].groupby(['Browser_Used'], as_index=False).mean().sort_values(by='Is_Response',ascending=False)

Unnamed: 0,Browser_Used,Is_Response
5,Internet Explorer,0.877318
3,Google Chrome,0.866066
6,InternetExplorer,0.864647
4,IE,0.860599
0,Chrome,0.851417
8,Mozilla Firefox,0.740065
10,Safari,0.728205
9,Opera,0.665746
7,Mozilla,0.595408
2,Firefox,0.500204


In [9]:
train[['Device_Used' , 'Is_Response']].groupby(['Device_Used'], as_index=False).mean().sort_values(by='Is_Response',ascending=False)

Unnamed: 0,Device_Used,Is_Response
1,Mobile,0.707933
0,Desktop,0.705111
2,Tablet,0.596193


# Data Cleaning

In [10]:
stops=pd.read_csv('stop-word-list.csv')
stops=set(stops)

In [11]:
# function to clean data

def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [12]:
## join train and test data

test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [13]:
alldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68336 entries, 0 to 68335
Data columns (total 5 columns):
User_ID         68336 non-null object
Description     68336 non-null object
Browser_Used    68336 non-null object
Device_Used     68336 non-null object
Is_Response     38932 non-null float64
dtypes: float64(1), object(4)
memory usage: 2.6+ MB


In [14]:
#cleaning data

alldata['Description']=alldata['Description'].map(lambda x:cleanData(x, lowercase=True, remove_stops=True, stemming=True))

#Function for Vectorization --- CountVectorizer & TfidfVectorizer

In [15]:
countvec=CountVectorizer(analyzer='word',ngram_range=(1,1),min_df=150,max_features=500)

tfidfvec=TfidfVectorizer(analyzer='word',ngram_range=(1,1),min_df=150,max_features=500)

In [16]:
#create features

bagofwords=countvec.fit_transform(alldata['Description'])

tfidfdata=tfidfvec.fit_transform(alldata['Description'])

In [17]:
bagofwords

<68336x500 sparse matrix of type '<class 'numpy.int64'>'
	with 4545044 stored elements in Compressed Sparse Row format>

In [18]:
tfidfdata

<68336x500 sparse matrix of type '<class 'numpy.float64'>'
	with 4545044 stored elements in Compressed Sparse Row format>

In [19]:
#label encoding categorical data

cols=['Browser_Used','Device_Used']

for x in cols:
    lbl=LabelEncoder()
    alldata[x]=lbl.fit_transform(alldata[x])

In [20]:
#checking label encoding

alldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68336 entries, 0 to 68335
Data columns (total 5 columns):
User_ID         68336 non-null object
Description     68336 non-null object
Browser_Used    68336 non-null int64
Device_Used     68336 non-null int64
Is_Response     38932 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.6+ MB


In [21]:
#creating dataframe for features

bow_df=pd.DataFrame(bagofwords.todense())
tfidf_df=pd.DataFrame(tfidfdata.todense())

In [24]:
print(bow_df.info())
print('-'*30)
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68336 entries, 0 to 68335
Columns: 500 entries, 0 to 499
dtypes: int64(500)
memory usage: 260.7 MB
None
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68336 entries, 0 to 68335
Columns: 500 entries, 0 to 499
dtypes: float64(500)
memory usage: 260.7 MB


In [26]:
# set column names
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

In [28]:
# train and test of bag of words & tf-idf

bow_df_train=bow_df[:len(train)]
bow_df_test=bow_df[len(train):]

tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]


In [29]:
#split merged alldata back to train and test

train_feats=alldata[~pd.isnull(alldata.Is_Response)]
test_feats=alldata[pd.isnull(alldata.Is_Response)]

train_feats.shape, test_feats.shape

((38932, 5), (29404, 5))

In [31]:
#merge (bag of word) features into train

#used cols=['Browser_Used','Device_Used'] as this only is required
#Description is being replaced by bag of words features dataframe

train_feats_bow=pd.concat([train_feats[cols],bow_df_train],axis=1)
test_feats_bow=pd.concat([test_feats[cols],bow_df_test],axis=1)

test_feats_bow.reset_index(drop=True,inplace=True)

train_feats_bow.shape , test_feats_bow.shape

((38932, 502), (29404, 502))

In [35]:
# merge into a new data frame with tf-idf features

#used cols=['Browser_Used','Device_Used'] as this only is required
#Description is being replaced by tf-idf features dataframe

train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

In [38]:
# for classification

target=train_feats['Is_Response']

# NaiveBayes

In [41]:
mod1=GaussianNB()

In [42]:
#Checking cv score for bow 

print(cross_val_score(mod1,train_feats_bow,target,cv=5,scoring=make_scorer(accuracy_score)))

[ 0.75577812  0.74877986  0.7489083   0.74903673  0.75443103]


In [43]:
#Checking cv score for tf-idf

print(cross_val_score(mod1,train_feats2,target,cv=5,scoring=make_scorer(accuracy_score)))

#And this one is better

[ 0.80071905  0.80503468  0.80169535  0.80452094  0.80105317]


In [44]:
#making separate classifiers for bow and tf-idf

clf1=GaussianNB()
clf1.fit(train_feats_bow,target)

clf2=GaussianNB()
clf2.fit(train_feats2,target)

GaussianNB(priors=None)

In [45]:
#creating prediction files

preds1=clf1.predict(test_feats_bow) #bow
preds2=clf2.predict(test_feats2) #tf-idf

In [50]:
#function for labelling back prediction file

def to_labels(x):
    if x==1:
        return "happy"
    else:
        return "not_happy"

In [51]:
#Submission file for bag of words 

sub1=pd.DataFrame({'User_ID':test.User_ID,'Is_Response':preds1})
sub1['Is_Response']=sub1['Is_Response'].map(lambda x:to_labels(x))

In [52]:
#Submission file for tf-idf

sub2=pd.DataFrame({'User_ID':test.User_ID,'Is_Response':preds2})
sub2['Is_Response']=sub2['Is_Response'].map(lambda x:to_labels(x))

In [57]:
#Arranging columns

sub1 = sub1[['User_ID', 'Is_Response']]
sub2 = sub2[['User_ID', 'Is_Response']]

In [59]:
#Writing to csv files

sub1.to_csv('sub1_cv_my.csv', index=False)
sub2.to_csv('sub2_tf_my.csv', index=False)