Here we do sentiment analysis after preprocessing the data with porter stemmer and then embedding data using TfidfVectorizer, and then model it using LinearSVC classifier from sklearn.

In [1]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
# remove if any user duplicates exist
train.drop_duplicates(subset=['User_ID'], keep=False).head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [5]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [6]:
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [8]:
# initialise the functions - we'll create separate models for each type.
stop = {}
tfidfvec = TfidfVectorizer(
    analyzer='word',
    ngram_range = (1, 3),
    min_df=200,
    max_features=20000,
    max_df=0.7,
    sublinear_tf=True,
    norm='l2',
)

In [9]:
# create features
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

In [None]:
# label encode categorical features in data given
# cols = ['Browser_Used','Device_Used']

# for x in cols:
#     lbl = LabelEncoder()
#     alldata[x] = lbl.fit_transform(alldata[x])

In [10]:
alldata['browser_device'] = alldata['Browser_Used'] + alldata['Device_Used']

In [11]:
all_browsers = pd.get_dummies(alldata['browser_device'])
print all_browsers.columns.values

['ChromeDesktop' 'ChromeMobile' 'ChromeTablet' 'EdgeDesktop' 'EdgeMobile'
 'EdgeTablet' 'FirefoxDesktop' 'FirefoxMobile' 'FirefoxTablet'
 'Google ChromeDesktop' 'Google ChromeMobile' 'Google ChromeTablet'
 'IEDesktop' 'IEMobile' 'IETablet' 'Internet ExplorerDesktop'
 'Internet ExplorerMobile' 'Internet ExplorerTablet'
 'InternetExplorerDesktop' 'InternetExplorerMobile'
 'InternetExplorerTablet' 'Mozilla FirefoxDesktop' 'Mozilla FirefoxMobile'
 'Mozilla FirefoxTablet' 'MozillaDesktop' 'MozillaMobile' 'MozillaTablet'
 'OperaDesktop' 'OperaMobile' 'OperaTablet' 'SafariDesktop' 'SafariMobile'
 'SafariTablet']


Lots of mobile and desktop counter parts of same browser... opportunity to do some feature engg. here.

Based on hunch that if people on new version of a browser are unhappy, their similar browser version counterparts might also be. Also some of these are plain duplicates. 

Doing this mapping improved the LB score, so this proved useful.

In [12]:
# manually plugging in values for similar browsers
browser_mapping = {
    'edge_desktop': ['EdgeDesktop'],
    'firefox_mobile': ['FirefoxMobile', 'Mozilla FirefoxMobile', 'MozillaMobile'],
    'opera_tablet': ['OperaTablet'],
    'chrome_mobile': ['ChromeMobile', 'Google ChromeMobile'],
    'explorer_tablet': [],
    'chrome_tablet': ['ChromeTablet', 'Google ChromeTablet'],
    'chrome_desktop': ['ChromeDesktop', 'Google ChromeDesktop'],
    'opera_mobile': ['OperaMobile'],
    'edge_mobile': ['EdgeMobile'],
    'safari_mobile': ['SafariMobile'],
    'safari_desktop': ['SafariDesktop'],
    'firefox_desktop': ['FirefoxDesktop', 'Mozilla FirefoxDesktop', 'MozillaDesktop'],
    'firefox_tablet': ['FirefoxTablet', 'Mozilla FirefoxTablet', 'MozillaTablet'],
    'opera_desktop': ['OperaDesktop'],
    'safari_tablet': ['SafariTablet'],
    'edge_tablet': ['EdgeTablet'],
    'explorer_tablet': ['IETablet', 'Internet ExplorerTablet', 'InternetExplorerTablet'],
    'explorer_desktop': ['IEDesktop', 'Internet ExplorerDesktop', 'InternetExplorerDesktop'],
    'explorer_mobile': ['IEMobile', 'Internet ExplorerMobile', 'InternetExplorerMobile']
 }

In [13]:
def user_agent_replace(user_agent):
    for name, values in browser_mapping.iteritems():
        for value in values:
            if value == user_agent:
                return name

In [14]:
alldata['browser_device'] = alldata['browser_device'].apply(user_agent_replace)
one_hot_encoded_browser = pd.get_dummies(alldata['browser_device'])
alldata = alldata.join(one_hot_encoded_browser)

In [15]:
# create dataframe for features
tfidf_df = pd.DataFrame(tfidfdata.todense())

# set column names
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

# create separate data frame for tf-idf
tfid_df_train = tfidf_df[:len(train)].applymap(str)
tfid_df_test = tfidf_df[len(train):].applymap(str)

# create dataframe for features
tfidf_df = pd.DataFrame(tfidfdata.todense())

# set column names
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

cleaned_cols = ['Description', 'Browser_Used', 'Device_Used', 'browser_device', 'User_ID']
alldata.drop(cleaned_cols, inplace=True, axis=1)

# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

In [21]:
### set target variable
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
# merge into a new data frame with tf-idf features
train_feats2 = train_feats.join(tfid_df_train)
test_feats2 = test_feats.join(tfid_df_test)

In [23]:
target = train_feats2['Is_Response']

In [52]:
def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

In [38]:
# forgot to remove is_response
train_feats2.drop(['Is_Response'], inplace=True, axis=1)
test_feats2.drop(['Is_Response'], inplace=True, axis=1)

In [39]:
train_feats2.head()

Unnamed: 0,chrome_desktop,chrome_mobile,chrome_tablet,edge_desktop,edge_mobile,edge_tablet,explorer_desktop,explorer_mobile,explorer_tablet,firefox_desktop,...,col4231,col4232,col4233,col4234,col4235,col4236,col4237,col4238,col4239,col4240
0,0,0,0,0,1,0,0,0,0,0,...,0.0,0.185342918018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
test_feats2.head()

Unnamed: 0,chrome_desktop,chrome_mobile,chrome_tablet,edge_desktop,edge_mobile,edge_tablet,explorer_desktop,explorer_mobile,explorer_tablet,firefox_desktop,...,col4231,col4232,col4233,col4234,col4235,col4236,col4237,col4238,col4239,col4240
38932,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38933,0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38934,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38935,0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38936,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Linear SVC

In [26]:
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
from sklearn.svm import LinearSVC

In [41]:
clf = LinearSVC(max_iter=20000, verbose=True)

In [42]:
# using support vector machine instead
clf.fit(train_feats2, target)

[LibLinear]

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=20000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=True)

In [43]:
sv_preds = clf.predict(test_feats2)

In [49]:
sub_svc = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response': sv_preds})
sub_svc['Is_Response'] = sub_svc['Is_Response'].map(lambda x: to_labels(x))

In [50]:
sub_svc = sub_svc[['User_ID', 'Is_Response']]

In [51]:
sub_svc.to_csv('submissions/out_of_names.csv', index=False)

This gave a public LB score of 0.89040