Yet another script for some sentiment analysis hackathon! Here we use auto-sklearn 
(which only works on python3) to do model selection and parameter tuning for us.

In [2]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [3]:
# load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [5]:
# remove if any user duplicates exist
train.drop_duplicates(subset=['User_ID'], keep=False).head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [6]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [7]:
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [10]:
tfidfvec = TfidfVectorizer(
    analyzer='word',
    ngram_range = (1, 3),
    min_df=200,
    max_features=20000,
    max_df=0.7,
    sublinear_tf=True,
    norm='l2',
)

In [11]:
# create features
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

# label encode categorical features in data given
cols = ['Browser_Used','Device_Used']

for x in cols:
    lbl = LabelEncoder()
    alldata[x] = lbl.fit_transform(alldata[x])
    
# create dataframe for features
tfidf_df = pd.DataFrame(tfidfdata.todense())

# set column names
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

# create separate data frame for tf-idf
tfid_df_train = tfidf_df[:len(train)].applymap(str)
tfid_df_test = tfidf_df[len(train):].applymap(str)

# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

In [17]:
### set target variable
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
# merge into a new data frame with tf-idf features
train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

# let's check cross validation score of the model
# cv score acts a unbiased estimate of models accuracy on unseen data
target = train_feats['Is_Response']

In [20]:
def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

In [21]:
target = train_feats['Is_Response']

## Auto sklearn

In [22]:
import autosklearn.classification
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

In [27]:
params = {
    'time_left_for_this_task':  7200,
    'per_run_time_limit': 450,
    'ensemble_size': 100,
    'ensemble_nbest': 100,
    'ml_memory_limit': 8192,
    'resampling_strategy': 'cv',
    'resampling_strategy_arguments': {'folds': 3},
}

In [None]:
automl = autosklearn.classification.AutoSklearnClassifier(**params)
automl.fit(train_feats2, target)

In [None]:
auto_preds = automl.predict(test_feats2)

In [None]:
auto_sub = pd.DataFrame({'User_ID': test.User_ID, 'Is_Response': auto_preds})
auto_sub['Is_Response'] = auto_sub['Is_Response'].map(lambda x: to_labels(x))
auto_sub = auto_sub[['User_ID', 'Is_Response']]
auto_sub.to_csv('submissions/auto_sub2.csv', index=False)

As far as I can remember, these predictions weren't particulary good or bad. But definitely serve their purpose as a good staring point, and for ensembling with other models.