### Introduction

Here you'll learn to build models using Catboost, Lightgbm and NaiveBayes algorithm in Python. Given the text classification problem, you'll also learn to clean data, create bag of words matrix, tf-idf matrix. 

On top of what's done here, next you can create a simple voting ensemble from the predictions generated from these models here.

In [0]:
# Load Libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [0]:
# load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [0]:
train.head(8)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy
5,id10331,We had - rooms. One was very nice and clearly ...,InternetExplorer,Desktop,happy
6,id10332,My husband and I have stayed in this hotel a f...,Firefox,Tablet,not happy
7,id10333,My wife & I stayed in this glorious city a whi...,Google Chrome,Mobile,happy


In [0]:
# function to clean data
# import nltk

# nltk.download()
# stops = set(stopwords.words("english"))

stops = []

def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [0]:
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [0]:
# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=False, stemming=False))

In [0]:
# initialise the functions - we'll create separate models for each type.
countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)

In [0]:
# create features
bagofwords = countvec.fit_transform(alldata['Description'])
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

In [0]:
# label encode categorical features in data given
cols = ['Browser_Used','Device_Used']

for x in cols:
    lbl = LabelEncoder()
    alldata[x] = lbl.fit_transform(alldata[x])

In [0]:
# create dataframe for features
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidfdata.todense())

In [0]:
# set column names
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

In [0]:
# create separate data frame for bag of words and tf-idf

bow_df_train = bow_df[:len(train)]
bow_df_test = bow_df[len(train):]

tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]

In [0]:
# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

In [0]:
### set target variable

train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
# merge count (bag of word) features into train
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)

test_feats1.reset_index(drop=True, inplace=True)

In [0]:
# merge into a new data frame with tf-idf features
train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

### Kmeans ,SVM,Logistic Regression ,KNN etc

In [0]:
# let's check cross validation score of the model
# cv score acts a unbiased estimate of models accuracy on unseen data
# from sklearn import linear_model
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn import svm
# from sklearn.cluster import KMeans
# from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# from sklearn.cluster import MiniBatchKMeans

target = train_feats['Is_Response']
# l=  KNeighborsClassifier(n_neighbors=29,weights='distance')
# l = KMeans(n_clusters=2).fit(train_feats1,target)
# l = RandomForestClassifier( n_estimators=100,criterion='gini')
l=LogisticRegression(C=0.5, penalty='l2',solver='sag')
l.fit(train_feats2,target)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [0]:
## Naive Bayes 1
# print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))

In [0]:
# Naive Bayes 2 - tfidf is giving higher CV score
# print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))
# l.labels_

In [0]:
# make our first set of predictions
preds3=l.predict(test_feats2)
# clf1 = l
# clf1.fit(train_feats1, target)

# clf2 = l
# clf2.fit(train_feats2, target)
# l.cluster_centers_

In [0]:
# preds1 = clf1.predict(test_feats1)
# preds2 = clf2.predict(test_feats2)

In [0]:
def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

In [0]:
# sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})
# sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))
sub3 = pd.DataFrame({'User_ID':test.User_ID,'Is_Response':preds3})
sub3['Is_Response'] = sub3['Is_Response'].map(lambda x: to_labels(x))

In [0]:
# sub2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2})
# sub2['Is_Response'] = sub2['Is_Response'].map(lambda x: to_labels(x))

In [0]:


# sub1 = sub1[['User_ID', 'Is_Response']]
# sub2 = sub2[['User_ID', 'Is_Response']]
sub3 = sub3[['User_ID','Is_Response']]

In [0]:
## write submission files
# sub1.to_csv('sub1_cv.csv', index=False)
# sub2.to_csv('sub2_tf.csv', index=False)
sub3.to_csv('lg_8.csv',index=False)