#                                                Review Sentiment Analysis

**Upload the training & test data**

As my system is not having the requied H/W configuration, will take the help of Google Colab(GPU & Python 3).

In [8]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

**Import the required libraries**

In [0]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

**Get the   training & test data**

In [4]:
data = pd.read_csv('train.csv', sep='~')
data_test = pd.read_csv('test.csv', sep='~')

NameError: ignored

In [85]:
data.head(3)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,11755,After reading mixed reviews I almost didn't bo...,Google Chrome,Desktop,Good
1,33912,This motor inn is located about - city blocks ...,Firefox,Tablet,Good
2,10143,It was our first time there and surely not our...,Google Chrome,Mobile,Good


In [87]:
print('Training Data Shape : {}, / Test Data Shape : {}'.format(data.shape,data_test.shape))

Training Data Shape : (30172, 5), / Test Data Shape : (8760, 4)


In [88]:
data.Browser_Used.value_counts()

Firefox              5754
Edge                 5530
Google Chrome        3597
InternetExplorer     3538
Mozilla Firefox      3340
Mozilla              2397
IE                   1920
Chrome               1881
Internet Explorer    1635
Safari                306
Opera                 274
Name: Browser_Used, dtype: int64

In [89]:
data.Device_Used.value_counts()

Mobile     11639
Desktop    11630
Tablet      6903
Name: Device_Used, dtype: int64

**Feature Engineering**

**Create feature for text message length**

In [90]:
data['Description_len'] = data['Description'].apply(lambda text: len(text) - text.count(' ') )
data.head(3)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Description_len
0,11755,After reading mixed reviews I almost didn't bo...,Google Chrome,Desktop,Good,1015
1,33912,This motor inn is located about - city blocks ...,Firefox,Tablet,Good,508
2,10143,It was our first time there and surely not our...,Google Chrome,Mobile,Good,412


**Create feature for % of text that is punctuation**

In [99]:
# A function to find the % of punctuation in a doc
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100
  
data['punct%'] = data['Description'].apply(lambda text: count_punct(text))
data.head(3)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,Description_len,punct%
0,11755,After reading mixed reviews I almost didn't bo...,Google Chrome,Desktop,Good,1015,4.1
1,33912,This motor inn is located about - city blocks ...,Firefox,Tablet,Good,508,3.3
2,10143,It was our first time there and surely not our...,Google Chrome,Mobile,Good,412,2.2


**Evaluate created features**

In [0]:
from matplotlib import pyplot
import numpy as np
%matplotlib inline

In [3]:
bins = np.linspace(0, 200, 40)

pyplot.hist(data[data['Is_Response']=='Good']['Description_len'], bins, alpha=0.5, normed=True, label='Good')
pyplot.hist(data[data['Is_Response']=='Bad']['Description_len'], bins, alpha=0.5, normed=True, label='Bad')
pyplot.legend(loc='upper left')
pyplot.show()

NameError: ignored

In [0]:
bins = np.linspace(0, 50, 40)

pyplot.hist(data[data['Is_Response']=='Good']['punct%'], bins, alpha=0.5, normed=True, label='Good')
pyplot.hist(data[data['Is_Response']=='Bad']['punct%'], bins, alpha=0.5, normed=True, label='Bad')
pyplot.legend(loc='upper right')
pyplot.show()

**Plot the two new features**

In [0]:
bins = np.linspace(0, 200, 40)

pyplot.hist(data['Description_len'], bins)
pyplot.title("Description Length Distribution")
pyplot.show()

In [0]:
bins = np.linspace(0, 50, 40)

pyplot.hist(data['punct%'], bins)
pyplot.title("Punctuation % Distribution")
pyplot.show()

**Transform the punctuation % feature**

**Box-Cox Transformation**

                Base Form : y^x

We are not going to include the extracted as they do not explain resonable variance towards the response variable.

**Get the engish stopwords**

In [3]:
stopwords = nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Data Cleaning **

- Remove the punctuation
- Remove the stopwords
- Remove anything other than english words
- Get the root words by stemming (PorterStemmer)
- We will use Stemming (PorterStemmer) over Lematizing(nltk.WordNetLemmatizer) as we gain speed though stemming does not consider the context.

In [0]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text
  

**Vectorize & find the list of features to include for model building**

##### Get the dummy variables for the Categorical varaibles & drop the first column to get rid of the dummy variable trap

In [0]:
X = data.drop(['Description','User_ID','Is_Response'], axis=1)
X_dummies = pd.get_dummies(X,columns=['Browser_Used','Device_Used'], drop_first=True)

In [0]:
X_test_dummies = data_test.drop(['Description','User_ID'], axis=1)
X_test_dummies = pd.get_dummies(X_test_dummies,columns=['Browser_Used','Device_Used'], drop_first=True)

**Vectorize using the TfidfVectorizer**

**TF-IDF**
Creates a document-term matrix where the columns represent single unique terms (unigrams) but the cell represents a weighting meant to represent how important a word is to a document.

In [0]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text,min_df=3, max_features=6000)

Fit the vectorizer to the 'Description' column

In [0]:
X_tfidf = tfidf_vect.fit(data['Description'])

Tranform training & test data sets

In [0]:
X_train_vectorized_features=X_tfidf.transform(data['Description'])
X_test_vectorized_features=X_tfidf.transform(data_test['Description'])

In [13]:
X_train_vectorized_features.shape

(30172, 6000)

In [14]:
X_test_vectorized_features.shape

(8760, 6000)

####Trainng Features

In [18]:
X_features= pd.concat([X_dummies,pd.DataFrame(X_train_vectorized_features.toarray())], axis=1)
X_features.shape

(30172, 6012)

In [0]:
y=data.iloc[:,-1]
y = pd.get_dummies(y,columns=['Is_Response'],drop_first=True)
y.head()
y.shape

##### Test features

In [19]:
X_tst_features= pd.concat([X_test_dummies,pd.DataFrame(X_test_vectorized_features.toarray())], axis=1)
X_tst_features.shape

(8760, 6012)

### Split the datatset to training& test sets

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3)

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((21120, 6012), (9052, 6012), (21120, 1), (9052, 1))

# Function to train multiple RandomForest classifier
   Manual Grid Search

In [0]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train.values)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), 
        round(recall, 3),
        round(accuracy_score(y_test,y_pred))))

In [83]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

  This is separate from the ipykernel package so we can avoid doing imports until


Est: 10 / Depth: 10 ---- Precision: 0.728 / Recall: 0.986 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 10 / Depth: 20 ---- Precision: 0.778 / Recall: 0.967 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 10 / Depth: 30 ---- Precision: 0.796 / Recall: 0.952 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 10 / Depth: None ---- Precision: 0.834 / Recall: 0.901 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 50 / Depth: 10 ---- Precision: 0.722 / Recall: 0.994 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 50 / Depth: 20 ---- Precision: 0.776 / Recall: 0.984 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 50 / Depth: 30 ---- Precision: 0.8 / Recall: 0.976 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 50 / Depth: None ---- Precision: 0.828 / Recall: 0.961 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 100 / Depth: 10 ---- Precision: 0.719 / Recall: 0.997 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 100 / Depth: 20 ---- Precision: 0.777 / Recall: 0.987 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 100 / Depth: 30 ---- Precision: 0.802 / Recall: 0.98 / Accuracy: 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


Est: 100 / Depth: None ---- Precision: 0.833 / Recall: 0.968 / Accuracy: 1.0


**Exploring parameter settings using GridSearchCV**

In [0]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [0]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, data['Is_Response'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

**Model Evaluation & Selection**

Find the set of parameters that gives the best performance & Create a model accordingly.

In [28]:
rf = RandomForestClassifier(n_estimators=50, max_depth=30, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, average='binary')
print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    50, 30, round(precision, 3), 
    round(recall, 3),
    round(accuracy_score(y_test,y_pred))))

  


Est: 50 / Depth: 30 ---- Precision: 0.802 / Recall: 0.974 / Accuracy: 1.0


**Make Predictions for the test data**

In [0]:
y_pred_submission = rf_model.predict(X_tst_features)

In [47]:
y_pred_submission[-5:]

array([1, 1, 1, 1, 1], dtype=uint8)

In [0]:
Prediction_Submit = pd.DataFrame(y_pred_submission)

In [65]:
type(pd.DataFrame(data_test.User_ID))

pandas.core.frame.DataFrame

In [67]:
final = pd.concat([pd.DataFrame(data_test.User_ID),Prediction_Submit], axis=1)
final.head()


Unnamed: 0,User_ID,0
0,9602,0
1,8749,1
2,15500,1
3,5495,1
4,18570,1


In [0]:
final.columns = ['User_ID','Is_Response']

Save the predictions to a csv file

In [0]:
final.to_csv('Submission.csv',sep='~', index = False)

Download the predictions to local

In [0]:
from google.colab import files

files.download('Submission.csv')

In [49]:
data_test['Description'][8757]

'Came to the Alexander Inn, via TripAdvisor. John could not have been more helpful on the phone. His charm, combined with the excellent location of the hotel, and the\nprevious reviews, sealed the deal for me. Our room was on the -th floor, with super \nviews, and a comfy, firm bed. The included breakfast was much appreciated. We could \nwalk both to the South St area, and convention center with ease.'