# Tree-based methods

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
from collections import Counter
from pylab import *
import nltk
import warnings
warnings.filterwarnings('ignore')


# classification models 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

# Single WordNetLemmatizer instance, created once and reused when cleaning the review text.
lemmatizer = WordNetLemmatizer()

In [3]:
# Read the reviews file; `lines=True` parses it as one JSON object per line.
reviews_path = 'data/reviews_Patio_Lawn_and_Garden_5.json'
data_patio_lawn_garden = pd.read_json(reviews_path, lines=True)
print(data_patio_lawn_garden.shape)
data_patio_lawn_garden.head()

(13272, 9)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1JZFGZEZVWQPY,B00002N674,"Carter H ""1amazonreviewer@gmail . com""","[4, 4]",Good USA company that stands behind their prod...,4,Great Hoses,1308614400,"06 21, 2011"
1,A32JCI4AK2JTTG,B00002N674,"Darryl Bennett ""Fuzzy342""","[0, 0]",This is a high quality 8 ply hose. I have had ...,5,Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ...,1402272000,"06 9, 2014"
2,A3N0P5AAMP6XD2,B00002N674,H B,"[2, 3]",It's probably one of the best hoses I've ever ...,4,Very satisfied!,1336176000,"05 5, 2012"
3,A2QK7UNJ857YG,B00002N674,Jason,"[0, 0]",I probably should have bought something a bit ...,5,Very high quality,1373846400,"07 15, 2013"
4,AS0CYBAN6EM06,B00002N674,jimmy,"[1, 1]",I bought three of these 5/8-inch Flexogen hose...,5,Good Hoses,1375660800,"08 5, 2013"


In [4]:
# Number of reviews for each distinct value of the `overall` column.
rating_counts = data_patio_lawn_garden['overall'].value_counts()
rating_counts

5    7037
4    3384
3    1659
2     673
1     519
Name: overall, dtype: int64

In [6]:
def _clean_review(raw_text):
    """Return a normalised, space-joined version of one review.

    Steps, in order: cast to ``str``, replace every run of characters that are
    neither whitespace nor word characters (and every underscore) with a
    single space, tokenise, lower-case each token and lemmatise it.
    """
    without_punctuation = re.sub(r'([^\s\w]|_)+', ' ', str(raw_text))
    lemmas = [lemmatizer.lemmatize(token.lower())
              for token in word_tokenize(without_punctuation)]
    return ' '.join(lemmas)


data_patio_lawn_garden['cleaned_review_text'] = data_patio_lawn_garden['reviewText'].apply(_clean_review)

In [9]:
# Show the raw text, the cleaned text and the rating side by side.
preview_columns = ['reviewText','cleaned_review_text','overall']
data_patio_lawn_garden[preview_columns].head()

Unnamed: 0,reviewText,cleaned_review_text,overall
0,Good USA company that stands behind their prod...,good usa company that stand behind their produ...,4
1,This is a high quality 8 ply hose. I have had ...,this is a high quality 8 ply hose i have had g...,5
2,It's probably one of the best hoses I've ever ...,it s probably one of the best hose i ve ever h...,4
3,I probably should have bought something a bit ...,i probably should have bought something a bit ...,5
4,I bought three of these 5/8-inch Flexogen hose...,i bought three of these 5 8 inch flexogen hose...,5


In [8]:
# TF-IDF features over the cleaned reviews, capped at 500 vocabulary terms.
tfidf_model = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_model.fit_transform(data_patio_lawn_garden['cleaned_review_text'])
# Densify the sparse result and label the columns with the sorted vocabulary terms.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=sorted(tfidf_model.vocabulary_))
tfidf_df.head()

Unnamed: 0,10,20,34,8217,able,about,actually,add,after,again,...,work,worked,working,worth,would,yard,year,yet,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120568,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161561,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.116566,0.0,0.216988,0.0,0.049357
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.064347,0.0,0.0,0.070857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.083019,0.0,0.0,0.0,0.0


In [11]:
# Binarise the rating: an overall score <= 4 is treated as negative (0),
# an overall score > 4 is treated as positive (1).
is_low_score = data_patio_lawn_garden['overall'].le(4)
data_patio_lawn_garden['target'] = is_low_score.map({True: 0, False: 1})
data_patio_lawn_garden['target'].value_counts()

1    7037
0    6235
Name: target, dtype: int64

In [21]:
# generic functions


def show_performance(model,X_train,y_train,roc=False):
    """Fit ``model`` and print its scores on the data it was fitted on.

    Every number printed here is a *training* metric: the model is scored on
    the same samples passed to ``fit``, so the values say nothing about how
    well it generalises to unseen data.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it must also expose
        ``predict_proba(X)`` when ``roc`` is true.
    X_train : array-like
        Feature matrix used both for fitting and for scoring.
    y_train : array-like
        True labels aligned with ``X_train``.
    roc : bool, default False
        When true, additionally print the ROC AUC computed from the second
        column of ``predict_proba``.

    Returns
    -------
    None
        Results are printed, not returned. ``model`` is fitted in place.
    """
    model.fit(X_train,y_train)

    # Predict once and reuse the result for both metrics.
    train_predictions = model.predict(X_train)

    # scikit-learn metrics expect the ground truth first: (y_true, y_pred).
    print('Training accuracy - ',accuracy_score(y_train,train_predictions))
    print('Training F1 score -',f1_score(y_train,train_predictions))

    if roc:
        print('')
        print('Training ROC - ',roc_auc_score(y_train,model.predict_proba(X_train)[:,1]))
        

def run_classification_models(X_train,y_train,roc=True,random_state=None):
    """Fit four tree-based classifiers in turn and print each one's scores.

    The models are a decision tree, gradient boosting, a random forest and an
    XGBoost classifier, all with library-default hyper-parameters. Each one is
    handed to ``show_performance``, which fits it on ``X_train``/``y_train``
    and prints metrics computed on that same data.

    Parameters
    ----------
    X_train : array-like
        Feature matrix forwarded to ``show_performance``.
    y_train : array-like
        Labels forwarded to ``show_performance``.
    roc : bool, default True
        Forwarded to ``show_performance``; controls whether ROC AUC is printed.
    random_state : int or None, default None
        When given, it is passed as ``random_state`` to every classifier so
        repeated runs produce the same fitted models. When ``None`` the
        classifiers are constructed with no arguments, exactly as before.

    Returns
    -------
    None
    """
    # Only forward the seed when the caller supplied one, so the default call
    # constructs each estimator with its own library defaults.
    seed_kwargs = {} if random_state is None else {'random_state': random_state}

    named_models = [
        ('Decision Tree', DecisionTreeClassifier(**seed_kwargs)),
        ('Gradient Boosting', GradientBoostingClassifier(**seed_kwargs)),
        ('Random Forest', RandomForestClassifier(**seed_kwargs)),
        ('XGBclassifier', XGBClassifier(**seed_kwargs)),
    ]

    for model_name, model in named_models:
        print('-------------------------------------------')
        print('For ',model_name,' -')
        print('')
        show_performance(model,X_train,y_train,roc)
        print('-------------------------------------------')

In [24]:
# Scores printed below are computed on the same rows the models are fitted on.
run_classification_models(X_train=tfidf_df, y_train=data_patio_lawn_garden['target'])

-------------------------------------------
For  Decision Tree  -

Training accuracy -  0.9993218806509946
Training F1 score - 0.9993608408493716

Training ROC -  0.9999987578544339
-------------------------------------------
-------------------------------------------
For  Gradient Boosting  -

Training accuracy -  0.755801687763713
Training F1 score - 0.771840901091165

Training ROC -  0.8349545460191572
-------------------------------------------
-------------------------------------------
For  Random Forest  -

Training accuracy -  0.9993218806509946
Training F1 score - 0.9993609316196833

Training ROC -  0.9999977322296548
-------------------------------------------
-------------------------------------------
For  XGBclassifier  -

Training accuracy -  0.9486889692585895
Training F1 score - 0.9522540839935497

Training ROC -  0.989890564696468
-------------------------------------------
