In [52]:
#Import all libraries

import pandas as pd
import numpy as np
import chardet
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller

from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold,StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve, classification_report, roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics.pairwise import pairwise_distances

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance

import joblib


## Loading of data

In [53]:
pd.set_option('display.max_colwidth', None)

In [54]:
ratings_orig = pd.read_csv('sample30.csv')
#data_desc = pd.read_csv('Data+Attribute+Description.csv')

## Data Cleaning

In [55]:
ratings = ratings_orig.copy()

In [56]:
ratings.shape

(30000, 15)

In [57]:
ratings.head(3)

Unnamed: 0,id,brand,categories,manufacturer,name,reviews_date,reviews_didPurchase,reviews_doRecommend,reviews_rating,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,reviews_username,user_sentiment
0,AV13O1A8GV-KLJ3akUyj,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Movie Bundles & Collections,CDs & Vinyl,Rap & Hip-Hop,Bass,Music on CD or Vinyl,Rap,Hip-Hop,Mainstream Rap,Pop Rap",Universal Music Group / Cash Money,Pink Friday: Roman Reloaded Re-Up (w/dvd),2012-11-30T06:21:45.000Z,,,5,i love this album. it's very good. more to the hip hop side than her current pop sound.. SO HYPE! i listen to this everyday at the gym! i give it 5star rating all the way. her metaphors are just crazy.,Just Awesome,Los Angeles,,joshua,Positive
1,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Cookies & Chips,Rice Cakes,Cakes",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09T00:00:00.000Z,True,,5,Good flavor. This review was collected as part of a promotion.,Good,,,dorothy w,Positive
2,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Cookies & Chips,Rice Cakes,Cakes",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09T00:00:00.000Z,True,,5,Good flavor.,Good,,,dorothy w,Positive


In [58]:
ratings.shape

(30000, 15)

In [59]:
ratings.isnull().sum()

id                          0
brand                       0
categories                  0
manufacturer              141
name                        0
reviews_date               46
reviews_didPurchase     14068
reviews_doRecommend      2570
reviews_rating              0
reviews_text                0
reviews_title             190
reviews_userCity        28071
reviews_userProvince    29830
reviews_username           63
user_sentiment              1
dtype: int64

reviews_userCity and reviews_userProvince have more than 93% null values. No amount of imputation would help here and we would not like to add any imaginary data. User city and province does not influence the sentiment classification nor the recommendation system so it is safe to remove them. 

In [60]:
ratings = ratings.drop(['reviews_userCity', 'reviews_userProvince'], axis=1)

Now, checking again for the null values - 

In [61]:
round(((ratings.isnull().sum()/ratings.shape[0])*100),2)

id                      0.00
brand                   0.00
categories              0.00
manufacturer            0.47
name                    0.00
reviews_date            0.15
reviews_didPurchase    46.89
reviews_doRecommend     8.57
reviews_rating          0.00
reviews_text            0.00
reviews_title           0.63
reviews_username        0.21
user_sentiment          0.00
dtype: float64

Removing the column for reviews_didPurchase as well. The values cannot be imputed as there are close to 50% values which are null. 

In [62]:
ratings = ratings.drop(['reviews_didPurchase'], axis=1)

Checking null values again - 

In [63]:
ratings.isnull().sum(axis=0)

id                        0
brand                     0
categories                0
manufacturer            141
name                      0
reviews_date             46
reviews_doRecommend    2570
reviews_rating            0
reviews_text              0
reviews_title           190
reviews_username         63
user_sentiment            1
dtype: int64

In [64]:
round(((ratings.isnull().sum()/ratings.shape[0])*100),2)

id                     0.00
brand                  0.00
categories             0.00
manufacturer           0.47
name                   0.00
reviews_date           0.15
reviews_doRecommend    8.57
reviews_rating         0.00
reviews_text           0.00
reviews_title          0.63
reviews_username       0.21
user_sentiment         0.00
dtype: float64

There is one row for user_sentiment, which has a null value. Let's check that row - 

In [65]:
ratings[ratings['user_sentiment'].isnull()]['reviews_text']

28354    my kids absolutely loved this film so much that we watched it twice. Having a digital copy means that every time we get in the car we get to watch it wherever we go. we even got to use our $5 reward coupon towards the purchase of this movie so we got an additional $5 off + we got to add the points towards our next rewards coupon we love the savings we get at Best Buy
Name: reviews_text, dtype: object

It looks like the user likes the product and has recommended the product. The review text also shows that the user loved this product and has given it the highest rating. We can make the sentiment for this user as positive. It is safe to classify the review/sentiment as a positive one.

In [66]:
ratings.loc[ratings['user_sentiment'].isnull(),'user_sentiment'] = 'Positive'

In [67]:
ratings.isnull().sum(axis=0)

id                        0
brand                     0
categories                0
manufacturer            141
name                      0
reviews_date             46
reviews_doRecommend    2570
reviews_rating            0
reviews_text              0
reviews_title           190
reviews_username         63
user_sentiment            0
dtype: int64

We can remove the column reviews_date as it is not important for the sentiment classfier or the recommendation system.

In [68]:
ratings = ratings.drop(['reviews_date'], axis=1)

In [69]:
ratings.isnull().sum(axis=0)

id                        0
brand                     0
categories                0
manufacturer            141
name                      0
reviews_doRecommend    2570
reviews_rating            0
reviews_text              0
reviews_title           190
reviews_username         63
user_sentiment            0
dtype: int64

Let's look at the null values for manufacturer and its brands - 

In [70]:
ratings[ratings['manufacturer'].isnull()]['brand'].value_counts()

Summit Entertainment    141
Name: brand, dtype: int64

So for all 141 null values for manufacturer column, the brand is the same - Summit Entertainment. Let's check what is the manufacturer of Summit entertainment brand if there are more rows for the same - 

In [71]:
ratings[ratings['brand'] == 'Summit Entertainment']['manufacturer'].value_counts()

Summit Entertainment    672
Name: manufacturer, dtype: int64

So for all other Summit Entertainment brands, the manufacturer seems to be Summit Entertainment itself. It is unlikely that the brand is the same and the manufacturer is different. It is possible to have different brands under the same manufacturer but not the same brand under different manufacturers. So it is safe to fill in all the manufacturer null values to Summit Entertainment. 

In [72]:
ratings.loc[ratings['manufacturer'].isnull(), 'manufacturer'] = 'Summit Entertainment'

In [73]:
ratings.isnull().sum()

id                        0
brand                     0
categories                0
manufacturer              0
name                      0
reviews_doRecommend    2570
reviews_rating            0
reviews_text              0
reviews_title           190
reviews_username         63
user_sentiment            0
dtype: int64

Username column obviously cannot be removed as we need to make recommendations for them. But if we do not know which user made the review, we cannot impute it with some random name. So it is better to delete the rows that do not have a username. 

In [74]:
ratings = ratings[~ratings['reviews_username'].isnull()]

In [75]:
ratings.isnull().sum()

id                        0
brand                     0
categories                0
manufacturer              0
name                      0
reviews_doRecommend    2541
reviews_rating            0
reviews_text              0
reviews_title           189
reviews_username          0
user_sentiment            0
dtype: int64

There are 189 rows that do not have a review title. We cannot impute or fill it with some text nor is it safe to remove the column entirely as it could be useful in sentiment prediction. So better to delete the rows that have null title. 

In [76]:
ratings = ratings[~ratings['reviews_title'].isnull()]

In [77]:
ratings.isnull().sum()

id                        0
brand                     0
categories                0
manufacturer              0
name                      0
reviews_doRecommend    2376
reviews_rating            0
reviews_text              0
reviews_title             0
reviews_username          0
user_sentiment            0
dtype: int64

In [78]:
ratings.shape

(29748, 11)

We can drop the column reviews_doRecommend since it gives similar information as reviews_text and reviews_rating columns. If the user likes the product and rates it high, he will recommend it and vice versa. So this can be dropped without much info loss.

In [79]:
ratings = ratings.drop(['reviews_doRecommend'], axis=1)

Let's check null values again - 

In [80]:
ratings.isnull().sum()

id                  0
brand               0
categories          0
manufacturer        0
name                0
reviews_rating      0
reviews_text        0
reviews_title       0
reviews_username    0
user_sentiment      0
dtype: int64

In [81]:
ratings.shape

(29748, 10)

Let's see if there are any duplicate rows - 

In [82]:
ratings[ratings.duplicated()].shape

(279, 10)

So there are 279 data items that have duplicate entries. These should be removed - 

In [83]:
ratings = ratings[~(ratings.duplicated())]

In [84]:
ratings.shape

(29469, 10)

In [85]:
ratings.isnull().sum()

id                  0
brand               0
categories          0
manufacturer        0
name                0
reviews_rating      0
reviews_text        0
reviews_title       0
reviews_username    0
user_sentiment      0
dtype: int64

So now we have a clean dataset with no null values and duplicate rows. 

In [86]:
percent_loss = ((ratings_orig.shape[0]-ratings.shape[0])/ratings_orig.shape[0])*100
percent_loss

1.77

There's very little 1.7% data loss after cleaning which is good enough.

Let's check if all the columns are of the right datatype before proceeding further in data analysis - 

In [87]:
ratings.dtypes

id                  object
brand               object
categories          object
manufacturer        object
name                object
reviews_rating       int64
reviews_text        object
reviews_title       object
reviews_username    object
user_sentiment      object
dtype: object

All except the ratings are strings/objects which is good. Also the rating is int type which is correct. 

Let's also make sure there are no spelling mistakes in the brands/manufacturers - 

In [88]:
ratings['brand'].unique()

array(['Universal Music', 'Lundberg', 'K-Y', 'J.R. Watkins', 'AMBI',
       "Johnson's", 'Olay', 'Windex', 'Heinz', 'KIND', 'Pantene',
       'Aussie', 'Disney', 'CeraVe', 'Solo Foods', 'Finish',
       'Jake And The Neverland Pirates', 'Pearhead', 'No Brand',
       'Neutrogena', 'Meguiars', 'Banana Boat', 'Cetaphil', 'Suave',
       'Bumble Bee', 'Citrus Magic', 'La Tortilla Factory',
       'Target.com Use Only', 'Sea Gull Lighting', 'The Seaweed Bath Co.',
       'Way Basics', 'Biokleen', 'Yes to Carrots', 'Alberto VO5',
       'Warner Bros.', 'FOX', 'Progresso', "Herr's", 'Hawaiian Punch',
       'Universal Home Video', 'Holmes', 'Sony Music', "Nature's Path",
       'Lite Source', 'SLOAN VALVE COMPANY', 'Nearly Natural',
       'Toy Story', 'MaraNatha', 'Chex', 'TRESemme', 'Wagan',
       'Creme Of Nature', 'Arrid', 'Eagle Brand', 'Pinaud',
       'Hortense B. Hewitt', 'Canada Dry', 'Baxter of California', 'Ragu',
       'Sabra', 'Pendaflex', "Newman's Own Organics",
       'The 

In [89]:
ratings['manufacturer'].unique()

array(['Universal Music Group / Cash Money', 'Lundberg', 'K-Y',
       'J.R. Watkins', 'FLEMING & CO', "Johnson's", 'P&G', 'Windex',
       'Heinz North America', 'Kind Fruit & Nut Bars', 'Pantene',
       'Aussie', 'Disney/Pixar', 'CeraVe', 'Solo', 'Reckitt Benckiser',
       'Disney', 'Pearhead', '120', 'Johnson & Johnson SLC', 'Parts',
       'Energizer Personal Care', 'Cetaphil', 'UNILEVER', 'Bumble Bee',
       'Citrus Magic', 'La Tortilla Fac', 'Xenon', 'Sea Gull Lighting',
       'The Seaweed Bath', 'Way Basics', 'Biokleen Cleaners',
       'Yes To Inc.', 'High Ridge Brands Co.', 'TIME WARNER',
       'MGM (Video & DVD)', 'Twentieth Century Fox',
       'GENERAL MILLS SALES, INC.', "Herr's", 'Dr Pepper/Seven Up, Inc',
       'Universal', 'UNIVERSAL HOME ENTERTAINMENT',
       'Jarden Home Environment', 'Columbia', 'Pantene Pro-V Hair Care',
       "Nature's Path Foods, Inc.", 'Lite-Source', 'William H Harvey',
       'Nearly Natural', 'Disguise', 'Maranatha Natural Foods',
     

In [90]:
ratings['name'].unique()

array(['Pink Friday: Roman Reloaded Re-Up (w/dvd)',
       'Lundberg Organic Cinnamon Toast Rice Cakes',
       'K-Y Love Sensuality Pleasure Gel',
       'J.R. Watkins Hand Cream, Lemon Cream',
       'Ambi Complexion Cleansing Bar',
       "Johnson's Baby Bubble Bath and Wash, 15oz",
       'Olay Regenerist Deep Hydration Regenerating Cream',
       'Windex Original Glass Cleaner Refill 67.6oz (2 Liter)',
       'Heinz Tomato Ketchup, 38oz',
       'Kind Dark Chocolate Chunk Gluten Free Granola Bars - 5 Count',
       'Pantene Color Preserve Volume Shampoo, 25.4oz',
       'Aussie Aussome Volume Shampoo, 13.5 Oz',
       "Cars Toon: Mater's Tall Tales", 'CeraVe SA Renewing Cream',
       'Solo Foods Almond Paste',
       'Finish Quantum Dishwasher Detergent, Lemon Sparkle Scent, 45 Count',
       'Disney174 Jake And The Neverland Pirates 4 Piece Bedding Set - Toddler',
       'Pearhead Id Bracelet Frame',
       'Craft Punch Giga Scallop Circle 45 24687534 To 334',
       'Neutrogena

There seems to be a product name - "L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz"] which is basically a L'oreal product but not named correctly. Let's correct all products with this faulty name - 

In [91]:
def extract(string):
    pattern = "^L'or[0-9]+al"
    if(re.search(pattern, string)):
        return string
    else:
        return False

In [92]:
ratings[ratings['name'] == ratings['name'].apply(lambda x:extract(x))]['name']

28814    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
28815    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
28816    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
28817    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
28818    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
                                              ...                                      
29995    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
29996    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
29997    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
29998    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
29999    L'or233al Paris Elvive Extraordinary Clay Rebalancing Conditioner - 12.6 Fl Oz
Name: name, Length: 1118, dtype:

So there are 1118 rows which are faulty. Let's correct them - 

In [93]:
ratings.loc[ratings['name'] == ratings['name'].apply(lambda x:extract(x)),'name'] = ratings.loc[ratings['name'] == ratings['name'].apply(lambda x:extract(x)),'name'].str.replace('^L\'or[0-9]+al', 'L\'oreal') 

  ratings.loc[ratings['name'] == ratings['name'].apply(lambda x:extract(x)),'name'] = ratings.loc[ratings['name'] == ratings['name'].apply(lambda x:extract(x)),'name'].str.replace('^L\'or[0-9]+al', 'L\'oreal')


Now let's make sure the changes have been made - 

In [94]:
ratings[ratings['name'] == ratings['name'].apply(lambda x:extract(x))]['name']

Series([], Name: name, dtype: object)

In [95]:
ratings = ratings.reset_index(drop=True)

In [96]:
ratings.index

RangeIndex(start=0, stop=29469, step=1)

In [97]:
ratings.head()

Unnamed: 0,id,brand,categories,manufacturer,name,reviews_rating,reviews_text,reviews_title,reviews_username,user_sentiment
0,AV13O1A8GV-KLJ3akUyj,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Movie Bundles & Collections,CDs & Vinyl,Rap & Hip-Hop,Bass,Music on CD or Vinyl,Rap,Hip-Hop,Mainstream Rap,Pop Rap",Universal Music Group / Cash Money,Pink Friday: Roman Reloaded Re-Up (w/dvd),5,i love this album. it's very good. more to the hip hop side than her current pop sound.. SO HYPE! i listen to this everyday at the gym! i give it 5star rating all the way. her metaphors are just crazy.,Just Awesome,joshua,Positive
1,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Cookies & Chips,Rice Cakes,Cakes",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,5,Good flavor. This review was collected as part of a promotion.,Good,dorothy w,Positive
2,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Cookies & Chips,Rice Cakes,Cakes",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,5,Good flavor.,Good,dorothy w,Positive
3,AV16khLE-jtxr-f38VFn,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Spermicide,Health,Sexual Wellness,Lubricants",K-Y,K-Y Love Sensuality Pleasure Gel,1,"I read through the reviews on here before looking in to buying one of the couples lubricants, and was ultimately disappointed that it didn't even live up to the reviews I had read. For starters, neither my boyfriend nor I could notice any sort of enhanced or 'captivating' sensation. What we did notice, however, was the messy consistency that was reminiscent of a more liquid-y vaseline. It was difficult to clean up, and was not a pleasant, especially since it lacked the 'captivating' sensation we had both been expecting. I'm disappointed that I paid as much as I did for a lube that I won't use again, when I could just use their normal personal lubricant for 1) less money and 2) less mess.",Disappointed,rebecca,Negative
4,AV16khLE-jtxr-f38VFn,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Spermicide,Health,Sexual Wellness,Lubricants",K-Y,K-Y Love Sensuality Pleasure Gel,1,My husband bought this gel for us. The gel caused irritation and it felt like it was burning my skin. I wouldn't recommend this gel.,Irritation,walker557,Negative


There is no faulty name anymore. 

Now that the data is clean, spell corrected let's do some EDA

## Exploratory Data Analysis

In [98]:
ratings.shape

(29469, 10)

In [99]:
ratings.columns

Index(['id', 'brand', 'categories', 'manufacturer', 'name', 'reviews_rating',
       'reviews_text', 'reviews_title', 'reviews_username', 'user_sentiment'],
      dtype='object')

In [100]:
def data_info(x):
    print(ratings[x].value_counts())
    ratings[x].value_counts().plot(kind='bar')        

In [None]:
for col in ratings.columns:
    print("Stats for column : "+col )
    data_info(col)
    print("\n")

Stats for column : id
AVpf3VOfilAPnD_xjpun    8396
AVpfPaoqLJeJML435Xk9    3324
AVpfJP1C1cnluZ0-e3Xy    2022
AVpfRTh1ilAPnD_xYic2    1140
AVpfW8y_LJeJML437ySW    1118
                        ... 
AVpfRxSkilAPnD_xYrzm       1
AVpfsQoeilAPnD_xgfx5       1
AVpf7HOwilAPnD_xkl3L       1
AVpfshNsLJeJML43CB8q       1
AVpfa1joLJeJML4385hb       1
Name: id, Length: 252, dtype: int64


Stats for column : brand
Clorox                   10419
Warner Home Video         3324
Disney                    1197
L'oreal Paris             1118
FOX                        887
                         ...  
Touch of Color               1
Scotty                       1
Newman's Own Organics        1
Wilton                       1
LDR                          1
Name: brand, Length: 199, dtype: int64


Stats for column : categories
Household Essentials,Cleaning Supplies,Kitchen Cleaners,Cleaning Wipes,All-Purpose Cleaners,Health & Household,Household Supplies,Household Cleaning,Ways To Shop,Classroom Essentials,F



Stats for column : reviews_title
Great Product                        369
Great movie                          361
Clorox Wipes                         269
Great                                216
Great product                        205
                                    ... 
My Messy Boys                          1
great comeback for Godzilla            1
Will use for the rest of my life!      1
olay products                          1
Fragrance Change                       1
Name: reviews_title, Length: 18490, dtype: int64


Stats for column : reviews_username
byamazon customer    41
mike                 41
chris                31
rick                 15
sandy                15
                     ..
kerstenjay            1
rosey07               1
maxload               1
flavafraz             1
badboy4life           1
Name: reviews_username, Length: 24788, dtype: int64


Stats for column : user_sentiment
Positive    26163
Negative     3306
Name: user_sentiment, dtype: int64




It can be seen from the above stats, that Clorox is the most reviewed (and probably most bought) brand with 10419 reviews. Since Colorox is more of a household essentials brand, that specific category is the topmost. Obviously even the top most reviewed manufacturer is also Clorox and the top most reviewed product is the Clorox Disinfecting wipes which belongs to the Clorox brand. 

Most of the ratings are 5 i.e out of 29469 records, 20495 records are rated 5 i.e ~70%. Most of the reviews are made by a user names - Mike. 

Coming to the most important aspect - the user sentiment. 26163 records have positive sentiment out of the 29469 total records which is almost 89%. So this is a skewed/imbalanced dataset. 

Let's look at the top most reviewed products, brands, manufacturers to understand how their rating is distributed - 

In [None]:
top_5_products = ratings['name'].value_counts().sort_values(ascending=False)[:5]
top_5_products

In [None]:
top_products_df = ratings[ratings['name'].isin(top_5_products.index.tolist())]
top_products_df.head(1)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='reviews_rating',y='name',data=top_products_df, palette='rainbow')
plt.title("Distribution of ratings in products")

So for the top 5 products, most of the ratings are between 4 and 5. i.e they are good products. 

Similarly let's get the distribution of ratings for top 5 most reviewed brands and manufacturers also - 

In [None]:
top_5_brands = ratings['name'].value_counts().sort_values(ascending=False)[:5]
top_5_brands

In [None]:
top_brands_df = ratings[ratings['name'].isin(top_5_products.index.tolist())]
top_brands_df.head(1)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='reviews_rating',y='name',data=top_brands_df, palette='rainbow')
plt.title("Distribution of ratings in products")

It is the same behaviour as the products. 

Let's look at the user sentiment with these products and brands -

In [None]:
top_products_df.groupby('name')['user_sentiment'].value_counts().sort_values(ascending=False)

So we can see that the Clorox disinfecting wipes which got the max reviews also has the max positive reviews meaning that's a good product. 

Let's look at the unique number of products and users - 

In [None]:
len(ratings['name'].unique())

There are 252 unique products

In [None]:
len(ratings['reviews_username'].unique())

There are 24788 unique users. So if go with user-user recommendation system, the correlation matrix is going to be huge. in this case item-item similarity is better since the number of products are fewer. 

Let's look at the user who gave the max reviews to the products - 

In [None]:
ratings['reviews_username'].value_counts().sort_values(ascending=False)

Now after some EDA let's look at building the recommendation system.

## Recommendation System (item-item)

Dividing the dataset into train and test - 

In [None]:
# Let's just make a copy of ratings df so that we dont make any unnecessary changes in the final cleaned dataframe
ratings_reco = ratings.copy()
ratings_reco = ratings_reco[['name', 'reviews_username', 'reviews_rating']]
ratings_reco.head()

In [None]:
ratings_reco.isnull().sum()

In [None]:
# Test and Train split of the dataset.
train, test = train_test_split(ratings_reco, test_size=0.30, random_state=31)

In [None]:
print(train.shape)
print(test.shape)

In [None]:
#joblib.dump(train, 'model/train')

In [None]:
df_pivot = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).T
df_pivot.head()

In [None]:
df_pivot.shape

In [None]:
# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(df_pivot.fillna(0), metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

In [None]:
item_correlation.shape

Filtering the correlation only for which the value is greater than 0. (Positively correlated)

In [None]:
item_correlation[item_correlation<0]=0
item_correlation

### Prediction item-item

In [None]:
item_predicted_ratings = np.dot((df_pivot.fillna(0).T),item_correlation)
item_predicted_ratings

In [None]:
item_predicted_ratings.shape

In [None]:
dummy_train = (df_pivot.fillna(0)==0).astype(int)
dummy_train = dummy_train.T
dummy_train.shape

In [None]:
dummy_train

In [None]:
df_pivot.T.loc['zubb'][df_pivot.T.loc['zubb']>1]

In [None]:
dummy_train.loc['zubb'].sum()

In [None]:
dummy_train.loc['zubb','Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)']

It's verified that the places where ratings are given are zeroed in dummy_train df.

In [None]:
dummy_train.shape

In [None]:
dummy_train.head()

#### Filtering the rating only for the movies not rated by the user for recommendation

In [None]:
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

In [None]:
item_final_rating.loc['zubb','Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)']

In [None]:
joblib.dump(item_final_rating, 'model/item_final_rating') 

### Finding the top 20 recommendation for the *user*

In [None]:
d = item_final_rating.loc['zubb'].sort_values(ascending=False)[0:20]
d

In [None]:
print(train.shape)
train.columns

In [None]:
# Evaluation - Item Item

#Evaluation will we same as you have seen above for the prediction. The only difference being, you will evaluate for the movie already rated by the user insead of predicting it for the movie not rated by the user. 

test.columns

common =  test[test.name.isin(train.name)]
print(common.shape)
common.head(4)

In [None]:
common_item_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

In [None]:
common_item_based_matrix.shape

In [None]:
item_correlation_df = pd.DataFrame(item_correlation)

In [None]:
item_correlation_df.head(1)

In [None]:
df_pivot.index

In [None]:
item_correlation_df['name'] = df_pivot.index
item_correlation_df.set_index('name',inplace=True)
item_correlation_df.head()

In [None]:
list_name = common.name.tolist()

In [None]:
item_correlation_df.columns = df_pivot.index.tolist()

item_correlation_df_1 =  item_correlation_df[item_correlation_df.index.isin(list_name)]

In [None]:
item_correlation_df_2 = item_correlation_df_1.T[item_correlation_df_1.T.index.isin(list_name)]

item_correlation_df_3 = item_correlation_df_2.T

In [None]:
item_correlation_df_3.head()

In [None]:
item_correlation_df_3.shape

In [None]:
item_correlation_df_3[item_correlation_df_3<0]=0

common_item_predicted_ratings = np.dot(item_correlation_df_3, common_item_based_matrix.fillna(0))
print(common_item_predicted_ratings.shape)
common_item_predicted_ratings

In [None]:
common_item_predicted_ratings.shape

In [None]:
dummy_test = common.copy()
dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T.fillna(0)

In [None]:
dummy_test.T.loc['beverly']

In [None]:
dummy_test[dummy_test>0] = 1

In [None]:
dummy_test.shape

In [None]:
common_item_predicted_ratings = np.multiply(common_item_predicted_ratings,dummy_test)

In [None]:
common_ = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

In [None]:
common_.shape

In [None]:
common_item_predicted_ratings.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = common_item_predicted_ratings.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

In [None]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [None]:
total_non_nan

In [None]:
rmse = (sum(sum((common_ - y )**2))/total_non_nan)**0.5
print(rmse)

## Sentiment classification

### Preprocessing and cleaning the text data

In [None]:
ratings[['reviews_rating','reviews_text', 'reviews_title']]

Changing the decontracted statements to their original base and also performing some basic preproecssing on the reviews text and then performing tokenization and lemmatization on the text - 

In [None]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [None]:
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
spell = Speller(lang='en')

def preprocess(document, stem=True):
    'changes document to lower case and removes stopwords'

    # Decontract the text i.e I'm to I am etc
    document = decontracted(document)
    
    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)
    
    pat = '[a-zA-Z]*.*[0-9]+.*[a-zA-Z]*'
    pat1 = '^[a-zA-Z]'
    
    # Removing some unnecessary elements like ',', '.' etc 
    words = [d for d in words if (not re.match(pat,d)) and (re.match(pat1,d))]
    
    words = [spell(word) for word in words]
    
    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    # Choose stemming or lemmatization
    
    if(stem):
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    
  
    # Removing some unnecessary elements like ',', '.' etc 
    words = [d for d in words if (not re.match(pat,d)) and (re.match(pat1,d))]

    
    # join words to make sentence
    document = " ".join(words)
    
    return document

In [None]:
ratings['reviews_text'] = ratings['reviews_text'].apply(lambda x:preprocess(x, False))

In [None]:
ratings['reviews_title'] = ratings['reviews_title'].apply(lambda x:preprocess(x, False))

In [None]:
ratings['reviews_text'][:5]

In [None]:
ratings['reviews_title'][:5]

In [None]:
ratings['rev_text'] = ratings['reviews_text'] + ratings['reviews_title']

### Now after preprocessing, we are using Count vectorizer to get the bag of words representation of all the text in the title and the reviews text

In [None]:
vectorizer = TfidfVectorizer()
reviews_tfidf = vectorizer.fit_transform(ratings['rev_text'])

In [None]:
#joblib.dump(reviews_tfidf, 'model/reviews_tfidf') 

In [None]:
reviews_tfidf.toarray()

In [None]:
print(reviews_tfidf.shape)
vectorizer.get_feature_names()

In [None]:
ratings['sentiment'] = ratings['user_sentiment'].apply(lambda x:1 if x=='Negative' else 0)

In [None]:
ratings['user_sentiment'].value_counts()

In [None]:
X = reviews_tfidf
Y = ratings['sentiment']

In [None]:
X.shape

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.2, random_state=42)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
y_train.value_counts()

In [None]:
lm = LogisticRegression(class_weight={0:0.2,1:0.8})
lm.fit(X_train, y_train) 

In [None]:
y_pred = lm.predict(X_test)

In [None]:
def print_metrics(y_test, y_pred):
    con_mat = confusion_matrix(y_test, y_pred)
    print(con_mat)

    accuracy = (con_mat[0][0] + con_mat[1][1])/(con_mat[0][0] + con_mat[1][1] +con_mat[0][1] + con_mat[1][0])
    print("accuracy :",accuracy)

    specificity = con_mat[0][0]/(con_mat[0][0]+con_mat[0][1])
    print("specificity: ", specificity)
    sensitivity = con_mat[1][1]/(con_mat[1][1]+con_mat[1][0])
    print("sensitivity : ", sensitivity)

In [None]:
print_metrics(y_test, y_pred)

Logistic classifier has an accuracy of around 90%. let's look at RandomClassifier - 

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [4,8,10],
    'min_samples_leaf': range(100, 400, 200),
    'min_samples_split': range(200, 500, 200),
    'n_estimators': [100,200, 300], 
    'max_features': [5, 10],
    'class_weight': [{0:0.1,1:0.9},{0:0.2,1:0.8}]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

In [None]:
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

In [None]:
rfc = RandomForestClassifier()

In [None]:
# fit
rfc.fit(X_train,y_train)

In [None]:
# predict
predictions = rfc.predict(X_test)

In [None]:
print_metrics(y_test, predictions)

XGBoost classifier - 

#### So we need to do some hyperparmeter tuning to improve sensitivity that is identifying the negative reviews correctlt

In [None]:
# hyperparameter tuning with XGBoost

# creating a KFold object 
folds = 5

# specify range of hyperparameters
param_grid = {'scale_pos_weight':[i for i in range(20)]}          


# specify model
xgb_model = XGBClassifier()

# set up GridSearchCV()
model_cv = GridSearchCV(estimator = xgb_model, 
                        param_grid = param_grid, 
                        scoring= 'roc_auc', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True)      



In [None]:
model_cv.fit(X_train, y_train)

In [None]:
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',model_cv.best_score_,'using',model_cv.best_params_)

In [None]:
# fit model on training data with default hyperparameters
model = XGBClassifier(scale_pos_weight=2)
model.fit(X_train, y_train)


# make predictions for test data
# use predict_proba since we need probabilities to compute auc
y_pred_proba = model.predict_proba(X_test)

from sklearn import metrics
# evaluate predictions
roc = metrics.roc_auc_score(y_test, y_pred_proba[:, 1])
print("AUC: %.2f%%" % (roc * 100.0))

y_pred = model.predict(X_test)
print(y_pred)

print_metrics(y_test, y_pred)

Let's save the XGBoost model (as that's our final chosen model)-

In [None]:
joblib.dump(model, 'model/xgboost_model.pkl') 

So there are three models which we have tried. Let's look at their accuracies and other metrics for comparison : 

1. Logistic regression : accuracy = 90%, specificity =93.4% , sensitivity=65.5%

2. Random Forest classifier : accuracy = 90.9%, specificity =99.8% , sensitivity=19.3% 

3. XGBoost Classifier : accuracy = 91.8%, specificity =94.9% , sensitivity=66.6% , AUC=92.7%

From the above numbers we can see that random forest classifier is overfitting the data and has a very low sensitivity while logistic regression and XGBoost classifier are doing quite well. XGBoost classifier has a higher accuracy, sensitivity and specificity as compared to the logistic regression classifier. Hence, we are going ahead with the XGBoost classifier for this particular usecase. 

Actual input dataset

In [None]:
ratings.shape

TFIDF vector of all input rows

In [None]:
reviews_tfidf

train input to the recommendation system - 

In [None]:
train.shape

In [None]:
train.head()

In [None]:
train.index

In [None]:
d = item_final_rating.loc['zubb'].sort_values(ascending=False)[0:20]
top_20_products = d.index.tolist()
top_20_products

In [None]:
reviews_dict = {}
for i in top_20_products:
    idx_prod= train[train['name'] == i].index.tolist()
    #idx_prod = sort(idx_prod).tolist()
    reviews_vector = [reviews_tfidf[i] for i in idx_prod] 
    if(i not in reviews_dict):
        reviews_dict[i] = reviews_vector
    else:
        reviews_dict[i].append(reviews_vector)

Now, we have a dictionary which is keyed with the top 20 reocmmended products to the user. Now we need to pass them through out model to get the sentiment predicted - 

In [None]:
from functools import reduce

sentiment_dict = {}

for key,value in reviews_dict.items():
    sentiment = [int(model.predict(i)) for i in value]
    pos_sent = 100-((reduce(lambda x,y:x+y,sentiment)/len(sentiment))*100)
    sentiment_dict[key] = pos_sent

In [None]:
top_5 = [key for key,value in sorted(sentiment_dict.items(), key = lambda x:x[1], reverse=True)[:5]]
top_5

In [None]:
users = list(train['reviews_username'].unique())
users

In [None]:
prods_list = train['name'].unique()
prods_list = list(prods_list)
prods_list

In [None]:
prod_idx_dict = {}
prod_tfidf = {}

for prod in prods_list:
    prod_idx_dict[prod] = list(sort(train[train['name']==prod].index.tolist()))
    

In [None]:
prod_idx_dict

In [None]:
for key,value in prod_idx_dict.items():
    prod_tfidf[key] = [reviews_tfidf[idx] for idx in value]
    

In [None]:
prod_tfidf

In [None]:
#joblib.dump(prod_tfidf, 'model/prod_tfidf')

In [None]:
top_20_products

In [None]:
sentiment_dict={}
for i in top_20_products:
    reviews_list = prod_tfidf[i]
    sentiment = [model.predict(rev) for rev in reviews_list]
    pos_sent = 100-((reduce(lambda x,y:x+y,sentiment)/len(sentiment))*100)
    sentiment_dict[i] = pos_sent

In [None]:
sentiment_dict

In [None]:
top_5 = [key for key,value in sorted(sentiment_dict.items(), key = lambda x:x[1], reverse=True)[:5]]

In [None]:
top_5
