# This module does the sentiment analysis of the reviews data

In [118]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from dotenv import load_dotenv
import os
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config
import nltk
nltk.download('punkt_tab')
from joblib import Parallel, delayed
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import urllib
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pickle

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\apoor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apoor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\apoor\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [77]:
set_config(display='diagram')

### Loading the data

In [78]:
# Loading the environment varibles
try:
   load_dotenv("login.env", override=True)
   username = os.getenv("username")
   username = urllib.parse.quote_plus(username)
   password = os.getenv("password")
   password = urllib.parse.quote_plus(password)
   print("environment variable loaded successfully")
except Exception as e:
   print(f'environment variable not able to load: {e}')

connection_string = f'mongodb+srv://{username}:{password}@cluster0.elvspmq.mongodb.net/'

environment variable loaded successfully


In [79]:
# checking the connection with the databse
try:
   mongocli = MongoClient(connection_string)
   print("database connected successfully")
except Exception as e:
   print(f'Error occured while connection to database: {e}')

database connected successfully


In [80]:
#creating the database

cluster = mongocli["contentrecommendation"]
collection = cluster["reviews"]
data = collection.find()
reviews = list(data)
db = pd.DataFrame(reviews)

In [81]:
db = db.drop("_id", axis= 1)
db.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,1,True,"01 6, 2018",A2VOA9Z3QNDNRI,B000YFSR5G,"{'Size:': ' X-Large', 'Color:': ' Charcoal Hea...",Amazon Customer,These are cheaply made sweat shop pants not go...,These are cheaply made sweat shop pants not go...,1515196800,,
1,5,True,"01 5, 2018",A1GQPW286SLV69,B000YFSR5G,"{'Size:': ' Medium', 'Color:': ' Light Steel'}",Bhargav Kanakiya,Very nice pair! Helps my legs stay warm.,Five Stars,1515110400,,
2,3,True,"01 5, 2018",A343KWSY5I3ZCU,B000YFSR5G,"{'Size:': ' X-Large', 'Color:': ' Black'}",Roger M.,"Smaller size, needed a size larger. I miss not...",Three Stars,1515110400,,
3,5,True,"01 5, 2018",A1ARUODW18J2KV,B000YFSR5G,"{'Size:': ' Large', 'Color:': ' Black'}",Heni,"Arrived on time. Perfect but a little baggy, s...",Perfect but a little baggy,1515110400,,
4,5,True,"01 4, 2018",ARPP0CUQFJ6N6,B000YFSR5G,"{'Size:': ' Medium', 'Color:': ' Light Steel'}",ilande,Great considering the price of $7!,Buy a size smaller than you think you need.,1515024000,,


In [82]:
print(db[db["verified"] == True].shape[0])
print(db[db["verified"] == False].shape[0])     

325492
24508


In [83]:
print(db[db["verified"]==False].shape[0])

24508


In [84]:
print(db["vote"].isna().sum())
print(db["image"].isna().sum())

311263
337928


# Creating a data pipeline which cleans the data, perform sentiment analysis and generate user tags
### Perform Follwing operations:
1. Remove non verified reviews if they are more than 5% of the total reviews.
2. Remove columns verified, vote, image. 
3. split column style.
4. convert unixReviewTime into a feature which gives an idea of how old the review is.
5. Change overall to categories
6. Drop numerical columns (reuse previously created transformer)
7. tokenize review text column, clean text data, remove spaces
8. remove stopwords from review text column
9. sentiment analyse review text column store sentiment as a category
10. Based on the sentiment include labels from review text
11. make an array of labels for each user
12. rearrange columns

In [85]:
# Creating a transformer for managing verified and non verified reviews
class verified(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold = 0.05):
        self.threshold = threshold
    
    def fit(self, x, y = 0):
        if "verified" in x.columns:
            self.number_of_non_verified =  x[x["verified"]==False].shape[0] 
        else:
            self.number_of_non_verified = 0
        return self
    def transform(self, x):
        if self.number_of_non_verified > self.threshold * x.shape[0]:
            x = x.drop("verified", axis=1)
            return x
        else:
            return x

In [86]:
# Create a transformer for removing columns vote and image

class dropcols(BaseEstimator, TransformerMixin):
    def __init__(self,colms):
        self.colms = colms
    def fit(self, x, y=0):
        return self
    def transform(self, x):
        x = x.drop(self.colms, axis = 1)
        return x

In [87]:
# Create a transformer for splitting column style

class splitStyle(BaseEstimator,TransformerMixin):

    def fit(self, x, y=0):
        return self
        
    def transform(self,x):
        result_colms = pd.json_normalize(x["style"])
        x = pd.concat([x,result_colms], axis=1)
        x = x.drop("style",axis=1)
        return x

In [None]:
# Create a transformer to convert review time into categories

class reviewage(BaseEstimator,TransformerMixin):
    def __init__(self ):
        pass


    def fit(self,x,y=0):
        if not isinstance(x,pd.DataFrame):
            raise AttributeError("Input must be a pandas dataframe")
        if "reviewTime" not in x.columns:
            raise AttributeError("Column not found in dataframe")
        
        x["reviewTime"] = pd.to_datetime(x["reviewTime"],format='%m %d, %Y')
        
        self.maxi = x["reviewTime"].max()
        self.mini = x["reviewTime"].min()
        self.range = self.maxi - self.mini
        self.part1 = self.mini + 0.50 * self.range
        self.part2 = self.part1 + 0.30 * self.range
        return self
    
    def transform(self,x):

        x_transformed = x.copy()
        conditions = [(x["reviewTime"] > self.mini) & (x["reviewTime"]  <= self.part1),
        (x["reviewTime"]  > self.part1) & (x["reviewTime"]  <= self.part2),
        (x["reviewTime"] > self.part2) & (x["reviewTime"]  <= self.maxi)]
        choices = ["old","new","latest"]
        x_transformed["reviewTime"] = np.select(conditions,choices,default="latest")

        return x_transformed


In [89]:
# Transformer to convert overall to categories

class ratingscore(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self, x,y=0):
        return self
    def transform(self,x):

        x_transformed = x.copy()

        conditions = [(x["overall"]<3),
                      (x["overall"] == 3),
                      (x["overall"]>3)]
        items = ["poor","average","good"]
        x_transformed["ratingTag"] = np.select(conditions,items,default="poor")

        return x_transformed


In [90]:
# Transformer to tokenize the column values of "reviewText" column

class tokenize_text(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,x,y=0):
        return self
    def transform(self,x):
        res = Parallel(n_jobs=5)(delayed(tokenizer)(text) for text in x["reviewText"])
        res = pd.Series(res)
        x_transformed = pd.concat([x,pd.DataFrame(res,columns=["labels"])], axis=1)
        x_transformed = x_transformed.drop(["reviewText"], axis=1)
        return x_transformed

def tokenizer(token):
        # handling null values
        if pd.isna(token) or str(token) == "" or token is None:
             return []
        # tokenize_constructor = RegexpTokenizer(pattern='\s+',gaps=True) # commented it out as this was providing bad results although was time efficient
        tokens = word_tokenize(str(token))
        tokens = [x.lower() for x in tokens if x and x.strip()]
        tokens = [x for x in tokens if x.isalnum() or x in ["n't", "'s"] ]
        return tokens


In [91]:
# Transformer to remove stop words from labels column

class remove_stop_words(BaseEstimator, TransformerMixin):
    def __init__(self, n_jobs = 1):
        self.stop_words = set(stopwords.words('english'))
        self.n_jobs = n_jobs
        pass

    def fit(self,x,y=0):
        return self
        
    def transform(self,x):
        x_transformed = x.copy()
        if self.n_jobs>1:
            try:
                x_transformed["labels"] = Parallel(n_jobs=self.n_jobs)(delayed(remove_stop)(tokens, self.stop_words) for tokens in x_transformed["labels"])
            except Exception as e:
                print(f"Parallel processing failed {e} returning to single processing")
                x_transformed["labels"] = [remove_stop(tokens, self.stop_words) for tokens in x_transformed["labels"]]
        else:
            x_transformed["labels"] = [remove_stop(tokens, self.stop_words) for tokens in x_transformed["labels"]]
        
        return x_transformed
    
def remove_stop(tokens, stop_words):    
     if not isinstance(tokens,list):
         return []
     else:
         return [token for token in tokens if token not in stop_words]
     


In [92]:
# transformer to get the sentiment of the review

class sentiment_analyze(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,x,y=0):
        return self
    def transform(self,x):
        x_transformed = x.copy()
        sia = SentimentIntensityAnalyzer()
        scores = [sia.polarity_scores(" ".join(str(text) for text in labels))["compound"] for labels in x_transformed["labels"]]
        scores_df = pd.DataFrame(scores,columns=["scores"])
        conditions = [
            (scores_df["scores"] >= 0.05),
            (scores_df["scores"].between(0.05,-0.05)),
            (scores_df["scores"] <= -0.05),
        ]
        choises =  ["Good","Average","Bad"]
        scores_df["scores"] = np.select(conditions,choises,default="Average")
        x_transformed["sentiment"] = scores_df["scores"]
        
        return x_transformed

In [107]:
# transformer to accumulate all labels for a good and average sentiment

class get_all_labels(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=0):
        return self
    
    def combine_label_colms(self,data):
        if data["sentiment"] == "Good" or data["sentiment"] == "Average":
            return data["combinedLabel"]+data["labels"]
        else:
            return data["combinedLabel"]
        
    def transform(self,x):
        x_tranformed = x.copy()
        x_tranformed["combinedLabel"] = x_tranformed[["Size:","Color:","Metal Type:","Style:","Format:","Style Name:","Material:"]].apply(lambda x: ",".join(x.dropna()).split(","),axis=1)
        x_tranformed["tags"] = x_tranformed.apply(self.combine_label_colms,axis=1)
        x_tranformed.drop(["labels","combinedLabel"],axis=1)
        
        return x_tranformed


In [108]:
# transformer to rearrange columns

class rearrangeCols(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,x,y=0):
        return self
    def transform(self,x):
        
        x_transformed = x.copy()
        colms = ["reviewerID","reviewerName","asin"	,"reviewTime","ratingTag","sentiment","tags"]
        x_transformed = x_transformed.reindex(columns=colms)

        return x_transformed

In [109]:
pipe = Pipeline([
    ("manageverified",verified(threshold=0.05)),
    ("splitstyles",splitStyle()),
    ("reviewage",reviewage()),
    ("overallrating",ratingscore()),
    ("tokenizetext",tokenize_text()),
    ("removestopwords",remove_stop_words()),
    ("sentimentanalize", sentiment_analyze()),
    ("combinelabels",get_all_labels()),
    ("dropcolumns",dropcols(colms=["overall","Item Display Length:","Team Name:","Package Quantity:", "Length:","Size Name:","vote","image","unixReviewTime","summary","Size:","Color:","Metal Type:","Style:","Format:","Style Name:","Material:","labels","combinedLabel"])),
    ("rearrangecolmns", rearrangeCols())
    ])

In [110]:
result = pipe.fit_transform(db)
result.head()

Unnamed: 0,reviewerID,reviewerName,asin,reviewTime,ratingTag,sentiment,tags
0,A2VOA9Z3QNDNRI,Amazon Customer,B000YFSR5G,latest,poor,Good,"[ X-Large, Charcoal Heather, cheaply, made, s..."
1,A1GQPW286SLV69,Bhargav Kanakiya,B000YFSR5G,latest,good,Good,"[ Medium, Light Steel, nice, pair, helps, leg..."
2,A343KWSY5I3ZCU,Roger M.,B000YFSR5G,latest,average,Bad,"[ X-Large, Black]"
3,A1ARUODW18J2KV,Heni,B000YFSR5G,latest,good,Good,"[ Large, Black, arrived, time, perfect, littl..."
4,ARPP0CUQFJ6N6,ilande,B000YFSR5G,latest,good,Good,"[ Medium, Light Steel, great, considering, pr..."


In [117]:
os.makedirs("models/", exist_ok=True)
with open("models/sentiment_pipe.pkl", "wb") as f:
    pickle.dump(pipe,f)