# This module does the sentiment analysis of the reviews data

In [70]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import urllib.parse
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

In [71]:
set_config(display='diagram')

### Loading the data

In [2]:
# Loading the environment varibles
try:
   load_dotenv("login.env", override=True)
   username = os.getenv("username")
   username = urllib.parse.quote_plus(username)
   password = os.getenv("password")
   password = urllib.parse.quote_plus(password)
   print("environment variable loaded successfully")
except Exception as e:
   print(f'environment variable not able to load: {e}')

connection_string = f'mongodb+srv://{username}:{password}@cluster0.elvspmq.mongodb.net/'

environment variable loaded successfully


In [3]:
# checking the connection with the databse
try:
   mongocli = MongoClient(connection_string)
   print("database connected successfully")
except Exception as e:
   print(f'Error occured while connection to database: {e}')

database connected successfully


In [4]:
#creating the database

cluster = mongocli["contentrecommendation"]
collection = cluster["reviews"]
data = collection.find()
reviews = list(data)
db = pd.DataFrame(reviews)

In [5]:
db = db.drop("_id", axis= 1)
db.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"01 4, 2018",A17PZ8JD2WJJN9,B000YFSR5G,"{'Size:': ' Large', 'Color:': ' Black'}",Andrus Motto,Light weight and comfortable. Thank you.,Five Stars,1515024000,,
1,4,True,"05 6, 2018",AJGQGM602L80G,B000YFSR4W,"{'Size:': ' X-Large', 'Color:': ' Charcoal Hea...",Anna M. Matherly,Like the pants. Were a little large but after...,Four Stars,1525564800,,
2,4,True,"12 28, 2017",A2EKF4NAKIMVDE,B000YFSR5G,"{'Size:': ' XX-Large', 'Color:': ' Charcoal He...",Farrukh K,It was larger than I had anticipated. I ordere...,Comfy and warm,1514419200,,
3,2,True,"12 25, 2017",A16QVBMQGQ9M2K,B000YFSR5G,"{'Size:': ' XX-Large', 'Color:': ' Black'}",Salvatore Petrone,Super gigantic sweatpants. I went with XXL bec...,Super gigantic sweatpants,1514160000,,
4,4,True,"05 3, 2018",A3IJ9KRTUTD3LP,B000YFSR4W,"{'Size:': ' XX-Large', 'Color:': ' Black'}",T. DAVIS,"comfortable, needs pockets.",Four Stars,1525305600,,


In [6]:
print(db[db["verified"] == True].shape[0])
print(db[db["verified"] == False].shape[0])     

325492
24508


In [7]:
print(db[db["verified"]==False].shape[0])

24508


In [8]:
print(db["vote"].isna().sum())
print(db["image"].isna().sum())

311263
337928


# Creating a data pipeline which takes in the data and automates the overall data cleaning process
### Follwing are the requiremnts:
1. Remove non verified reviews if they are more than 5% of the total reviews.
2. Remove columns verified, vote, image. 
3. split column style.
4. convert unixReviewTime into a feature which gives an idea of how old the review is.

In [80]:
# Creating a transformer for managing verified and non verified reviews
class verified(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold = 0.05):
        self.threshold = threshold
    
    def fit(self, x, y = 0):
        if "verified" in x.columns:
            self.number_of_non_verified =  x[x["verified"]==False].shape[0] 
        else:
            self.number_of_non_verified = 0
        return self
    def transform(self, x):
        if self.number_of_non_verified > self.threshold * x.shape[0]:
            x = x.drop("verified", axis=1)
            return x
        else:
            return x

In [81]:
# Create a transformer for removing columns vote and image

class dropcols(BaseEstimator, TransformerMixin):
    def __init__(self,colms):
        self.colms = colms
    def fit(self, x, y=0):
        return self
    def transform(self, x):
        x = x.drop(self.colms, axis = 1)
        return x

In [82]:
# Create a transformer for splitting column style

class splitStyle(BaseEstimator,TransformerMixin):

    def fit(self, x, y=0):
        return self
        
    def transform(self,x):
        result_colms = x["style"].apply(pd.Series)
        x = pd.concat([x,result_colms], axis=1)
        x = x.drop("style",axis=1)
        return x

In [87]:
# Create a transformer to convert review time into categories

class reviewage(BaseEstimator,TransformerMixin):
    def __init__(self ):
        pass


    def fit(self,x,y=0):
        if not isinstance(x,pd.DataFrame):
            raise AttributeError("Input must be a pandas dataframe")
        if "reviewTime" not in x.columns:
            raise AttributeError("Column not found in dataframe")
        
        self.maxi = x["reviewTime"].max()
        self.mini = x["reviewTime"].min()
        self.range = self.maxi - self.mini
        self.part1 = self.mini + 0.50 * range
        self.part2 = self.part1 + 0.30 * range
        return self
    
    def transform(self,x):

        x_transformed = x.copy()
        conditions = [(x["reviewTime"] > self.mini) & (x["reviewTime"]  <= self.part1),
        (x["reviewTime"]  > self.part1) & (x["reviewTime"]  <= self.part2),
        (x["reviewTime"] > self.part2) & (x["reviewTime"]  <= self.maxi)]
        choices = ["old","new","latest"]
        x_transformed["reviewTime"] = np.select(conditions,choices,default="latest")

        return x_transformed

        


In [88]:
pipe = Pipeline([
    ("manageverified",verified(threshold=0.05)),
    ("dropcolumns",dropcols(colms=["vote","image","unixReviewTime"])),
    ("splitstyles",splitStyle()),
    ("reviewage",reviewage())
    ])

In [89]:
result = pipe.fit_transform(db)
result.head()

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,Size:,Color:,0,Length:,Metal Type:,Size Name:,Style:,Format:,Package Quantity:,Style Name:,Material:,Team Name:,Item Display Length:
0,5,latest,A17PZ8JD2WJJN9,B000YFSR5G,Andrus Motto,Light weight and comfortable. Thank you.,Five Stars,Large,Black,,,,,,,,,,,
1,4,latest,AJGQGM602L80G,B000YFSR4W,Anna M. Matherly,Like the pants. Were a little large but after...,Four Stars,X-Large,Charcoal Heather,,,,,,,,,,,
2,4,latest,A2EKF4NAKIMVDE,B000YFSR5G,Farrukh K,It was larger than I had anticipated. I ordere...,Comfy and warm,XX-Large,Charcoal Heather,,,,,,,,,,,
3,2,latest,A16QVBMQGQ9M2K,B000YFSR5G,Salvatore Petrone,Super gigantic sweatpants. I went with XXL bec...,Super gigantic sweatpants,XX-Large,Black,,,,,,,,,,,
4,4,latest,A3IJ9KRTUTD3LP,B000YFSR4W,T. DAVIS,"comfortable, needs pockets.",Four Stars,XX-Large,Black,,,,,,,,,,,


In [92]:
print(result[result["reviewTime"] == "latest"].shape[0])
print(result[result["reviewTime"] == "new"].shape[0])
print(result[result["reviewTime"] == "old"].shape[0])

238606
106051
5343
