In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-reviews-data-set/amazon_review.csv


In [2]:
import pandas as pd
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

# Display configuration: show every column and row, use a wide console,
# never wrap frames across lines, and print floats with five decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 500
pd.options.display.expand_frame_repr = False
pd.options.display.float_format = lambda x: '%.5f' % x

In [3]:
# Read the review CSV from the Kaggle input directory and preview the first rows.
REVIEWS_CSV = "/kaggle/input/amazon-reviews-data-set/amazon_review.csv"
df = pd.read_csv(REVIEWS_CSV)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


In [4]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(4915, 12)

In [5]:
# Plain (unweighted) mean of the 'overall' column over all rows.
df['overall'].mean()

4.587589013224822

In [6]:
# Mean of the 'day_diff' column over all rows.
df["day_diff"].mean()

437.3670396744659

In [7]:
# Parse 'reviewTime' into real datetimes so date arithmetic can be done on it.
df["reviewTime"] = pd.to_datetime(df["reviewTime"])

# Reference date for the analysis: two days after the most recent review.
max_date = df["reviewTime"].max()
current_date = max_date + pd.Timedelta(days=2)

# Age of each review relative to the reference date, stored as whole days.
review_age = current_date - df["reviewTime"]
df["days"] = review_age.dt.days

# Confirm the dtype change and the presence of the new column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4915 entries, 0 to 4914
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   reviewerID      4915 non-null   object        
 1   asin            4915 non-null   object        
 2   reviewerName    4914 non-null   object        
 3   helpful         4915 non-null   object        
 4   reviewText      4914 non-null   object        
 5   overall         4915 non-null   float64       
 6   summary         4915 non-null   object        
 7   unixReviewTime  4915 non-null   int64         
 8   reviewTime      4915 non-null   datetime64[ns]
 9   day_diff        4915 non-null   int64         
 10  helpful_yes     4915 non-null   int64         
 11  total_vote      4915 non-null   int64         
 12  days            4915 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(6)
memory usage: 499.3+ KB


In [8]:
# Let's calculate the weighted average rating:
def time_based_weighted_average(dataframe, w1=28, w2=26, w3=24, w4=22):
    """Return the time-weighted mean of ``overall`` over four review-age windows.

    Rows are grouped by their ``days`` value into the windows ``<= 30``,
    ``31-90``, ``91-180`` and ``> 180``. Each window's mean ``overall`` is
    multiplied by its weight (``w1`` .. ``w4``, read as percentages, i.e.
    divided by 100) and the products are added up.

    A window with no usable rating (no rows, or only NaN values) is skipped
    instead of turning the whole result into NaN; the weights of the remaining
    windows are then rescaled so that together they carry the same total
    weight as ``w1 + w2 + w3 + w4``. When every window has data the result is
    exactly the plain weighted sum.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a numeric ``days`` column and an ``overall`` column.
    w1, w2, w3, w4 : int or float
        Percentage weights for the newest to the oldest window.

    Returns
    -------
    float
        The weighted rating, or NaN when no window contributes (all windows
        empty, or the contributing windows have a combined weight of 0).
    """
    days = dataframe["days"]
    windows = (
        (days <= 30, w1),
        ((days > 30) & (days <= 90), w2),
        ((days > 90) & (days <= 180), w3),
        (days > 180, w4),
    )

    weighted_sum = 0.0
    used_weight = 0
    skipped_any = False
    for in_window, weight in windows:
        ratings = dataframe.loc[in_window, "overall"]
        # count() ignores NaN, so this also catches windows holding only NaN ratings.
        if ratings.count() == 0:
            skipped_any = True
            continue
        # Same operation order as a left-to-right weighted sum, so results are
        # unchanged whenever all four windows are populated.
        weighted_sum = weighted_sum + ratings.mean() * weight / 100
        used_weight += weight

    if not skipped_any:
        return weighted_sum
    if used_weight == 0:
        return float("nan")
    # Redistribute the weight of the skipped windows over the populated ones.
    return weighted_sum * (w1 + w2 + w3 + w4) / used_weight

# Evaluate on the full dataset with the default weights (28 / 26 / 24 / 22).
time_based_weighted_average(df)

4.6987161061560725

In [9]:
# We will select the 20 reviews to be displayed on the product detail page.

# Let's first create the helpful_no variable (total votes minus helpful votes):
df['helpful_no'] = df['total_vote'] - df['helpful_yes']

# Let's calculate the score_pos_neg_diff (helpful votes minus unhelpful votes):
df['score_pos_neg_diff'] = df['helpful_yes'] - df['helpful_no']

# Let's calculate the score_average_rating (helpful votes divided by total votes).
# Done column-wise instead of with a row-by-row apply: rows whose total_vote is
# not positive (where the ratio is undefined) are set to 0.
df['score_average_rating'] = (df['helpful_yes'] / df['total_vote']).where(df['total_vote'] > 0, 0)

# Let's calculate the wilson_lower_bound:
from scipy.stats import norm
def wilson_lower_bound(helpful_yes, total_vote, confidence=0.95):
    """Lower bound of the Wilson score interval for a helpful-vote proportion.

    Parameters
    ----------
    helpful_yes : number
        Count of helpful votes.
    total_vote : number
        Count of all votes.
    confidence : float, default 0.95
        Two-sided confidence level used to pick the normal quantile.

    Returns
    -------
    number
        0 when ``total_vote`` is 0, otherwise the interval's lower bound.
    """
    # Without any votes there is no proportion to estimate.
    if total_vote == 0:
        return 0

    z_score = norm.ppf(1 - (1 - confidence) / 2)
    z_squared = z_score**2
    p_hat = helpful_yes / total_vote

    centre = p_hat + z_squared / (2 * total_vote)
    variance_term = (p_hat * (1 - p_hat) + z_squared / (4 * total_vote)) / total_vote
    margin = z_score * variance_term**0.5
    denominator = 1 + z_squared / total_vote

    return (centre - margin) / denominator

# Score every review with wilson_lower_bound and store the result as a new column.
df['wilson_lower_bound'] = df.apply(
    lambda row: wilson_lower_bound(row['helpful_yes'], row['total_vote']),
    axis=1,
)

# Order the reviews from the highest to the lowest score and keep the first 20.
top_20_reviews = df.sort_values(by='wilson_lower_bound', ascending=False).head(20)

# Let's see the results:
report_columns = ['reviewText', 'helpful_yes', 'helpful_no', 'total_vote', 'wilson_lower_bound']
print(top_20_reviews[report_columns])

                                             reviewText  helpful_yes  helpful_no  total_vote  wilson_lower_bound
2031  [[ UPDATE - 6/19/2014 ]]So my lovely wife boug...         1952          68        2020             0.95754
3449  I have tested dozens of SDHC and micro-SDHC ca...         1428          77        1505             0.93652
4212  NOTE:  please read the last update (scroll to ...         1568         126        1694             0.91214
317   If your card gets hot enough to be painful, it...          422          73         495             0.81858
4672  Sandisk announcement of the first 128GB micro ...           45           4          49             0.80811
1835  Bought from BestBuy online the day it was anno...           60           8          68             0.78465
3981  The last few days I have been diligently shopp...          112          27         139             0.73214
3807  I bought this card to replace a lost 16 gig in...           22           3          25    