In [1]:
#TUR
#Veri Seti hikayesi :
#Amazon ürün verilerini içeren bu veri seti ürün kategorileri ile çeşitli metadatalarıiçermektedir. Elektronik kategorisindeki en fazla yorum alan ürünün kullanıcı puanları ve yorumları vardır

#ENG
#Story of Dataset
#This dataset, which includes Amazon product data, includes product categories and various metadata. The product with the most reviews in the electronics category has user ratings and reviews

In [2]:
#Importing Libraries
import pandas as pd 
import math 
import scipy.stats as st
import numpy as np
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
pd.set_option('display.expand_frame_repr',False)
pd.set_option('display.float_format', lambda x: '%.5f' % x)


In [3]:
#ENG
#Reading the dataset
#TUR 
#Veri Setinin okunması
df = pd.read_csv("amazon_review.csv")

In [4]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


In [5]:
#ENG
#Check for shape of dataset
#TUR
#Veri setinin şekil kontrolü, değişken sayısı ve satır sayısı...
df.shape

(4915, 12)

In [6]:
#ENG 
#Check if there is null value in columns
#TUR
#Değişkenlerde boş değer var mı yok mu kontrol ediyoruz
df.isnull().sum()

reviewerID        0
asin              0
reviewerName      1
helpful           0
reviewText        1
overall           0
summary           0
unixReviewTime    0
reviewTime        0
day_diff          0
helpful_yes       0
total_vote        0
dtype: int64

In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
overall,4915.0,4.58759,0.99685,1.0,5.0,5.0,5.0,5.0
unixReviewTime,4915.0,1379465001.66836,15818574.32275,1339200000.0,1365897600.0,1381276800.0,1392163200.0,1406073600.0
day_diff,4915.0,437.36704,209.43987,1.0,281.0,431.0,601.0,1064.0
helpful_yes,4915.0,1.31109,41.61916,0.0,0.0,0.0,0.0,1952.0
total_vote,4915.0,1.52146,44.12309,0.0,0.0,0.0,0.0,2020.0


In [8]:
#Calculation of avg overall: 
df["overall"].mean()

4.587589013224822

In [9]:
#Some changes to the reviewTime variable.

In [10]:
df = df.rename(columns = {'reviewTime': 'review_time'} )

In [11]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,review_time,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


In [12]:
df["review_time"] = pd.to_datetime(df["review_time"])

In [13]:
df.dtypes

reviewerID                object
asin                      object
reviewerName              object
helpful                   object
reviewText                object
overall                  float64
summary                   object
unixReviewTime             int64
review_time       datetime64[ns]
day_diff                   int64
helpful_yes                int64
total_vote                 int64
dtype: object

In [14]:
#ENG 
#Take the maximum value of review_time as Current Date

#TUR
#Bugünün tarihini review_time'ın maximum değeriyle aldık.
current_date = df["review_time"].max()

In [15]:
current_date

Timestamp('2014-12-07 00:00:00')

In [16]:
#ENG
#Creating diff_Time variable
#TUR
# diff_time değişkeninin oluşturulması.
df["diff_time"] = (current_date-df["review_time"]).dt.days

In [17]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,review_time,day_diff,helpful_yes,total_vote,diff_time
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0,137
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0,408
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0,714
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0,381
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0,512


In [18]:
#ENG
# Defining a function for make quartiles of diff_time value of dataset

#TUR
#diff_time değişkeninin çeyreklik değerlerini yakalamak için bir fonksiyon tanımlıyoruz
def make_quartiles(dataframe, lst = [.25,.5,.75], quart_value = "diff_time"):
    quartiless = dataframe[quart_value].quantile(lst)
    return quartiless
    

In [25]:
quartiles = 

NameError: name 'quartiless' is not defined

In [20]:
#ENG
#We analyze by weighting the quarters we find with the time-weighted average.
#TUR
#Zaman ağırlıklı ortalama ile bulduğumuz çeyreklikleri ağırlaştırarak analiz yapıyoruz.ve bunun için bir fonksiyon tanımlıyoruz
def time_based_weighted_average(dataframe, w1 =28, w2 = 26, w3 = 24, w4 = 22):
    tbwa = dataframe.loc[df["diff_time"] <= 280, "overall"].mean() * w1/100 + \
           dataframe.loc[(df["diff_time"] > 280) & (df["diff_time"] <= 430), "overall"].mean() * w2/100 + \
           dataframe.loc[(df["diff_time"] > 430) & (df["diff_time"] <= 600), "overall"].mean() * w3/100 + \
           dataframe.loc[df["diff_time"] > 600, "overall"].mean() * w4/100 
    print(f"Time Based Weighted Average is {tbwa}")

In [21]:
time_based_weighted_average(df)

Time Based Weighted Average is 4.595593165128118


In [22]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,review_time,day_diff,helpful_yes,total_vote,diff_time
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0,137
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0,408
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0,714
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0,381
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0,512


In [23]:
#ENG
#I create a function to capture weighted and weighted average values for each quarter.
#TUR
#Her bir çeyreklik için ağırlaştırılmış ve ağırlaştırılmamış ortalama değerleri yakalamak için bir fonksiyon yaratıyorum.
def time_based_weighted_average_each(dataframe,lst = [.25,.5,.75],quart_value = "diff_time",w1 =28, w2 = 26, w3 = 24, w4 = 22):
    quartiless = dataframe[quart_value].quantile(lst)
    return quartiless
    
    q1_weighted = dataframe.loc[df["diff_time"] <= quartiless[0.25], "overall"].mean() * w1/100
    q2_weighted = dataframe.loc[(df["diff_time"] > quartiless[0.25]) & (df["diff_time"] <= quartiless[0.50]), "overall"].mean() * w2 / 100
    q3_weighted = dataframe.loc[(df["diff_time"] > quartiless[0.50]) & (df["diff_time"] <= quartiless[0.75]), "overall"].mean() * w3 / 100
    q4_weighted = dataframe.loc[df["diff_time"] > quartiless[0.75], "overall"].mean()* w4/100
    
    q1_unweighted = dataframe.loc[df["diff_time"] <= quartiless[0.25], "overall"].mean() 
    q2_unweighted = dataframe.loc[(df["diff_time"] > quartiless[0.25]) & (df["diff_time"] <= quartiless[0.50]), "overall"].mean() 
    q3_unweighted = dataframe.loc[(df["diff_time"] > quartiless[0.50]) & (df["diff_time"] <= quartiless[0.75]), "overall"].mean() 
    q4_unweighted = dataframe.loc[df["diff_time"] > quartiless[0.75], "overall"].mean()
    
    print(" Weighted Values ".center(60, "*"))
    print(60*" ")
    print(f"Quartile_1 mean with weighted value is {q1_weighted}")
    print(f"Quartile_2 mean with weighted value is {q2_weighted}")
    print(f"Quartile_3 mean with weighted value is {q3_weighted}")
    print(f"Quartile_4 mean with weighted value is {q4_weighted}")
    print(60*" ")
    print(" Unweighted Values ".center(60, "*"))
    print(60*" ")
    print(f"Quartile_1 mean with unweighted value is {q1_unweighted}")
    print(f"Quartile_2 mean with unweighted value is {q2_unweighted}")
    print(f"Quartile_3 mean with unweighted value is {q3_unweighted}")
    print(f"Quartile_4 mean with unweighted value is {q4_unweighted}")
    

In [24]:
time_based_weighted_average_each(df,quartiless)

NameError: name 'quartiless' is not defined

In [None]:
#ENG 
#Considering the predominantly average score of the quarter, it was observed that customers who commented on a more recent date in terms of days gave higher scores.
#While creating the weight system, the weight system has been applied in a decreasing way from the recent past to the distant date.
#It has been observed that this situation occurs in the same way in the data without the weight system applied.

#TUR 
#Yorumlama : 4 Çeyreğin ağırlıklı olarak puan ortalamalarına bakıldığında gün olarak daha yakın tarihte yorum yapan müşterilerin daha yüksek puan verdiği gözlemlenmiştir. 
#Ağırlık sistemi yaratılırken, yakın tarihten uzak tarihe doğru azalan bir şekilde ağırlık sistemi uygulanmıştır. 
#Ağırlık sistemi uygulanmamış verilerde de bu durumun aynı şekilde gerçekleştiği gözlemlenmiştir. 

In [None]:
#ENG
#creating of helpful_no variable
#TUR 
#helpful_no değişkeninin oluşturulması
df["helpful_no"] = df["total_vote"] - df["helpful_yes"]

In [None]:
#ENG
#creating of score_pos_neg_diff variable
#TUR
#score_pos_neg_diff değişkeninin oluşturulması
def score_pos_neg_diff(pos,neg):
    return pos-neg

df["score_pos_neg_diff"] = df.apply(lambda t: score_pos_neg_diff(t["helpful_yes"],t["helpful_no"]), axis = 1)

In [None]:
df.head()

In [None]:
#ENG
#creating of score_average_rating function
#TUR
#score_average_rating fonksiyonunun oluşturulması
def score_average_rating(pos,neg):
    if pos+neg == 0:
        return 0
    return pos / (pos+neg)

In [None]:
#ENG
#creating of score_average_rating column
#TUR
#score_average_rating  oluşturulması
df["score_average_rating"] = df.apply(lambda x: score_average_rating(x["helpful_yes"],x["helpful_no"]),axis = 1)

In [None]:
df.head()

In [None]:
#ENG
#We defined a function named Wilson_lower_bound
#TUR
#Wilson_lower_bound adında bir fonksiyon tanımladık
def wilson_lower_bound(pos,neg,confidence = 0.95):
    n = pos + neg
    if n == 0:
        return 0
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * pos / n
    return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)

In [None]:
#ENG
#We defined a column named Wilson_lower_bound and applied the function in it.

#TUR
#Wilson_lower_bound adında bir kolon tanımlayıp içerisinde fonksiyonu uyguladık.
df["wilson_lower_bound"] = df.apply(lambda x: wilson_lower_bound(x["helpful_yes"],x["helpful_no"]),axis = 1) 

In [None]:
df.head()

In [None]:
#ENG
#We ranked the reviews with Wilson_lower_bound and viewed the first 20 reviews.
#TUR
#Wilson_lower_bound ile reviewleri sıralayıp ilk 20 review'i görüntüledik.
df.sort_values("wilson_lower_bound", ascending = False).head(20)

In [None]:
#ENG
#Looking at the ranks, the first 20 reviews were received according to wilson_lower_bound. Numbers, distributions and bernoulli of positive and negative comments in Wilson_Lower_bound
#We took the lower limit of the p value as probability. There is no change in overall, both low overalls and high overalls are in the top 20.
#Despite being negative comments, they rose to the top of the rankings because they were explanatory in terms of social proof and satisfying in terms of information.
#score_average_rating did not exactly affect the wlb value. While a value such as 0.95 was observed in the first value, values such as 1.000 were observed in the lower ranks.

#TUR
#Yorum : 
#Sıralamalara bakıldığında wilson_lower_bound'a göre ilk 20 review alınmıştır. Wilson_Lower_bound içerisinde positif ve negatif yorumların sayıları, dağılımları ve bernoulli
#olasılığında p değerinin alt limitini değer olarak aldık. Overall açısından herhangi bir değişiklik yoktur, düşük overallar da yüksek overaller da ilk 20 içerisinde yer almaktadır.
#Negatif yorum olmalarına rağmen sosyal kanıt açısından açıklayıcı oldukları ve bilgi açısından doyurucu oldukları için sıralamada üst sıralara yükselmişlerdir. 
#score_average_rating birebir şekilde wlb değerini etkilememiştir. birinci olan değerde 0.95 gibi bir değer gözlenirken alt sıralarda 1.000 gibi değerler gözlemlenmiştir