# Positivity Bias Analysis

## Determining Bias Towards Favorable Reviews from Amazon Vine Program Participants

### Data Preparation

In [1]:
# Load dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff

In [2]:
# Load the vine_table CSV export into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="vine_table.csv")
df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,REAKC26P07MDN,5.0,0.0,0.0,N,Y
1,R3NU7OMZ4HQIEG,2.0,0.0,1.0,N,Y
2,R14QJW3XF8QO1P,5.0,0.0,0.0,N,Y
3,R2HB7AX0394ZGY,5.0,0.0,0.0,N,Y
4,RGKMPDQGSAHR3,5.0,0.0,0.0,N,Y


In [3]:
# Keep only the reviews that received at least 20 total votes
df_2 = df[df['total_votes'].ge(20)]
df_2.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y


In [4]:
# Filter for >= 50% helpful_votes
# Every row of df_2 already has total_votes >= 20, so the ratio never divides by zero.
# .copy() gives df_3 its own data: new columns are added to df_3 further down, and
# assigning into a filtered slice would otherwise raise SettingWithCopyWarning.
df_3 = df_2.loc[(df_2['helpful_votes'] / df_2['total_votes']) >= .5].copy()
df_3.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y


In [5]:
# Count the missing values in each column of the filtered frame
df_3.isna().sum()

review_id            0
star_rating          0
helpful_votes        0
total_votes          0
vine                 0
verified_purchase    0
dtype: int64

In [6]:
# Inspect the data type pandas assigned to each column of the filtered frame
df_3.dtypes

review_id             object
star_rating          float64
helpful_votes        float64
total_votes          float64
vine                  object
verified_purchase     object
dtype: object

In [7]:
# Add star_str column with star rating as string type (e.g. 5.0 -> '5').
# The dot is escaped and the pattern anchored so only a trailing ".0" is stripped;
# an unescaped '.0' means "any character followed by 0" wherever str.replace
# treats the pattern as a regular expression, so regex=True is passed explicitly
# to get the same result on every pandas version.
# assign() builds a new frame instead of writing a column into a filtered slice.
df_3 = df_3.assign(
    star_str=df_3['star_rating'].astype(str).str.replace(r'\.0$', '', regex=True)
)
df_3.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase,star_str
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y,1
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y,5
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y,3
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y,2
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y,4


In [8]:
# Create 'review_type' column for 'Paid', 'Unpaid'
# vine == 'Y' marks a Vine (paid) review, 'N' an unpaid one; any other value maps to NaN.
# assign() returns a new frame rather than setting a column on a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df_3 = df_3.assign(review_type=df_3['vine'].map({'N': 'Unpaid', 'Y': 'Paid'}))
df_3.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_type
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y,Unpaid
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y,Unpaid
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y,Unpaid
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y,Unpaid
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y,Unpaid


### Data Organization

In [None]:
# Split the filtered reviews by Vine Program membership:
#   paid_df   -> rows where vine == 'Y' (Vine Program participants)
#   unpaid_df -> rows where vine == 'N' (non-participants)
paid_df = df_3[df_3['vine'].eq('Y')]
unpaid_df = df_3[df_3['vine'].eq('N')]

In [None]:
# Preview the first five paid (Vine) reviews
paid_df.head(n=5)

In [None]:
# Preview the first five unpaid (non-Vine) reviews
unpaid_df.head(n=5)

## Analysis

In [None]:
# Determine total # of reviews, # of 5-star reviews, and % of 5-star reviews for paid vs. unpaid reviews
def _five_star_summary(reviews):
    """Summarize how many of the given reviews are 5-star.

    Args:
        reviews: DataFrame with a numeric 'star_rating' column.

    Returns:
        dict with keys 'Total_Reviews', '5_Star_Reviews' and 'Percent_5_Star'.
        The percentage is rounded to 2 decimals and is NaN when `reviews` is empty.
    """
    total = len(reviews)
    five_star = len(reviews[reviews['star_rating'] == 5.0])
    # An empty group (e.g. no Vine reviews survive the filters) would otherwise
    # raise ZeroDivisionError here.
    percent = round((five_star / total) * 100, 2) if total else float('nan')
    return {'Total_Reviews': total, '5_Star_Reviews': five_star, 'Percent_5_Star': percent}


paid = _five_star_summary(paid_df)
unpaid = _five_star_summary(unpaid_df)

# Keep the individual figures available under their original names.
paid_count, paid_5_count, paid_5_percentage = paid.values()
unpaid_count, unpaid_5_count, unpaid_5_percentage = unpaid.values()

data = [paid, unpaid]

In [None]:
# Tabulate the two summaries: one column per review type, one row per metric
summary_df = pd.DataFrame(data, index=["Paid", "Unpaid"]).T
summary_df

### Plotting

In [None]:
# Box plot of star ratings, one box per review type
fig = px.box(
    df_3,
    x="review_type",
    y="star_rating",
    labels=dict(review_type='Review Type', star_rating='Star Rating'),
    title="Unpaid vs. Paid Review Star Ratings",
)
fig.show()

In [None]:
# Bar Charts (TODO: not yet implemented)


In [None]:
# Kernel density estimate of star ratings for paid (Vine) reviews.
# Title and axis label make the figure readable on its own; the trailing
# semicolon suppresses the Axes repr in the cell output.
ax = paid_df.star_rating.plot.density(title="Paid Review Star Rating Density")
ax.set_xlabel("Star Rating");

In [None]:
# Kernel density estimate of star ratings for unpaid (non-Vine) reviews.
# Title and axis label make the figure readable on its own; the trailing
# semicolon suppresses the Axes repr in the cell output.
ax = unpaid_df.star_rating.plot.density(title="Unpaid Review Star Rating Density")
ax.set_xlabel("Star Rating");

In [None]:
# Combined distribution plot: overlaid histograms (bin width 1) with density
# curves for unpaid and paid star ratings, rug plot hidden
labels = ['Unpaid', 'Paid']
hist_data = [unpaid_df['star_rating'], paid_df['star_rating']]

fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=labels,
    bin_size=1,
    show_rug=False,
)
fig.show()