# Positivity Bias Analysis

## Determining Bias Towards Favorable Reviews from Amazon Vine Program Participants

### Data Preparation

In [1]:
# Load dependencies
import pandas as pd

In [2]:
# Load the vine_table CSV export into a DataFrame, then preview its first rows
df = pd.read_csv(filepath_or_buffer="vine_table.csv")
df.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,REAKC26P07MDN,5.0,0.0,0.0,N,Y
1,R3NU7OMZ4HQIEG,2.0,0.0,1.0,N,Y
2,R14QJW3XF8QO1P,5.0,0.0,0.0,N,Y
3,R2HB7AX0394ZGY,5.0,0.0,0.0,N,Y
4,RGKMPDQGSAHR3,5.0,0.0,0.0,N,Y


In [3]:
# Keep only the reviews that received at least 20 votes in total
df_2 = df[df['total_votes'].ge(20)]
df_2.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y


In [4]:
# Keep only the reviews where helpful votes make up at least half of all votes
df_3 = df_2[df_2['helpful_votes'].div(df_2['total_votes']).ge(0.5)]
df_3.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y


In [5]:
# Split the filtered reviews on the `vine` flag:
#   'Y' -> Vine Program participants (paid), 'N' -> non-participants (unpaid)
paid_df = df_3[df_3['vine'].eq('Y')]
unpaid_df = df_3[df_3['vine'].eq('N')]

In [6]:
# Preview the first five Vine (paid) reviews
paid_df.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
8547,R3A71VR1JZD8WF,2.0,27.0,30.0,Y,N
10246,R16OMUJIGI18JZ,5.0,72.0,72.0,Y,N
25168,R3TS8ZP2FHQ9XR,5.0,39.0,42.0,Y,N
46422,R2MHP919VZN7DI,5.0,29.0,30.0,Y,N
66446,RD2BCTVS59A5L,2.0,20.0,20.0,Y,N


In [7]:
# Preview the first five non-Vine (unpaid) reviews
unpaid_df.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y


In [8]:
# Determine total # of reviews, # of 5-star reviews, and % of 5-star reviews for paid vs. unpaid reviews
def _five_star_summary(reviews):
    """Summarize how many of the given reviews are 5-star.

    Parameters
    ----------
    reviews : DataFrame with a `star_rating` column.

    Returns
    -------
    tuple of (total review count, 5-star review count, 5-star percentage
    rounded to 2 decimals). The percentage is NaN when `reviews` is empty,
    because the share is undefined and dividing by zero would raise.
    """
    total = len(reviews)
    five_star = len(reviews[reviews['star_rating'] == 5.0])
    if total == 0:
        # No reviews survived the filters; avoid ZeroDivisionError.
        return total, five_star, float('nan')
    return total, five_star, round((five_star / total) * 100, 2)


# Same computation for both groups; the individual names are kept so that
# other cells can still reference them.
paid_count, paid_5_count, paid_5_percentage = _five_star_summary(paid_df)
unpaid_count, unpaid_5_count, unpaid_5_percentage = _five_star_summary(unpaid_df)

# One record per group; the keys become the row labels of the summary table.
paid = {'Total_Reviews': paid_count, '5_Star_Reviews': paid_5_count, 'Percent_5_Star': paid_5_percentage}
unpaid = {'Total_Reviews': unpaid_count, '5_Star_Reviews': unpaid_5_count, 'Percent_5_Star': unpaid_5_percentage}
data = [paid, unpaid]

In [9]:
# Build one row per group from `data`, then flip the table so the groups
# ("Paid", "Unpaid") become columns and the metrics become rows.
group_labels = ["Paid", "Unpaid"]
summary_df = pd.DataFrame(data, index=group_labels).T
summary_df

Unnamed: 0,Paid,Unpaid
Total_Reviews,170.0,37840.0
5_Star_Reviews,65.0,20612.0
Percent_5_Star,38.24,54.47


### Analysis

In [10]:
# Test for statistical significance