In [1]:
import pandas as pd

In [2]:
# Load the review table from vine_table.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="vine_table.csv")

In [3]:
# 1. Keep only the reviews that received at least 20 total votes. Every row that
#    survives has a non-zero total_votes, so the helpful/total ratio computed later
#    cannot divide by zero, and thinly voted reviews are left out.
has_min_votes = df["total_votes"] >= 20
reviews_df = df[has_min_votes]
reviews_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
218,R37F42INKX7L9K,5,45,49,N,Y
557,R2EHKYNEP8WVSR,5,25,25,N,Y
560,RXOS7BHID0UHL,5,16,27,N,N
909,R33HHGFPB403GM,5,19,21,N,Y
1232,RY9O9XNLP464N,2,19,22,N,Y


In [4]:
# 2. Keep the reviews whose share of helpful votes (helpful_votes / total_votes)
#    is at least 50%.
helpful_share = reviews_df["helpful_votes"] / reviews_df["total_votes"]
helpful_reviews_df = reviews_df[helpful_share >= 0.5]
helpful_reviews_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
218,R37F42INKX7L9K,5,45,49,N,Y
557,R2EHKYNEP8WVSR,5,25,25,N,Y
560,RXOS7BHID0UHL,5,16,27,N,N
909,R33HHGFPB403GM,5,19,21,N,Y
1232,RY9O9XNLP464N,2,19,22,N,Y


In [5]:
# 3. From the helpful reviews, select the ones flagged as Vine program reviews (vine == 'Y').
is_vine = helpful_reviews_df["vine"] == "Y"
vine_reviews_df = helpful_reviews_df[is_vine]
vine_reviews_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
270349,R2N45ZKRRZS856,5,21,22,Y,N
306676,R5OMLMK13A8NS,5,34,38,Y,N
1050160,R2MPEQ4SPTEQNS,4,180,184,Y,N
1224726,RIR0D3KJ0CQ31,4,21,21,Y,N
1231743,R1SPWJDHUWWC5E,5,88,98,Y,N


In [6]:
# 4. From the helpful reviews, select the ones NOT flagged as Vine program reviews (vine == 'N').
is_not_vine = helpful_reviews_df["vine"] == "N"
non_vine_reviews_df = helpful_reviews_df[is_not_vine]
non_vine_reviews_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
218,R37F42INKX7L9K,5,45,49,N,Y
557,R2EHKYNEP8WVSR,5,25,25,N,Y
560,RXOS7BHID0UHL,5,16,27,N,N
909,R33HHGFPB403GM,5,19,21,N,Y
1232,RY9O9XNLP464N,2,19,22,N,Y


In [7]:
# 5. Overall number of reviews (paid and unpaid), counted as distinct review IDs
#    among the helpful reviews.
helpful_review_ids = helpful_reviews_df["review_id"]
all_reviews = helpful_review_ids.nunique()
print(f"The overall number of reviews (paid and unpaid) is {all_reviews}.")

The overall number of reviews (paid and unpaid) is 27009.


In [9]:
# 5. Paid (Vine) reviews, computed over vine_reviews_df, i.e. the reviews with
#    at least 20 total votes, at least 50% helpful votes, and vine == 'Y':
#     - total number of paid reviews
#     - number of paid 5-star reviews
#     - percentage of paid reviews that are five stars (denominator is all paid reviews)

# Total number of paid reviews, counted as distinct review IDs.
total_paid_reviews = vine_reviews_df["review_id"].nunique()
print(f"There are {total_paid_reviews} total paid reviews for shoes on Amazon.")

# Number of paid 5-star reviews. These are counted as distinct review IDs too, so
# the numerator and the denominator of the percentage are measured in the same unit.
# A single .loc[rows, column] lookup is used instead of chained indexing.
paid_five_star_reviews = vine_reviews_df.loc[
    vine_reviews_df["star_rating"] == 5, "review_id"
].nunique()
print(f"Out of all paid reviews, there are {paid_five_star_reviews} five star reviews.")

# Percentage of paid reviews that are five stars, rounded to a whole percent.
# The division is guarded: with no paid reviews the ratio is undefined, so 0 is
# reported instead of failing on a zero denominator.
percentage_paid_five_star_reviews = (
    round(100 * paid_five_star_reviews / total_paid_reviews)
    if total_paid_reviews
    else 0
)
print(f"{percentage_paid_five_star_reviews}% of paid reviews had a five star rating.")

There are 22 total paid reviews for shoes on Amazon.
Out of all paid reviews, there are 13 five star reviews.
59% of paid reviews had a five star rating.


In [10]:
# 5. Unpaid (non-Vine) reviews, computed over non_vine_reviews_df, i.e. the reviews
#    with at least 20 total votes, at least 50% helpful votes, and vine == 'N':
#     - total number of unpaid reviews
#     - number of unpaid 5-star reviews
#     - percentage of unpaid reviews that are five stars (denominator is all unpaid reviews)

# Total number of unpaid reviews, counted as distinct review IDs.
total_unpaid_reviews = non_vine_reviews_df["review_id"].nunique()
print(f"There are {total_unpaid_reviews} total unpaid reviews for shoes on Amazon.")

# Number of unpaid 5-star reviews. These are counted as distinct review IDs too, so
# the numerator and the denominator of the percentage are measured in the same unit.
# A single .loc[rows, column] lookup is used instead of chained indexing.
unpaid_five_star_reviews = non_vine_reviews_df.loc[
    non_vine_reviews_df["star_rating"] == 5, "review_id"
].nunique()
print(f"Out of all unpaid reviews, there are {unpaid_five_star_reviews} five star reviews.")

# Percentage of unpaid reviews that are five stars, rounded to a whole percent.
# The division is guarded: with no unpaid reviews the ratio is undefined, so 0 is
# reported instead of failing on a zero denominator.
percentage_unpaid_five_star_reviews = (
    round(100 * unpaid_five_star_reviews / total_unpaid_reviews)
    if total_unpaid_reviews
    else 0
)
print(f"{percentage_unpaid_five_star_reviews}% of unpaid reviews had a five star rating.")

There are 26987 total unpaid reviews for shoes on Amazon.
Out of all unpaid reviews, there are 14475 five star reviews.
54% of unpaid reviews had a five star rating.
