In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
import gzip
import json

In [14]:
# Functions for reading in the dataset obtained from https://nijianmo.github.io/amazon/index.html
def parse(path):
    """Lazily yield one decoded JSON value per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file holding one JSON document per line.

    Yields
    ------
    object
        Whatever ``json.loads`` returns for each non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager releases the file handle once the generator is
    # exhausted, closed early, or interrupted by an exception.
    with gzip.open(path, 'rb') as gz_file:
        for raw_line in gz_file:
            # A whitespace-only line (e.g. a trailing newline) carries no record.
            if not raw_line.strip():
                continue
            yield json.loads(raw_line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and those
    positions become the index of the returned frame.
    """
    records_by_position = {
        position: record for position, record in enumerate(parse(path))
    }
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [9]:
# Read every review from the gzipped JSON-lines file into one in-memory DataFrame
df = getDF('reviews_Electronics_5.json.gz')

# Data Processing and EDA

In [51]:
# Preview the first three rows to see what the raw columns look like
df.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"


**Feature Descriptions**
- reviewerID – ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin – ID of the product, e.g. 0000013714
- reviewerName – name of the reviewer
- helpful – helpfulness rating of the review, e.g. 2/3
- reviewText – text of the review
- overall – rating of the product
- summary – summary of the review
- unixReviewTime – time of the review (unix time)
- reviewTime – time of the review (raw)

One of the first things I noticed is that the 'helpful' feature will need to be massaged: each value is currently a two-element list of the form [helpful votes, total votes] (e.g. [12, 15]) rather than a single number.

In [70]:
def helpful_decomposition(df, min_percent=60, min_votes=5):
    """Derive vote-based features from the 'helpful' column, modifying df in place.

    Each entry of ``df['helpful']`` is indexed positionally: element 0 is taken
    as the number of helpful votes and element 1 as the overall number of votes.

    Columns added (or overwritten) on ``df``:

    - ``helpful_votes``   -- element 0 of 'helpful'
    - ``overall_votes``   -- element 1 of 'helpful'
    - ``percent_helpful`` -- helpful_votes / overall_votes * 100, rounded to a
      whole number; NaN where a review has received no votes (0 / 0)
    - ``review_helpful``  -- 1 when percent_helpful > min_percent AND
      overall_votes > min_votes, otherwise 0 (rows with NaN percent get 0)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'helpful' column of two-element sequences.
    min_percent : float, default 60
        A review must exceed this helpful percentage to be flagged helpful.
    min_votes : int, default 5
        A review must exceed this many overall votes to be flagged helpful.

    Returns
    -------
    None
        The new columns are written directly onto ``df``.
    """
    helpful = df['helpful']
    df['helpful_votes'] = helpful.apply(lambda votes: votes[0])
    df['overall_votes'] = helpful.apply(lambda votes: votes[1])

    # 0 / 0 evaluates to NaN here, which marks reviews nobody has voted on.
    ratio = df['helpful_votes'] / df['overall_votes']
    df['percent_helpful'] = (ratio * 100).round()

    # Comparisons against NaN are False, so unvoted reviews are labelled 0.
    is_helpful = (df['percent_helpful'] > min_percent) & (df['overall_votes'] > min_votes)
    df['review_helpful'] = np.where(is_helpful, 1, 0)

In [71]:
# Calling function to decompose the 'helpful' feature.
# It adds the derived columns to df in place and returns nothing.
helpful_decomposition(df)

In [72]:
# The dataframe now has some new features that make the 'helpful' column more meaningful
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,helpful_votes,overall_votes,percent_helpful,review_helpful
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013",0,0,,0
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010",12,15,80.0,1
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010",43,45,96.0,1
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010",9,10,90.0,1
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011",0,0,,0


In [64]:
# (rows, columns) of the dataframe
df.shape

(1689188, 13)

In [65]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1689188 entries, 0 to 1689187
Data columns (total 13 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   reviewerID       1689188 non-null  object 
 1   asin             1689188 non-null  object 
 2   reviewerName     1664458 non-null  object 
 3   helpful          1689188 non-null  object 
 4   reviewText       1689188 non-null  object 
 5   overall          1689188 non-null  float64
 6   summary          1689188 non-null  object 
 7   unixReviewTime   1689188 non-null  int64  
 8   reviewTime       1689188 non-null  object 
 9   helpful_votes    1689188 non-null  int64  
 10  overall_votes    1689188 non-null  int64  
 11  percent_helpful  725961 non-null   float64
 12  review_helpful   1689188 non-null  int32  
dtypes: float64(2), int32(1), int64(3), object(7)
memory usage: 174.0+ MB


In [73]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,overall,unixReviewTime,helpful_votes,overall_votes,percent_helpful,review_helpful
count,1689188.0,1689188.0,1689188.0,1689188.0,725961.0,1689188.0
mean,4.222779,1340571000.0,3.160563,3.780423,74.902496,0.08455838
std,1.185632,63424510.0,38.96195,40.67347,35.223544,0.2782235
min,1.0,929232000.0,0.0,0.0,0.0,0.0
25%,4.0,1318118000.0,0.0,0.0,50.0,0.0
50%,5.0,1360800000.0,0.0,0.0,100.0,0.0
75%,5.0,1385078000.0,1.0,2.0,100.0,0.0
max,5.0,1406074000.0,30735.0,31453.0,100.0,1.0
