# Amazon Customer Review Analysis
## By Andrew Simmons & Jingnan Jin

## Schema Reference
0. marketplace
1. customer_id
2. review_id
3. product_id
4. product_parent
5. product_title
6. product_category
7. star_rating
8. helpful_votes
9. total_votes
10. vine
11. verified_purchase
12. review_headline
13. review_body
14. review_date

In [None]:
from collections import defaultdict
from functools import reduce
from operator import itemgetter
from pathlib import Path
import re

import findspark
import matplotlib.pyplot as plt
import numpy as np

findspark.init()

# Create entry points to Spark.
# Re-running this cell must not trip over a context left from a previous
# run, so any existing SparkContext is stopped first. On the very first run
# `sc` is not defined yet; that NameError is the only failure ignored here.
# Anything else (including a keyboard interrupt) is allowed to propagate.
try:
    sc.stop()
except NameError:
    pass
# pyspark is imported only after findspark.init() has run.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Local master using all available cores ("local[*]").
conf = SparkConf().setAppName("AmazonCustomerReviewAnalysis").setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SparkSession(sparkContext=sc)

In [None]:
# Constants
# Figure size handed to fig.set_size_inches() by the plotting cells,
# i.e. (width, height) in inches.
PLOT_DIMENSIONS = (20, 10)

In [None]:
"""Process list of files to separate by category"""

data_dir = Path('sample_data')
data_files = data_dir.glob('*.tsv.gz')

# Map each category name to the list of files that belong to it.
# The category name is what is left of the file name after dropping its
# first 18 and last 13 characters, with underscores turned into spaces.
# NOTE(review): presumably the files are named like
# `amazon_reviews_us_<Category>_v1_00.tsv.gz` -- confirm against the data.
data_categories = defaultdict(list)
for path in data_files:
    trimmed_name = path.name[18:-13]
    data_categories[trimmed_name.replace('_', ' ')].append(path)

In [None]:
"""Create mapping of category names to unioned RDD"""

# Each category's list of files is replaced by one RDD that spans all of
# them. Only existing keys are reassigned, so iterating over the mapping
# while updating it is safe.
for category, files in data_categories.items():
    per_file_rdds = [sc.textFile(str(path)) for path in files]
    data_categories[category] = sc.union(per_file_rdds)

In [None]:
"""Remove headers from data"""

# A header line is recognised by starting with `marketplace`, the name of
# the first column in the schema; every other line is kept.
for category, lines in data_categories.items():
    data_categories[category] = lines.filter(
        lambda line: not line.startswith('marketplace')
    )

In [None]:
"""Split TSV"""

# Turn every raw line into a list of its tab-separated fields, so later
# cells can address columns by the indices in the schema reference.
for category, lines in data_categories.items():
    data_categories[category] = lines.map(lambda line: line.split('\t'))

## How many records exist in each category?

In [None]:
# One Spark count() action per category, collected as (category, count).
record_counts = [
    (category, records.count())
    for category, records in data_categories.items()
]

# Sort ascending by count, then split the pairs into two parallel tuples.
categories, counts = zip(*sorted(record_counts, key=lambda pair: pair[1]))

fig, ax = plt.subplots()
fig.set_size_inches(PLOT_DIMENSIONS)

rects = ax.bar(categories, counts)

ax.set(
    title='Amazon Review Counts by Category',
    xlabel='Product Category',
    ylabel='Review Counts',
)

# Category names are long; turn the x tick labels vertical.
for label in ax.get_xticklabels():
    label.set_rotation(90)

print(f'Total number of reviews in dataset: {sum(counts)}')

## Overall, how satisfied are customers of each product category?

In [None]:
# Mean star rating per category; column 7 is `star_rating` in the schema
# and is parsed as an integer before averaging.
average_review_by_category = [
    (category, records.map(lambda fields: int(fields[7])).mean())
    for category, records in data_categories.items()
]

# Sort ascending by average rating, then split into parallel tuples.
categories, ratings = zip(
    *sorted(average_review_by_category, key=lambda pair: pair[1])
)

fig, ax = plt.subplots()
fig.set_size_inches(PLOT_DIMENSIONS)

rects = ax.bar(categories, ratings)

ax.set(
    title='Average Product Category Rating',
    xlabel='Product Category',
    ylabel='Average Rating',
)

# Category names are long; turn the x tick labels vertical.
for label in ax.get_xticklabels():
    label.set_rotation(90)