## Useful links:

- [Dataset documentation](https://nijianmo.github.io/amazon/index.html)
- [Complete Metadata files](http://deepyeti.ucsd.edu/jianmo/amazon/index.html)
- [Pandas reference sheet](https://ds100.org/sp21/resources/assets/exams/sp20/sp20_checkpoint_reference_sheet.pdf)
- [Data-200 Google Doc](https://docs.google.com/document/d/19HWODy5kpWoUB7BEKEmKLbRnK8MC1fBmRat_WP7vfNc/edit)
- [Grad Project Guidelines](https://ds100.org/sp21/grad_proj/gradproject/)

## Utils

In [None]:
# Imports.

import os
import json
import gzip
import urllib.request
from urllib.request import urlopen
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Utils.

#################################### Loading data. ####################################

def load_data(url, filename):
    """Load a gzipped JSON-lines file into a DataFrame, downloading it first if needed.

    Parameters:
        url: location the archive is fetched from when `filename` is absent.
        filename: local path of the gzipped file; one JSON object per line.

    Returns:
        A DataFrame with one row per JSON record. The shape and the first
        three rows are also printed/displayed as a quick sanity check.
    """
    if not os.path.exists(filename):
        # urlretrieve does not create missing folders, so make sure the
        # target directory (e.g. 'data/') exists before downloading.
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        urllib.request.urlretrieve(url, filename)

    # Load the data: every non-blank line is a standalone JSON document.
    data = []
    with gzip.open(filename) as f:
        for line in f:
            record = line.strip()
            if record:  # tolerate blank / trailing empty lines
                data.append(json.loads(record))

    df = pd.DataFrame.from_dict(data)
    print('data shape:', df.shape)
    print('first rows of data:')
    display(df.head(3))
    return df

def get_metadata_with_ratings(reviews, metadata):
    """Attach each product's mean review score to the metadata table.

    Parameters:
        reviews: DataFrame with at least the columns 'asin' and 'overall'.
        metadata: DataFrame with an 'asin' column.

    Returns:
        `metadata` left-joined with a new 'rating' column (mean of 'overall'
        per 'asin'); products without any review get NaN.
    """
    # Average the 'overall' score per product and expose it as 'rating'.
    per_product = reviews[['asin', 'overall']].groupby('asin').mean()
    per_product = per_product.rename(columns={'overall': 'rating'})

    # Left join so that every metadata row is kept.
    merged = pd.merge(metadata, per_product, how="left", on="asin")

    # Check how many products have ratings.
    rating_col = merged['rating']
    print('distribution of ratings:')
    display(rating_col.describe())
    print('number of missing ratings:', rating_col.isnull().sum())
    return merged

################################## Data exploration. ##################################

def describe_feat(data, feat):
    """Show summary statistics for column `feat` and print its missing-value count."""
    column = data[feat]
    display(column.describe())
    n_missing = column.isnull().sum()
    print(f'number of missing {feat}s:', n_missing)

def plot_joint(data, x, y):
    """Draw a regression joint plot of `y` against `x` and print the fitted line.

    Parameters:
        data: DataFrame containing both columns.
        x: name of the predictor column.
        y: name of the response column.

    A separate LinearRegression is fitted on the rows where both columns are
    present, and its intercept, slope and r^2 are printed.
    """
    sns.jointplot(data=data, x=x, y=y, kind='reg',
                  scatter_kws={'alpha': 0.1, 's': 15}, line_kws={'color': 'r'})

    # Drop rows missing either value. dropna works for any column name,
    # unlike an interpolated query string, which needs valid identifiers.
    data_cleaned = data.dropna(subset=[x, y])
    X = data_cleaned[x].to_numpy().reshape(-1, 1)  # sklearn expects a 2-D feature matrix
    target = data_cleaned[y]
    model = LinearRegression().fit(X, target)
    print(f'y = {model.intercept_} + {model.coef_[0]} * x, r^2 = {model.score(X, target)}')
    
###################################### Modeling. ######################################

def clean_features(metadata_with_ratings):
    """Add cleaned feature columns to `metadata_with_ratings` in place.

    Reads the 'price', 'rank' and 'description' columns and adds:
        price_float     - price parsed as a number (NaN when not parseable)
        rank_float      - leading number of the sales-rank string
        rank_category   - text between ' in ' and ' (' in the sales-rank string
        description_str - description entries joined with newlines

    Summary statistics for the numeric columns and the category counts are
    printed along the way. Returns None.
    """
    # Clean price.
    # regex=False: '$' is a regex anchor, so the literal replacement must be
    # explicit instead of relying on the pandas-version-dependent default.
    # Thousands separators are removed too, so "$1,299.00" parses as 1299.0.
    price_text = (metadata_with_ratings['price']
                  .str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False))
    metadata_with_ratings['price_float'] = pd.to_numeric(price_text, errors='coerce')
    describe_feat(metadata_with_ratings, 'price_float')

    # Clean sales rank: strip thousands separators, keep the leading digits.
    rank_text = metadata_with_ratings['rank']
    leading_digits = (rank_text
                      .str.replace(',', '', regex=False)
                      .str.extract(r'^(\d+)', expand=False))
    metadata_with_ratings['rank_float'] = pd.to_numeric(leading_digits, errors='coerce')
    describe_feat(metadata_with_ratings, 'rank_float')

    # Add sales rank category (and undo the HTML escaping of '&').
    metadata_with_ratings['rank_category'] = (rank_text
                                              .str.extract(r' in (.+) \(', expand=False)
                                              .str.replace('&amp;', '&', regex=False))
    print('categories:')
    print(metadata_with_ratings['rank_category'].value_counts())

    # Clean description.
    # NOTE(review): assumes each description is a list of strings - confirm.
    metadata_with_ratings['description_str'] = metadata_with_ratings['description'].str.join('\n')
    
def transform_col(data, func, feat, new_feat):
    """Store `func(data[feat])` in the new column `data[new_feat]` (in place).

    Parameters:
        data: DataFrame to read from and write to.
        func: callable applied to the whole source column (e.g. np.log).
        feat: name of the existing source column.
        new_feat: name of the column to create or overwrite.
    """
    # Read from the frame that was passed in, not from a notebook global,
    # so the function works on any DataFrame.
    data[new_feat] = func(data[feat])

def extract_words(data, feat, max_words=100):
    """Build a bag-of-words count matrix from the text column `feat`.

    Parameters:
        data: DataFrame holding the text column.
        feat: name of the text column.
        max_words: vocabulary size passed to CountVectorizer as max_features.

    Returns:
        A dense array of word counts, one row per row of `data`.
        The first 50 vocabulary entries and the matrix shape are printed.
    """
    vectorizer = CountVectorizer(max_features=max_words, stop_words='english')

    # CountVectorizer rejects NaN documents; treat missing text as empty.
    documents = data[feat].fillna('')
    X = vectorizer.fit_transform(documents).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out().tolist()
    else:
        vocabulary = vectorizer.get_feature_names()

    print('first 50 features:', vocabulary[:50])
    print('feature matrix shape', X.shape)
    return X

def get_feat_matrix(data):
    """Placeholder: not implemented yet; ignores `data` and returns None."""
    return None

## Load data

### Column labels:

Reviews:

- reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin - ID of the product, e.g. 0000013714
- reviewerName - name of the reviewer
- vote - helpful votes of the review
- style - a dictionary of the product metadata, e.g., "Format" is "Hardcover"
- reviewText - text of the review
- overall - rating of the product
- summary - summary of the review
- unixReviewTime - time of the review (unix time)
- reviewTime - time of the review (raw)
- image - images that users post after they have received the product

Metadata:

- asin - ID of the product, e.g. 0000031852
- title - name of the product
- feature - bullet-point format features of the product
- description - description of the product
- price - price in US dollars (at time of crawl)
- image - url of the product image
- related - related products (also bought, also viewed, bought together, buy after viewing)
- salesRank - sales rank information (the code below reads this from the `rank` column)
- brand - brand name
- categories - list of categories the product belongs to
- tech1 - the first technical detail table of the product
- tech2 - the second technical detail table of the product
- similar - similar product table

### Import data

In [None]:
# Load the All_Beauty review records; load_data downloads the archive to
# `filename` only when that file does not exist yet.
url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/All_Beauty.json.gz"
filename = 'data/All_Beauty.json.gz'
reviews = load_data(url, filename)

In [None]:
# Load the All_Beauty product metadata (rebinds `url` and `filename`).
url = "http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/meta_All_Beauty.json.gz"
filename = 'data/Meta_All_Beauty.json.gz'
metadata = load_data(url, filename)

### Merge reviews and metadata

In [None]:
# One row per metadata entry, with the product's mean review score in 'rating'.
metadata_with_ratings = get_metadata_with_ratings(reviews, metadata)

### Clean features

In [None]:
# Adds price_float, rank_float, rank_category and description_str in place.
clean_features(metadata_with_ratings)

In [None]:
# Add log- and square-root-transformed versions of the numeric sales rank.
transform_col(metadata_with_ratings, np.log, 'rank_float', 'log_rank')
transform_col(metadata_with_ratings, np.sqrt, 'rank_float', 'sqrt_rank')

In [None]:
# Keep only products whose sales rank is reported in "Beauty & Personal Care".
metadata_beauty = metadata_with_ratings.query('rank_category == "Beauty & Personal Care"')

## Data exploration

### Sales rank vs average rating

In [None]:
# The two products with no reviews.
# (Rows whose 'rating' is NaN, i.e. no review was merged in for them.)
metadata_beauty.query('rating.isnull()')

The following three plots show the correlation between sales rank and average rating. The correlation is most apparent when taking the log of sales rank.

In [None]:
# Average rating against the log of the sales rank.
plot_joint(data=metadata_beauty, x='log_rank', y='rating')

In [None]:
# Average rating against the square root of the sales rank.
plot_joint(data=metadata_beauty, x='sqrt_rank', y='rating')

In [None]:
# Average rating against the untransformed sales rank.
plot_joint(data=metadata_beauty, x='rank_float', y='rating')

### Price vs sales rank and rating

In [None]:
# Restrict to products with price_float below 100, then plot the square root
# of the sales rank against price.
price_truncated = metadata_beauty.query('price_float < 100')
plot_joint(data=price_truncated, x='price_float', y='sqrt_rank')

In [None]:
# Same price filter (price_float below 100), this time plotting the average
# rating against price.
price_truncated = metadata_beauty.query('price_float < 100')
plot_joint(data=price_truncated, x='price_float', y='rating')

### Brand

In [None]:
# Number of products per brand, most common first.
brand_counts = metadata_beauty['brand'].value_counts()
display(brand_counts.head(10))

# Leave out the single most frequent entry for the summary and the plot.
# NOTE(review): presumably that entry is the blank/unknown brand - confirm.
brand_counts_rest = brand_counts.iloc[1:]
display(brand_counts_rest.describe())

# Plot only brands that occur more than 10 times.
brand_counts_filtered = brand_counts_rest[brand_counts_rest > 10]
fig = plt.figure(figsize=(12, 4))
plt.plot(np.arange(len(brand_counts_filtered)), brand_counts_filtered)
plt.ylabel('frequency')
plt.title('Frequency of occurrence for brands, from most to least common')

### Title and description

In [None]:
# Most frequent words in product titles (default vocabulary of 100 words).
# The trailing semicolon keeps the notebook from echoing the returned matrix.
extract_words(metadata_beauty, 'title');

In [None]:
# Most frequent words in product descriptions, with a 500-word vocabulary.
# The trailing semicolon keeps the notebook from echoing the returned matrix.
extract_words(metadata_beauty, 'description_str', 500);

### PCA