# Project Setup

In [None]:
import pandas as pd
import numpy as np
import re

# Dataset Analysis
In this Jupyter notebook, we'll analyze the data that we extracted previously.

In [None]:
# Where the combined dataset CSV lives: a folder prefix plus the file name.
DRIVE = 'drive/MyDrive/'
DATASET_NAME = 'combined_dataset.csv'
DATASET_PATH = f'{DRIVE}{DATASET_NAME}'

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,rating,parent_asin,timestamp,review,category
0,5,B01BRVADG6,2019-02-07 22:41:27.737,One of my favorite oils. This oil is wonderful...,All_Beauty
1,5,B01BRVADG6,2017-06-04 13:07:28.000,Five Stars. I love the oils! My skin is so muc...,All_Beauty
2,5,B01BRVADG6,2019-10-16 15:50:59.484,best massage oil. I have now used this product...,All_Beauty
3,5,B01BRVADG6,2017-08-09 11:35:37.781,Five Stars. 2 or 3 time buying I love this oil,All_Beauty
4,5,B01BRVADG6,2017-01-31 12:32:56.000,Five Stars. Awesome,All_Beauty


## Number of reviews

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

(10514, 5)

In [None]:
# Number of rows (reviews) in each category.
df.groupby("category").size()

Unnamed: 0_level_0,0
category,Unnamed: 1_level_1
All_Beauty,391
Amazon_Fashion,346
Appliances,551
Arts_Crafts_and_Sewing,282
Automotive,250
Baby_Products,250
Beauty_and_Personal_Care,250
Books,251
CDs_and_Vinyl,253
Cell_Phones_and_Accessories,249


In [None]:
# Count the rows for every (category, parent_asin) pair.
asin_counts = (
    df.groupby(['category', 'parent_asin'])
    .size()
    .reset_index(name='review_count')
)

# Summarise those per-ASIN counts within each category: mean, min and max.
asin_review_stats = (
    asin_counts.groupby('category')['review_count']
    .agg(
        avg_reviews_per_asin='mean',
        min_reviews_per_asin='min',
        max_reviews_per_asin='max',
    )
    .reset_index()
)

asin_review_stats

Unnamed: 0,category,avg_reviews_per_asin,min_reviews_per_asin,max_reviews_per_asin
0,All_Beauty,65.166667,23,208
1,Amazon_Fashion,49.428571,28,103
2,Appliances,91.833333,30,333
3,Arts_Crafts_and_Sewing,56.4,21,161
4,Automotive,20.833333,9,56
5,Baby_Products,20.833333,1,92
6,Beauty_and_Personal_Care,20.833333,1,107
7,Books,20.916667,5,72
8,CDs_and_Vinyl,21.083333,1,129
9,Cell_Phones_and_Accessories,22.636364,2,192


## Rating

In [None]:
# Mean and standard deviation of the `rating` column over all rows.
df['rating'].mean(), df['rating'].std()

(4.264124025109378, 1.2665916872698675)

In [None]:
# Mean and standard deviation of `rating` within each category.
df.groupby("category")[['rating']].agg(['mean', 'std'])

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,mean,std
category,Unnamed: 1_level_2,Unnamed: 2_level_2
All_Beauty,4.084399,1.328403
Amazon_Fashion,3.991329,1.406995
Appliances,4.38294,1.27079
Arts_Crafts_and_Sewing,4.400709,1.1867
Automotive,4.212,1.388079
Baby_Products,4.296,1.273921
Beauty_and_Personal_Care,3.808,1.521973
Books,4.454183,1.054937
CDs_and_Vinyl,4.758893,0.667537
Cell_Phones_and_Accessories,4.285141,1.235815


## Review length

In [None]:
# Review length in characters and in whitespace-separated words.
# The vectorised `.str` accessor produces the same counts as `apply(len)` /
# `apply(lambda x: len(x.split()))` for string reviews, but yields NaN for a
# missing review instead of raising a TypeError (NaN is a float and has no
# `len`). The `mean()` calls below skip NaN values by default.
df['num_characters'] = df['review'].str.len()
df['num_words'] = df['review'].str.split().str.len()

# Average characters and average words per review over the whole dataset.
df['num_characters'].mean(), df['num_words'].mean()

(228.62830511698687, 42.0527867605098)

In [None]:
# Per-category mean / min / max of the two length columns. The named
# aggregation spec is generated so that the output columns come out as
# avg_characters, min_characters, max_characters, avg_words, min_words, max_words.
result = (
    df.groupby('category')
    .agg(**{
        f'{label}_{unit}': (f'num_{unit}', func)
        for unit in ('characters', 'words')
        for label, func in (('avg', 'mean'), ('min', 'min'), ('max', 'max'))
    })
    .reset_index()
)

result

Unnamed: 0,category,avg_characters,min_characters,max_characters,avg_words,min_words,max_words
0,All_Beauty,197.040921,8,2349,36.815857,2,456
1,Amazon_Fashion,168.462428,8,1552,31.141618,2,300
2,Appliances,145.568058,5,1541,26.381125,2,273
3,Arts_Crafts_and_Sewing,149.319149,6,1495,27.521277,2,298
4,Automotive,219.672,10,2062,40.436,2,362
5,Baby_Products,204.328,12,1674,37.532,2,301
6,Beauty_and_Personal_Care,235.448,10,1539,44.58,2,281
7,Books,318.474104,10,4388,57.478088,2,760
8,CDs_and_Vinyl,175.549407,4,2392,32.715415,2,395
9,Cell_Phones_and_Accessories,199.2249,14,1586,37.646586,3,310
