## <b>Initial Library Imports</b>

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### <b> Importing CSV, Finding products (Earbuds)</b>

In [18]:
# Read dataset and search for products
df = pd.read_csv("new_reference_data_for_beats.csv")

products = df["product_id"].unique()

print(products)

['B09GJVTRNZ' 'B0BYC52LYP' 'B08X4YMTPM' 'B0D6WD2QSQ' 'B0DYB6KMJH'
 'B0CVFKZ1LC' 'B0BW34LCB8' 'B09PYVXXW5' 'B09H27SXHS' 'B09FM6PDHP'
 'B09HN594TL' 'B096SV8SJG' 'B0BTZKP1TP' 'B08ZR5JB9G' 'B0C2F5KD26'
 'B0BYPFNW6T' 'B093SLWMS7' 'B0CXL4FQBK' 'B099TJGJ91' 'B0D4SX9RC6'
 'B0B44F1GGK' 'B0BG93TZ5N' 'B09XXW54QG' 'B07DD3WBYW' 'B07YBN9XXG'
 'B0CY6S748H' 'B01IOD7KB6' 'B0CVFM97GD' 'B08NLCW9WY' 'B08YRT9T38'
 'B08VL5S148' 'B097XX34SL' 'B07PXGQC1Q' 'B0BZ9WMLNQ' 'B0D3JB14QS'
 'B0BQPNMXQV' 'B0C1QWWZR4' 'B09CKF166Y' 'B0CF7GYNW2' 'B09CFP6J6D'
 'B0BZTCXG6T' 'B0863H1JKB' 'B09JL41N9C' 'B09V9P5Q6W' 'B0BZK2Z2TC'
 'B0C345M3T7' 'B0CD2FSRDD' 'B0CPFV77W4' 'B0B1NGPY94' 'B0BYSQDWRT'
 'B0B2SH4CN6' 'B09D1HMBQ3' 'B0D4STD5ZC' 'B099TLMRB6' 'B07GWRCZQP'
 'B09GK5JMHK' 'B088KRKFJ3' 'B08Z1RP9K8' 'B0B445JCZ3' 'B0B43Y8GHZ']


In [19]:
# Determine which products are beats earbuds and which ones aren't w/ brute force :(
# Double check all product codes on amazon

product_mapping = {
    "B09PYVXXW5" : "1MORE ComfoBuds Mini",
    "B09H27SXHS" : "HTC True Wireless Bluetooth Earbuds 2",
    "B09FM6PDHP" : "JBL Tune 230NC TWS",
    "B09HN594TL" : "JLab Go Air Pop True Wireless Bluetooth Earbuds",
    "B096SV8SJG" : "Beats Studio Buds",
    "B08ZR5JB9G" : "Sennheiser Consumer Audio CX True Wireless Earbuds"
}

### <b> Mapping Products to ID's, Removing All other results </b>

In [20]:
product_ids = product_mapping.keys()
product_names = product_mapping.values()

In [21]:
#Remove results not found in products I'm trying to analyze
df = df[df['product_id'].isin(product_ids)]

In [22]:
df['product_name'] = df['product_id'].map(product_mapping)

### <b> Basic Cleaning </b>

In [23]:
# Remove duplicate answers
df = df.drop_duplicates()

In [24]:
# Check where most null values are:
for i in df.columns:
    print("Column Name: {} | Null vals: {}".format(i, df[i].isnull().sum()))

Column Name: review_id | Null vals: 0
Column Name: product_id | Null vals: 0
Column Name: title | Null vals: 0
Column Name: author | Null vals: 1
Column Name: rating | Null vals: 0
Column Name: content | Null vals: 2
Column Name: timestamp | Null vals: 0
Column Name: profile_id | Null vals: 0
Column Name: is_verified | Null vals: 0
Column Name: helpful_count | Null vals: 0
Column Name: product_attributes | Null vals: 101
Column Name: product_name | Null vals: 0


In [25]:
# Handle missing qualitative data other than attributes
df_dropped = df
df_dropped['author'] = df_dropped['author'].fillna("Unknown")
df_dropped['content'] = df_dropped['content'].dropna()

In [26]:
# Check if there is a specific products causing the null vals for
df_check = df_dropped
df_check = df_check[df_check['product_attributes'].isnull()]
null_companies = df_check.groupby('product_name')['product_attributes'].value_counts(dropna=False)
print(null_companies)

product_name                           product_attributes
Beats Studio Buds                      NaN                     1
HTC True Wireless Bluetooth Earbuds 2  NaN                   100
Name: count, dtype: int64


In [27]:
# Fill Values for the specific product in the cleaned dataset
df_dropped = df_dropped.fillna("Unknown")
df['product_attributes'] = df['product_attributes'].fillna("Not Specified")

In [28]:
df_dropped

Unnamed: 0,review_id,product_id,title,author,rating,content,timestamp,profile_id,is_verified,helpful_count,product_attributes,product_name
45,RUE030N50F9EJ,B09PYVXXW5,5.0 out of 5 stars Really good with a couple o...,CTM,5,I love TWS earbuds. I have many including buds...,"Reviewed in the United States May 14, 2022",AEGYSY5H3ZUJC4SGGPRM3Z2OE5PA,1,32,Color: Black,1MORE ComfoBuds Mini
46,R385JSD6KWP2QU,B09PYVXXW5,4.0 out of 5 stars I wish I could rate 5 stars...,Gianna,4,…I just can’t. Because as useful as these earb...,"Reviewed in the United States July 22, 2023",AHINA7A6O2I5RZSNAY4OWYN4QXVA,1,31,Color: Red,1MORE ComfoBuds Mini
47,R1UB1V4EPP9MN3,B09PYVXXW5,"5.0 out of 5 stars Basically perfect, fantasti...",Colin M.,5,I needed a replacement for my Galaxy buds pro'...,"Reviewed in the United States August 18, 2022",AFG2T5XGMQCACK7JBDRHKEKWJLPA,1,21,Color: Red,1MORE ComfoBuds Mini
48,RWYK1GXIVV6H1,B09PYVXXW5,1.0 out of 5 stars Decent but Defective,Aquila,1,"UPDATE 6/24/22: As of 6/23/22, my replacement ...","Reviewed in the United States June 3, 2022",AGFUWY2GO4HF5RMLUR7ZOSKID4KA,1,9,Color: Black,1MORE ComfoBuds Mini
49,R3FBQBGQM3II4W,B09PYVXXW5,3.0 out of 5 stars I went with soundcore instead,Frankie,3,So out of the box these tiny buds surprised me...,"Reviewed in the United States October 28, 2022",AEOFU2SCDWYLS6DTSXIR6FWMRQMQ,1,7,Color: Black,1MORE ComfoBuds Mini
...,...,...,...,...,...,...,...,...,...,...,...,...
2448,R1XBAKBKG2S46J,B096SV8SJG,4.0 out of 5 stars Work great!,Ginia M,4,Nice quality sound,"Reviewed in the United States August 5, 2024",AFXUF2INMS7JNNUE4ROFDBIIYOXQ,1,0,Color: BlackStyle: Studio BudsSet: Without App...,Beats Studio Buds
2449,R2W75ORU094CIS,B096SV8SJG,5.0 out of 5 stars Comfortable,Laura,5,I got these to replace my Apple AirPods becaus...,"Reviewed in the United States August 6, 2024",AEJDQQ6ATDSOHAIQJZR77ASZWNLA,1,0,Color: WhiteStyle: Studio BudsSet: Without App...,Beats Studio Buds
2450,R30OPOXA9TLKOO,B096SV8SJG,5.0 out of 5 stars Work great,Gary Hageman,5,These work great. Got them on prime days for 1...,"Reviewed in the United States August 6, 2024",AH4XWZN7MMJY5DPCQDWDM5KKPSAQ,1,0,Color: BlackStyle: Studio BudsSet: Without App...,Beats Studio Buds
3030,R31Y8P0O74CZ2I,B08ZR5JB9G,5.0 out of 5 stars Excellent product,Unknown,5,The sound quality it produces is excellent and...,"Reviewed in the United States June 7, 2024",AFYSPXTT2U35CMRK3NZWS76HBEKA,1,0,Color: BlackSize: SmallStyle: True wireless ea...,Sennheiser Consumer Audio CX True Wireless Ear...


In [29]:
df = df_dropped
print(df.isnull().value_counts())

review_id  product_id  title  author  rating  content  timestamp  profile_id  is_verified  helpful_count  product_attributes  product_name
False      False       False  False   False   False    False      False       False        False          False               False           625
Name: count, dtype: int64


In [30]:
df.to_csv("reviews_cleaned.csv")

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 625 entries, 45 to 3031
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   review_id           625 non-null    object
 1   product_id          625 non-null    object
 2   title               625 non-null    object
 3   author              625 non-null    object
 4   rating              625 non-null    int64 
 5   content             625 non-null    object
 6   timestamp           625 non-null    object
 7   profile_id          625 non-null    object
 8   is_verified         625 non-null    int64 
 9   helpful_count       625 non-null    int64 
 10  product_attributes  625 non-null    object
 11  product_name        625 non-null    object
dtypes: int64(3), object(9)
memory usage: 63.5+ KB
