In [1]:
# Dependencies
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Locate the two CSV inputs (relative paths, resolved against the working directory)
amazon_data1, amazon_data2 = (
    Path('amazon_products.csv'),
    Path('amazon_categories.csv'),
)

# Echo both paths, one per line, for confirmation
print(amazon_data1, amazon_data2, sep='\n')

In [None]:
# Load the Amazon Products CSV (path held in `amazon_data1`) into a DataFrame
amazon_data1_df = pd.read_csv(amazon_data1)

# Preview the first 3 rows of the products DataFrame
display(amazon_data1_df.head(3))

In [None]:
# Load the Amazon Categories CSV (path held in `amazon_data2`) into a DataFrame
amazon_data2_df = pd.read_csv(amazon_data2)

# Preview the first 5 rows of the categories DataFrame
display(amazon_data2_df.head(5))

In [None]:
# Attach category information to every product.
# Products reference their category through 'category_id', while the categories
# table exposes the same key as 'id'. The join uses pandas' default (inner) mode,
# and the redundant 'id' key column is dropped from the result.
amazon_data_merged_df = (
    amazon_data1_df
    .merge(amazon_data2_df, left_on='category_id', right_on='id')
    .drop(columns=['id'])
)

# Preview both ends of the merged DataFrame (first 3 and last 3 rows)
display(amazon_data_merged_df.head(3), amazon_data_merged_df.tail(3))

In [None]:
# Print the DataFrame.info() summary of the merged DataFrame
# (columns, non-null counts, dtypes and memory usage)
amazon_data_merged_df.info()

In [None]:
# Clean the data

# Share of missing values per column, expressed as a percentage
missing_pct = amazon_data_merged_df.isnull().mean() * 100
display(missing_pct)

# Discard every row that has at least one missing value
amazon_data_merged_df = amazon_data_merged_df.dropna()

# Count the fully duplicated rows that remain after the null removal
duplicate_count = amazon_data_merged_df.duplicated().sum()
display(duplicate_count)

# Keep only the first occurrence of each duplicated row
amazon_data_merged_df = amazon_data_merged_df.drop_duplicates()

# Show the column dtypes, then the first 3 rows of the cleaned, merged DataFrame
display(amazon_data_merged_df.dtypes, amazon_data_merged_df.head(3))

In [None]:
# Give the columns human-readable labels (original name -> display label).
# Note that 'boughtInLastMonth' is labelled 'Product Volume'.
column_labels = {
    'asin': 'Product ID',
    'title': 'Product Description',
    'stars': 'Stars',
    'reviews': 'Reviews',
    'price': 'Price',
    'listPrice': 'List Price',
    'category_id': 'Category ID',
    'isBestSeller': 'Best Seller',
    'boughtInLastMonth': 'Product Volume',
    'category_name': 'Category Name',
}
amazon_data_merged_df = amazon_data_merged_df.rename(columns=column_labels)

# Show the first 3 rows with the new column labels
amazon_data_merged_df.head(3)

In [None]:
# Remove the 'imgUrl' and 'productURL' columns from the merged DataFrame
url_columns = ['imgUrl', 'productURL']
amazon_data_merged_df = amazon_data_merged_df.drop(url_columns, axis='columns')

# Preview the first 3 rows of the merged DataFrame without the URL columns
display(amazon_data_merged_df.head(3))

In [10]:
# Remove the 'Product ID' and 'Category ID' identifier columns from the merged DataFrame
amazon_data_merged_df = amazon_data_merged_df.drop(
    ['Product ID', 'Category ID'],
    axis='columns',
)


In [None]:
# Reorder the columns: category and description first, then prices,
# ratings, and finally the best-seller flag and product volume
column_order = [
    'Category Name', 'Product Description',
    'Price', 'List Price',
    'Stars', 'Reviews',
    'Best Seller', 'Product Volume',
]
amazon_data_merged_df = amazon_data_merged_df[column_order]

# Show the first 3 rows in the new column order
amazon_data_merged_df.head(3)

In [None]:
# Measures of central tendency (mean, median, mode) for Price
mean_numpy = np.mean(amazon_data_merged_df["Price"])
median_numpy = np.median(amazon_data_merged_df["Price"])
# stats.mode returns a ModeResult(mode, count); depending on the SciPy version
# its fields are scalars or length-1 arrays.
mode_scipy = stats.mode(amazon_data_merged_df["Price"])
# np.ravel(...)[0] extracts the plain value in either case, so the message
# reports a number instead of the ModeResult object's repr.
mode_value = np.ravel(mode_scipy.mode)[0]
mode_count = np.ravel(mode_scipy.count)[0]

print(f"The mean price of Amazon products is {mean_numpy}")
print(f"The median price of Amazon products is {median_numpy}")
print(f"The mode price of Amazon products is {mode_value} (occurs {mode_count} times)")

In [None]:
# Materialize the Price column as a plain Python list
price_data = amazon_data_merged_df["Price"].tolist()

# Preview only the first 10 values: echoing the full list would write every
# value of the column into the cell output
price_data[:10]

In [None]:
# Measures of central tendency (mean, median, mode) for List Price
mean_numpy = np.mean(amazon_data_merged_df["List Price"])
median_numpy = np.median(amazon_data_merged_df["List Price"])
# stats.mode returns a ModeResult(mode, count); depending on the SciPy version
# its fields are scalars or length-1 arrays.
mode_scipy = stats.mode(amazon_data_merged_df["List Price"])
# np.ravel(...)[0] extracts the plain value in either case, so the message
# reports a number instead of the ModeResult object's repr.
mode_value = np.ravel(mode_scipy.mode)[0]
mode_count = np.ravel(mode_scipy.count)[0]

print(f"The mean list price of Amazon products is {mean_numpy}")
print(f"The median list price of Amazon products is {median_numpy}")
print(f"The mode list price of Amazon products is {mode_value} (occurs {mode_count} times)")

In [None]:
# Materialize the List Price column as a plain Python list
list_price_data = amazon_data_merged_df["List Price"].tolist()

# Preview only the first 10 values: echoing the full list would write every
# value of the column into the cell output
list_price_data[:10]

In [None]:
# Measures of central tendency (mean, median, mode) for Stars
mean_numpy = np.mean(amazon_data_merged_df["Stars"])
median_numpy = np.median(amazon_data_merged_df["Stars"])
# stats.mode returns a ModeResult(mode, count); depending on the SciPy version
# its fields are scalars or length-1 arrays.
mode_scipy = stats.mode(amazon_data_merged_df["Stars"])
# np.ravel(...)[0] extracts the plain value in either case, so the message
# reports a number instead of the ModeResult object's repr.
mode_value = np.ravel(mode_scipy.mode)[0]
mode_count = np.ravel(mode_scipy.count)[0]

print(f"The mean stars of Amazon products is {mean_numpy}")
print(f"The median stars of Amazon products is {median_numpy}")
print(f"The mode stars of Amazon products is {mode_value} (occurs {mode_count} times)")

In [None]:
# Measures of central tendency (mean, median, mode) for Reviews
mean_numpy = np.mean(amazon_data_merged_df["Reviews"])
median_numpy = np.median(amazon_data_merged_df["Reviews"])
# stats.mode returns a ModeResult(mode, count); depending on the SciPy version
# its fields are scalars or length-1 arrays.
mode_scipy = stats.mode(amazon_data_merged_df["Reviews"])
# np.ravel(...)[0] extracts the plain value in either case, so the message
# reports a number instead of the ModeResult object's repr.
mode_value = np.ravel(mode_scipy.mode)[0]
mode_count = np.ravel(mode_scipy.count)[0]

print(f"The mean reviews of Amazon products is {mean_numpy}")
print(f"The median reviews of Amazon products is {median_numpy}")
print(f"The mode reviews of Amazon products is {mode_value} (occurs {mode_count} times)")

In [None]:
# Materialize the Reviews column as a plain Python list
reviews_data = amazon_data_merged_df["Reviews"].tolist()

# Preview only the first 10 values: echoing the full list would write every
# value of the column into the cell output
reviews_data[:10]

In [None]:
# Measures of central tendency (mean, median, mode) for Product Volume
mean_numpy = np.mean(amazon_data_merged_df["Product Volume"])
median_numpy = np.median(amazon_data_merged_df["Product Volume"])
# stats.mode returns a ModeResult(mode, count); depending on the SciPy version
# its fields are scalars or length-1 arrays.
mode_scipy = stats.mode(amazon_data_merged_df["Product Volume"])
# np.ravel(...)[0] extracts the plain value in either case, so the message
# reports a number instead of the ModeResult object's repr.
mode_value = np.ravel(mode_scipy.mode)[0]
mode_count = np.ravel(mode_scipy.count)[0]

print(f"The mean product volume of Amazon products is {mean_numpy}")
print(f"The median product volume of Amazon products is {median_numpy}")
print(f"The mode product volume of Amazon products is {mode_value} (occurs {mode_count} times)")

In [None]:
# Materialize the Product Volume column as a plain Python list
product_volume_data = amazon_data_merged_df["Product Volume"].tolist()

# Preview only the first 10 values: echoing the full list would write every
# value of the column into the cell output
product_volume_data[:10]

In [None]:
# Variance and standard deviation for Price
# (np.var / np.std are called with their default ddof=0, i.e. the population formulas)
variance = np.var(amazon_data_merged_df["Price"])
# The message names the column so this output is distinguishable from the
# equivalent cells for the other columns
print(f"The variance of Price for Amazon products is {variance}")

standard_deviation = np.std(amazon_data_merged_df["Price"])
print(f"The standard deviation of Price for Amazon products is {standard_deviation}")

In [None]:
# Variance and standard deviation for List Price
# (np.var / np.std are called with their default ddof=0, i.e. the population formulas)
variance = np.var(amazon_data_merged_df["List Price"])
# The message names the column so this output is distinguishable from the
# equivalent cells for the other columns
print(f"The variance of List Price for Amazon products is {variance}")

standard_deviation = np.std(amazon_data_merged_df["List Price"])
print(f"The standard deviation of List Price for Amazon products is {standard_deviation}")

In [None]:
# Variance and standard deviation for Stars
# (np.var / np.std are called with their default ddof=0, i.e. the population formulas)
variance = np.var(amazon_data_merged_df["Stars"])
# The message names the column so this output is distinguishable from the
# equivalent cells for the other columns
print(f"The variance of Stars for Amazon products is {variance}")

standard_deviation = np.std(amazon_data_merged_df["Stars"])
print(f"The standard deviation of Stars for Amazon products is {standard_deviation}")

In [None]:
# Variance and standard deviation for Reviews
# (np.var / np.std are called with their default ddof=0, i.e. the population formulas)
variance = np.var(amazon_data_merged_df["Reviews"])
# The message names the column so this output is distinguishable from the
# equivalent cells for the other columns
print(f"The variance of Reviews for Amazon products is {variance}")

standard_deviation = np.std(amazon_data_merged_df["Reviews"])
print(f"The standard deviation of Reviews for Amazon products is {standard_deviation}")

In [None]:
# Variance and standard deviation for Product Volume
# (np.var / np.std are called with their default ddof=0, i.e. the population formulas)
variance = np.var(amazon_data_merged_df["Product Volume"])
# The message names the column so this output is distinguishable from the
# equivalent cells for the other columns
print(f"The variance of Product Volume for Amazon products is {variance}")

standard_deviation = np.std(amazon_data_merged_df["Product Volume"])
print(f"The standard deviation of Product Volume for Amazon products is {standard_deviation}")

In [None]:
# Quartiles and interquartile range (IQR) of Price, used to flag potential outliers
price_column = amazon_data_merged_df["Price"]
Q1 = price_column.quantile(0.25)
median = price_column.quantile(0.5)
Q3 = price_column.quantile(0.75)
IQR = Q3 - Q1

# Report the quartile statistics, one per line
print(
    f"The lower quartile of the Price column is: {Q1}",
    f"The upper quartile of the Price column is: {Q3}",
    f"The interquartile range of the Price column is: {IQR}",
    f"The median of the Price column is: {median}",
    sep="\n",
)

# Values beyond 1.5 * IQR outside the quartiles are treated as potential outliers
iqr_margin = 1.5 * IQR
lower_bound = Q1 - iqr_margin
upper_bound = Q3 + iqr_margin
print(
    f"Values below {lower_bound} could be outliers for Price.",
    f"Values above {upper_bound} could be outliers for Price.",
    sep="\n",
)



In [None]:
# Find potential outliers by the interquartile range (IQR) for List Price
Q1 = amazon_data_merged_df["List Price"].quantile(0.25)
median = amazon_data_merged_df["List Price"].quantile(0.5)
Q3 = amazon_data_merged_df["List Price"].quantile(0.75)
IQR = Q3 - Q1

print(f"The lower quartile of the List Price column is: {Q1}")
print(f"The upper quartile of the List Price column is: {Q3}")
print(f"The interquartile range of the List Price column is: {IQR}")
# Report the median as well: it is computed above and the Price cell prints it
print(f"The median of the List Price column is: {median}")

# Values beyond 1.5 * IQR outside the quartiles are treated as potential outliers
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)
print(f"Values below {lower_bound} could be outliers for List Price.")
print(f"Values above {upper_bound} could be outliers for List Price.")

In [None]:
# Find potential outliers by the interquartile range (IQR) for Stars
Q1 = amazon_data_merged_df["Stars"].quantile(0.25)
median = amazon_data_merged_df["Stars"].quantile(0.5)
Q3 = amazon_data_merged_df["Stars"].quantile(0.75)
IQR = Q3 - Q1

print(f"The lower quartile of the Stars column is: {Q1}")
print(f"The upper quartile of the Stars column is: {Q3}")
print(f"The interquartile range of the Stars column is: {IQR}")
# Report the median as well: it is computed above and the Price cell prints it
print(f"The median of the Stars column is: {median}")

# Values beyond 1.5 * IQR outside the quartiles are treated as potential outliers
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)
print(f"Values below {lower_bound} could be outliers for Stars.")
print(f"Values above {upper_bound} could be outliers for Stars.")

In [None]:
# Find potential outliers by the interquartile range (IQR) for Reviews
Q1 = amazon_data_merged_df["Reviews"].quantile(0.25)
median = amazon_data_merged_df["Reviews"].quantile(0.5)
Q3 = amazon_data_merged_df["Reviews"].quantile(0.75)
IQR = Q3 - Q1

print(f"The lower quartile of the Reviews column is: {Q1}")
print(f"The upper quartile of the Reviews column is: {Q3}")
print(f"The interquartile range of the Reviews column is: {IQR}")
# Report the median as well: it is computed above and the Price cell prints it
print(f"The median of the Reviews column is: {median}")

# Values beyond 1.5 * IQR outside the quartiles are treated as potential outliers
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)
print(f"Values below {lower_bound} could be outliers for Reviews.")
print(f"Values above {upper_bound} could be outliers for Reviews.")

In [None]:
# Find potential outliers by the interquartile range (IQR) for Product Volume
Q1 = amazon_data_merged_df["Product Volume"].quantile(0.25)
median = amazon_data_merged_df["Product Volume"].quantile(0.5)
Q3 = amazon_data_merged_df["Product Volume"].quantile(0.75)
IQR = Q3 - Q1

print(f"The lower quartile of the Product Volume column is: {Q1}")
print(f"The upper quartile of the Product Volume column is: {Q3}")
print(f"The interquartile range of the Product Volume column is: {IQR}")
# Report the median as well: it is computed above and the Price cell prints it
print(f"The median of the Product Volume column is: {median}")

# Values beyond 1.5 * IQR outside the quartiles are treated as potential outliers
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)
print(f"Values below {lower_bound} could be outliers for Product Volume.")
print(f"Values above {upper_bound} could be outliers for Product Volume.")

In [None]:
# Count how often each distinct value occurs in the Best Seller column
best_seller_summary = amazon_data_merged_df["Best Seller"].value_counts()

# Print the heading followed by the counts on the next line
print("Summary Statistics for Best Seller:", best_seller_summary, sep="\n")

In [None]:
# Count how often each distinct value occurs in the Category Name column
category_name_summary = amazon_data_merged_df["Category Name"].value_counts()

# Print the heading followed by the counts on the next line
print("Summary Statistics for Category Name:", category_name_summary, sep="\n")

In [None]:
# Count how often each distinct Product Description occurs
# (value_counts sorts by frequency, most frequent first)
product_description_summary = amazon_data_merged_df["Product Description"].value_counts()
print("Summary Statistics for Product Description:")
# Show the 10 most frequent descriptions under the heading, which previously
# had nothing printed after it
print(product_description_summary.head(10))
In [None]:
# Wrap the Product Description value counts in a DataFrame; the same name is
# rebound, so from here on `product_description_summary` is a DataFrame
product_description_summary = pd.DataFrame(product_description_summary)
# Display the first 3 rows
product_description_summary.head(3)