# Lab - EDA Bivariate Analysis: Diving into Amazon UK Product Insights Part II


In [None]:
import pandas as pd
import os
import glob
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association
from scipy.stats import pearsonr, spearmanr
import scipy.stats as stats

In [None]:
# Location of the Amazon UK dataset. The default is the original author's local
# path; set the AMZ_UK_DATASET_PATH environment variable to load the CSV from a
# different location without editing the notebook.
DATA_PATH = os.environ.get(
    'AMZ_UK_DATASET_PATH',
    'C:/Users/faval/Desktop/Ironhack/DataAnalytics/week5/day1/lab-eda-univariate/amz_uk_price_prediction_dataset.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

**PART 1:** Analyzing Best-Seller Trends Across Product Categories

**1. Crosstab Analysis:**


In [None]:
# Contingency table: one row per product category, one column per isBestSeller value.
crosstab_result = pd.crosstab(index=df['category'], columns=df['isBestSeller'])
crosstab_result

In [None]:
# Share of best-sellers within each category. Comparing against True first keeps
# the original semantics (any value that is not True counts as "not a best-seller"),
# and the vectorised group mean replaces a per-group Python lambda.
is_best_seller = df['isBestSeller'].eq(True)
proportions = is_best_seller.groupby(df['category']).mean()
proportions_sorted = proportions.sort_values(ascending=False)
# Category labels of the ten categories with the highest best-seller share.
top_10_proportions = proportions_sorted.head(10).index
print(top_10_proportions)

In [None]:
# Category labels of the five categories with the highest best-seller share.
top_5_proportions = proportions_sorted.index[:5]

In [None]:
# The ten highest best-seller proportions, together with their category labels.
top_10_proportions_values = proportions_sorted.iloc[:10]
top_10_proportions_values

**2. Statistical Tests:**

In [None]:
# Chi-square test of independence on the category x isBestSeller contingency table.
# Only the statistic and the p-value are kept; the other two return values are discarded.
chi2_statistic, chi2_p_value, _, _ = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value
# Reported p-value = 0.0: this indicates a statistically significant association between the two variables.

In [None]:
# Cramér's V for the same contingency table, as a measure of association strength.
association(crosstab_result, method="cramer")
# The Cramér's V value of 0.122 suggests a weak association between the two variables.
# Though statistically significant, the relationship is not very strong in practical terms.

**3. Visualizations:**

In [None]:
# Stacked bar chart of the isBestSeller counts for the ten categories with the
# highest best-seller proportion.
top_10_crosstab = crosstab_result.loc[top_10_proportions]
ax = top_10_crosstab.plot(kind="bar", stacked=True, figsize=(10, 6))
# Label the figure so it can be read on its own.
ax.set_title('Best-Seller Counts for the Top 10 Categories by Best-Seller Proportion')
ax.set_xlabel('Category')
ax.set_ylabel('Number of products')
# Render explicitly so the cell does not echo the Axes repr.
plt.show()

In [None]:
# Count plot of best-seller vs. non-best-seller products, restricted (via `order`)
# to the five categories with the highest best-seller proportion.
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=df, x='category', hue='isBestSeller', order=top_5_proportions, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

**PART 2:** Exploring Product Prices and Ratings Across Categories and Brands

**0. Preliminary Step: Remove outliers in product prices.**

In [None]:
# Preview the first rows again before working on the price column.
df.head()

In [None]:
def tukeys_test_outliers(data, k=1.5):
    """Return the values of ``data`` that lie outside Tukey's fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and ``Q3``
    are the 25th and 75th percentiles of ``data`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to screen for outliers.
    k : float, default 1.5
        Fence multiplier. The default reproduces the classic Tukey rule; a
        larger value (e.g. 3.0) flags only more extreme points.

    Returns
    -------
    pandas.Series
        The entries of ``data`` strictly below the lower fence or strictly
        above the upper fence, keeping their original index labels. Missing
        values satisfy neither comparison and are therefore never returned.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    # Fences beyond which a value is treated as an outlier
    lower_bound = Q1 - k * IQR
    upper_bound = Q3 + k * IQR

    # Strict comparisons: values sitting exactly on a fence are kept
    is_outlier = (data < lower_bound) | (data > upper_bound)

    return data[is_outlier]

In [None]:
# Prices flagged as outliers by the Tukey rule.
outliers = df['price'].pipe(tukeys_test_outliers)
print(outliers)

In [None]:
# Keep every row whose index label is not among the flagged price outliers.
is_outlier_row = df.index.isin(outliers.index)
non_outliers_mask = ~is_outlier_row
df_cleaned = df.loc[non_outliers_mask]
df_cleaned["price"].describe()

In [None]:
# Box plot of the price column after outlier removal.
fig, ax = plt.subplots(figsize=(4, 3))
sns.boxplot(y=df_cleaned['price'], color='mintcream', ax=ax)
ax.set_title('Box Plot of Prices (Cleaned Data)')
ax.set_ylabel('Price')
plt.show()

**1. Violin Plots:**

In [None]:
# The 20 most frequent categories in the cleaned data (value_counts sorts by count, descending).
top_20_categories = df_cleaned['category'].value_counts().index[:20]
print(top_20_categories)

In [None]:
# Rows of the cleaned data that belong to one of the 20 most frequent categories.
df_top_20 = df_cleaned.loc[df_cleaned['category'].isin(top_20_categories)]

In [None]:
# Price distribution per category for the 20 most frequent categories.
# `hue` repeats the x variable only so that each violin gets its own palette colour.
fig, ax = plt.subplots(figsize=(16, 8))
sns.violinplot(
    data=df_top_20,
    x='category',
    y='price',
    hue='category',
    palette="coolwarm",
    dodge=False,
    ax=ax,
)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Median price of every category in the outlier-free data, highest first.
median_price_by_category = df_cleaned.groupby('category')['price'].agg('median')
median_price_sorted = median_price_by_category.sort_values(ascending=False)
print(median_price_sorted)
# Result reported by the author: Desktop PCs has the highest median price, at €74.00
# NOTE(review): the dataset is Amazon UK, so prices are presumably in GBP rather than EUR; confirm.

**2. Bar Charts:**

In [None]:
# Mean price within each of the ten most frequent categories of the cleaned data.
category_counts = df_cleaned['category'].value_counts()
top_10_categories = category_counts.index[:10]
in_top_10 = df_cleaned['category'].isin(top_10_categories)
df_top_10 = df_cleaned.loc[in_top_10]
mean_price_by_category = df_top_10.groupby('category')['price'].agg('mean')

In [None]:
# Bar chart of the mean price per category held in `mean_price_by_category`.
fig, ax = plt.subplots(figsize=(15, 6))
mean_price_by_category.plot(kind='bar', color='lavender', ax=ax)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
# Mean price for every category in the outlier-free data, highest first.
# Stored under its own name so that the top-10 `mean_price_by_category` Series
# plotted by the bar chart cell is not overwritten; otherwise re-running that
# cell after this one would plot every category instead of the top 10.
mean_price_all_categories = (
    df_cleaned.groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
)
print(mean_price_all_categories)
# Result reported by the author: Motherboards, with a mean price of 68.77, command the highest average price

**3. Box Plots:**

In [None]:
# Rating ("stars") distribution per category for the ten most frequent categories.
# `hue` repeats the x variable only so that each box gets its own palette colour.
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxplot(
    data=df_top_10,
    x='category',
    y='stars',
    hue='category',
    palette='pastel',
    dodge=False,
    legend=False,
    ax=ax,
)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Median rating of every category in the outlier-free data, highest first.
median_stars_by_category = df_cleaned.groupby('category')['stars'].agg('median')
median_stars_sorted = median_stars_by_category.sort_values(ascending=False)
print(median_stars_sorted)
# Result reported by the author: Computer Memory has the highest median rating, at 4.7

**PART 3:** Investigating the Interplay Between Product Prices and Ratings

**1. Correlation Coefficients:**

In [None]:
# Pearson (linear) correlation between price and rating on the full dataset.
pearson_corr, pearson_p = pearsonr(x=df['price'], y=df['stars'])
print(f"Pearson Correlation: {pearson_corr}, p-value: {pearson_p}")
# Result reported by the author: a coefficient of -0.125, i.e. a very weak negative linear correlation.

In [None]:
# Spearman (rank) correlation between price and rating on the full dataset.
spearman_corr, spearman_p = spearmanr(a=df['price'], b=df['stars'])
print(f"Spearman Correlation: {spearman_corr}, p-value: {spearman_p}")
# Result reported by the author: a coefficient of -0.133, i.e. a very weak negative monotonic relationship.

**2. Visualizations:**

In [None]:
# Rating against price for the full dataset; the x-axis is limited to prices from 0 to 100.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='price', y='stars', alpha=0.5, color='teal', ax=ax)
ax.set_xlim(0, 100)
plt.show()

In [None]:
# Inspect the column dtypes and non-null counts to choose the numerical columns.
df.info()
# I don't consider "uid" or "boughtInLastMonth" to be numerical.

In [None]:
# Pairwise correlations between the columns treated as numerical.
numerical_df = df[['stars', 'reviews', 'price']]
correlation_matrix = numerical_df.corr()
plt.figure(figsize=(8, 6))
# Correlations are signed values in [-1, 1], so use a diverging colormap centred on 0
# with the colour scale pinned to the full range. A qualitative palette such as
# 'Pastel1' assigns unordered colours and makes sign and magnitude unreadable.
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    fmt=".2f",
    linewidths=0.5,
    square=True,
)
plt.title('Correlation Heatmap for Numerical Features')
plt.show()

In [None]:
# Q-Q plot of the raw prices against a normal distribution.
fig, ax = plt.subplots(figsize=(8, 6))
stats.probplot(df['price'], dist="norm", plot=ax)
ax.set_title('Q-Q Plot for Product Prices')
plt.show()

In [None]:
# Q-Q plot of the outlier-free prices against a normal distribution.
fig, ax = plt.subplots(figsize=(8, 6))
stats.probplot(df_cleaned['price'], dist="norm", plot=ax)
ax.set_title('Q-Q Plot for Product Prices without outliers')
plt.show()