In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association


In [None]:
# NOTE: machine-specific absolute path; point it at the local copy of the CSV.
DATASET_CSV = "/Users/angie/Documents/Ironhack/labs/Unit5/lab1/amz_uk_price_prediction_dataset.csv"
data = pd.read_csv(DATASET_CSV)

In [None]:
# Show the loaded DataFrame as the cell output for a first visual inspection.
data

## Part 1: Analyzing Best-Seller Trends Across Product Categories
Objective: Understand the relationship between product categories and their best-seller status.

1. Crosstab Analysis:

- Create a crosstab between the product category and the isBestSeller status.

- Are there categories where being a best-seller is more prevalent?:
    - No. Non-best-sellers (`False`) are more frequent than best-sellers (`True`), and the overall total of `False` is much higher than the total of `True`.

Hint: one option is to calculate the proportion of best-sellers for each category and then sort the categories based on this proportion in descending order.

2. Statistical Tests:

- Conduct a Chi-square test to determine if the best-seller distribution is independent of the product category.
- Compute Cramér's V to understand the strength of association between best-seller status and category.
    - The p-value is lower than 0.05, so there is a statistically significant association between these two columns; however, the association is weak, since Cramér's V is closer to zero than to one.

3. Visualizations:

- Visualize the relationship between product categories and the best-seller status using a stacked bar chart.

In [None]:
# Contingency table: one row per category, columns False/True for isBestSeller,
# sorted by the number of non-best-sellers (largest first).
crosstab_table=pd.crosstab(data.category, data.isBestSeller).sort_values(by=[False], ascending=False)

# Share of best-sellers within each category, most prevalent first. Kept in a
# separate Series so crosstab_table stays a pure count table for the
# chi-square test and the stacked bar chart.
best_seller_share=(crosstab_table[True] / crosstab_table.sum(axis=1)).sort_values(ascending=False)

# Only the last expression of a cell is rendered automatically, so the
# intermediate results are shown explicitly with display().
display(crosstab_table.head(20))
display(best_seller_share.head(20))
crosstab_table.sum()

In [None]:
# Chi-square test of independence on the category x isBestSeller table;
# only the statistic and the p-value (the first two results) are kept.
chi2_result = chi2_contingency(crosstab_table)
chi2 = chi2_result[0]
pv = chi2_result[1]
chi2, pv

In [None]:
# Cramér's V: strength of association for the same contingency table.
cramers_v = association(crosstab_table, method="cramer")
cramers_v

In [None]:
# Stacked bar chart for the first 20 rows of the sorted contingency table.
crosstab_table.iloc[:20].plot.bar(stacked=True)

## Part 2: Exploring Product Prices and Ratings Across Categories and Brands
Objective: Investigate how different product categories influence product prices.

0. Preliminary Step: Remove outliers in product prices.

For this purpose, we can use the IQR (Interquartile Range) method. Products priced below the first quartile minus 1.5 times the IQR or above the third quartile plus 1.5 times the IQR will be considered outliers and removed from the dataset. The next steps will be done with the dataframe without outliers.

Hint: you can check the last Check For Understanding at the end of the lesson EDA Bivariate Analysis for a hint on how to do this.

1. Violin Plots:

- Use a violin plot to visualize the distribution of price across different product categories. Filter out the top 20 categories based on count for better visualization.
- Which product category tends to have the highest median price? Don't filter here by top categories.

    - Digital Cameras and Professional Medical Supplies tend to have the highest median prices, between 1,500 and 2,000 GBP.

2. Bar Charts:

- Create a bar chart comparing the average price of products for the top 10 product categories (based on count).
- Which product category commands the highest average price? Don't filter here by top categories.

    -  Digital Cameras commands the highest average price, followed by Professional Medical Supplies.

3. Box Plots:

- Visualize the distribution of product ratings based on their category using side-by-side box plots. Filter out the top 10 categories based on count for better visualization.
- Which category tends to receive the highest median rating from customers? Don't filter here by top categories.
    - The median rating is the same for several categories: office products, kids' toys, beer, wine and spirits, luxury drinks, and hobbies. These categories therefore share the highest median rating.

In [None]:
def tukeys_test_outliers(data, factor=1.5):
    """Return the observations of ``data`` that lie outside Tukey's fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to inspect.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the classic 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        The subset of ``data`` strictly below the lower fence or strictly
        above the upper fence, keeping the original index labels.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1

    # Fences beyond which an observation is considered an outlier.
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    is_outlier = (data < lower_bound) | (data > upper_bound)
    return data[is_outlier]

In [None]:
# Price observations flagged as outliers by the IQR rule.
outliers = data["price"].pipe(tukeys_test_outliers)
outliers

In [None]:
# Drop the outlier rows by index label.
# The previous `data.iloc[~outliers.index, :]` applied a bitwise NOT to the
# integer labels (~i == -i - 1), so iloc selected mirrored positions counted
# from the end of the frame: one row per outlier, not the outlier-free data.
# reset_index() (without drop=True) keeps the original row labels in an
# "index" column, which is dropped later before the correlation matrix is built.
df=data.drop(index=outliers.index).reset_index()
df

In [None]:
#Filtering by count the first 20:
# groupby(...).count() is ordered alphabetically by category, so a plain
# .head(20) picked the first 20 names rather than the 20 most frequent
# categories. nlargest(20) ranks by the count itself.
filtered=df.groupby('category')['price'].count().nlargest(20)
# Keep the category name for the top categories and the "None" marker otherwise.
df['filter_categories']=df['category'].where(df['category'].isin(filtered.index), "None")
df_filtered=df['filter_categories']!="None"
df[df_filtered]

In [None]:
#filtered violin plot
plt.figure(figsize=(30, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the x
# variable with legend=False keeps the same per-category colouring.
plot=sns.violinplot(data=df[df_filtered], x='category', y='price', hue='category', palette="coolwarm", legend=False)
plot.set_title("Price distribution for the top 20 categories by count (outliers removed)")
plt.xticks(rotation=90)
plot

In [None]:
#Violinplot without filtering:
plt.figure(figsize=(50, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the x
# variable with legend=False keeps the same per-category colouring.
plot_=sns.violinplot(data=df, x='category', y='price', hue='category', palette="coolwarm", legend=False)
plot_.set_title("Price distribution for all categories (outliers removed)")
plt.xticks(rotation=90)
plot_



In [None]:
#Filtering by count the first 10 for the barcharts:
# groupby(...).count() is ordered alphabetically by category, so a plain
# .head(10) picked the first 10 names rather than the 10 most frequent
# categories. nlargest(10) ranks by the count itself.
filtered_bar=df.groupby('category')['price'].count().nlargest(10)
# Keep the category name for the top categories and the "None" marker otherwise.
df['filter_categories']=df['category'].where(df['category'].isin(filtered_bar.index), "None")
df_filtered_bar=df['filter_categories']!="None"
df[df_filtered_bar]

In [None]:
#filtered barplot
# The bar height is seaborn's default estimator (the mean price per category).
# hue is mapped to the x variable with legend=False because seaborn >= 0.13
# deprecates `palette` without `hue`.
bar_top=sns.barplot(data=df[df_filtered_bar], x='category', y='price', hue='category', palette="coolwarm", legend=False)
bar_top.set_title("Average price for the top 10 categories by count (outliers removed)")
plt.xticks(rotation=90)

In [None]:
#without filter barplot
plt.figure(figsize=(50, 6))
# hue is mapped to the x variable with legend=False because seaborn >= 0.13
# deprecates `palette` without `hue`.
bar_plt=sns.barplot(data=df, x='category', y='price', hue='category', palette="coolwarm", legend=False)
bar_plt.set_title("Average price for all categories (outliers removed)")
plt.xticks(rotation=90)
bar_plt

In [None]:
#filtered boxplot
# Uses the top-10-by-count mask. hue is mapped to the x variable with
# legend=False because seaborn >= 0.13 deprecates `palette` without `hue`.
box_top=sns.boxplot(data=df[df_filtered_bar], x='category', y='stars', hue='category', palette="coolwarm", legend=False)
box_top.set_title("Rating distribution for the top 10 categories by count")
plt.xticks(rotation=90)

In [None]:
#Boxplot without filtering
plt.figure(figsize=(50, 6))
# hue is mapped to the x variable with legend=False because seaborn >= 0.13
# deprecates `palette` without `hue`.
boxplt=sns.boxplot(data=df, x='category', y='stars', hue='category', palette="coolwarm", legend=False)
boxplt.set_title("Rating distribution for all categories")
plt.xticks(rotation=90)
boxplt

## Part 3: Investigating the Interplay Between Product Prices and Ratings
Objective: Analyze how product ratings (stars) correlate with product prices.

1. Correlation Coefficients:

- Calculate the correlation coefficient between price and stars.
- Is there a significant correlation between product price and its rating?

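    - Pearson's coefficient is negative and indicates no meaningful linear correlation between the two variables (it is a magnitude close to zero, not the negative sign, that signals the absence of correlation). Spearman's coefficient is also negative, so there is no meaningful monotonic correlation either.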

2. Visualizations:

- Use a scatter plot to visualize the relationship between product rating and price. What patterns can you observe?

    - The highest-priced products have zero stars or no ratings, and products with a price of zero have been given more star ratings.
- Use a correlation heatmap to visualize correlations between all numerical variables.


- Examine if product prices typically follow a normal distribution using a QQ plot.


In [None]:
# Pearson (linear) correlation between price and rating.
correlation = df["price"].corr(df["stars"], method="pearson")
correlation

In [None]:
# Spearman (rank-based, monotonic) correlation between price and rating.
price_values, star_values = df["price"], df["stars"]
correlation_sp = price_values.corr(star_values, method="spearman")
correlation_sp

In [None]:
# Scatter of rating against price; the trailing ';' hides the Axes repr.
sns.scatterplot(x="price", y="stars", data=df);

In [None]:
# Same relationship with a fitted regression line on top of the scatter.
sns.lmplot(x="price", y="stars", data=df)

In [None]:
# Keep only numeric columns that take more than 20 distinct values.
numeric_part = df.select_dtypes("number")
has_many_values = numeric_part.nunique() > 20
categorical_numerical = numeric_part.loc[:, has_many_values]

# Exclude these columns from the correlation analysis ('index' is the column
# created by reset_index() when df was built).
numerical_col = categorical_numerical.drop(columns=['index', 'uid', 'boughtInLastMonth'])

correlation_matrix = numerical_col.corr()
correlation_matrix

In [None]:
# Annotated heatmap of the pairwise correlations computed above.
sns.heatmap(data=correlation_matrix, cmap="coolwarm", annot=True)

In [None]:
import statsmodels.api as sm

In [None]:
# QQ plot of price for the data with the outliers removed
# (line='s' draws the standardized reference line).
sm.qqplot(data=df['price'], line='s');

In [None]:
#complete data with outliers
# qqplot returns the Figure; without the trailing ';' the notebook renders it
# a second time as the cell's output value, duplicating the plot.
sm.qqplot(data['price'], line='s');