In [None]:
import pandas as pd
import numpy as np
# Raw listings export. The location is specific to one user's home directory,
# so adjust raw_csv_path when running the notebook elsewhere.
raw_csv_path = "~/code/aplabey/2nd_hand_fashion_valuation/raw_data/vestiaire.csv"
data = pd.read_csv(raw_csv_path)

In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it as a
# table; print() would fall back to the truncated plain-text repr.
data.head()

In [None]:
# Summary of the raw frame: column dtypes and non-null counts
data.info()

In [None]:
# List the column labels
data.columns

In [None]:
# Peek at the first values of the has_cross_border_fees column
data["has_cross_border_fees"].head()

In [None]:
# Number of rows that exactly repeat an earlier row
data.duplicated().sum()

In [None]:
# Missing values per column, worst offenders first
data.isna().sum().sort_values(ascending=False)

In [None]:
# Percentage of NaN for each column, highest first.
# The mean of the boolean mask is the missing fraction; multiplying by 100
# turns it into the percentage the heading promises.
data.isnull().mean().mul(100).sort_values(ascending=False)

In [None]:
# Group the listings by brand_name; the GroupBy object is reused in later cells
grouped_by_brand = data.groupby('brand_name')

In [None]:
# Number of rows per brand, largest first
brand_counts = grouped_by_brand.size().sort_values(ascending=False)
brand_counts

## TODO: discuss the brand-count threshold (is it useful? what size?)

In [None]:
# Brands with fewer than `threshold` rows are treated as rare
threshold = 50
brands_with_few_counts = brand_counts.loc[brand_counts.lt(threshold)]
brands_with_few_counts

In [None]:
# Map every brand to itself, except rare brands (below `threshold` rows),
# which are all mapped to the single label 'Others'.
brand_mapping = dict(
    (brand, 'Others' if count < threshold else brand)
    for brand, count in brand_counts.items()
)

In [None]:
# Collapse rare brands into 'Others'.
# The fillna keeps any label that is not a key of brand_mapping (notably
# 'Others' itself), so re-running this cell leaves the column unchanged instead
# of turning every 'Others' row into NaN. Missing brand names stay missing.
data['brand_name'] = data['brand_name'].map(brand_mapping).fillna(data['brand_name'])

In [None]:
# Recount rows per brand from the current brand_name column.
# NOTE: this rebinds brand_counts, the name the threshold / mapping cells read.
brand_counts = data['brand_name'].value_counts()
brand_counts.sort_values(ascending=False)

In [None]:
# Mean price per brand, most expensive first.
# Group on the current frame rather than reusing grouped_by_brand: that object
# was built before brand_name was rewritten, so its groups may still follow the
# old brand labels.
brand_price_mean = (
    data.groupby('brand_name')['price_usd']
    .mean()
    .sort_values(ascending=False)
)
brand_price_mean

In [None]:
# Overall spread of listing prices
data.boxplot(column=["price_usd"]);

In [None]:
# Keep only the rows that belong to the 20 most frequent brands
top_20_brands = data['brand_name'].value_counts().index[:20]
filtered_data = data.loc[data['brand_name'].isin(top_20_brands)]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Price distribution per brand for the top-20 subset
plt.figure(figsize=(15, 10))
sns.boxplot(data=filtered_data, x='brand_name', y='price_usd').set(
    title='Boxplot of Prices by Top 20 Most Common Brands',
    xlabel='Brand',
    ylabel='Price',
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Keep only listings priced strictly below PRICE_CAP_USD so that a handful of
# extreme prices do not flatten the per-brand boxes.
PRICE_CAP_USD = 20_000
boolean_mask = filtered_data['price_usd'] < PRICE_CAP_USD

# Apply the boolean filtering
filtered_data_boolean = filtered_data[boolean_mask].reset_index(drop=True)

# Visualize the boxplot again; the title states the cap so this figure cannot
# be mistaken for the unfiltered one.
plt.figure(figsize=(15, 10))
sns.boxplot(x='brand_name', y='price_usd', data=filtered_data_boolean)
plt.xticks(rotation=45)
plt.title(f'Boxplot of Prices by Top 20 Most Common Brands (price < {PRICE_CAP_USD:,} USD)')
plt.xlabel('Brand')
plt.ylabel('Price')
plt.show()

## Dropping Data

In [None]:
from sklearn.impute import SimpleImputer

# Step 1: Calculate the overall most frequent value of 'usually_ships_within'.
# It is the fallback for groups in which every value is missing.
overall_most_frequent_value = data['usually_ships_within'].mode()[0]

# Step 2: Define the imputation function
def impute_most_frequent(group, column='usually_ships_within', fallback=None):
    """Fill the missing values of ``column`` with the group's most frequent value.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group (one brand in this notebook).
    column : str, default 'usually_ships_within'
        Column whose missing values are filled.
    fallback : scalar, optional
        Value written to every row when the group has no observed value at
        all. When omitted, the notebook-level ``overall_most_frequent_value``
        is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with ``column`` imputed. ``group`` itself is not
        modified, so the function is safe to use inside groupby operations.
    """
    imputed = group.copy()
    # mode() skips missing values and returns its result sorted, so the first
    # entry is the most frequent value, the smallest one on ties. This is the
    # same pick SimpleImputer(strategy="most_frequent") makes.
    observed_modes = imputed[column].mode()
    if observed_modes.empty:
        # Nothing observed in this group: every row receives the fallback.
        if fallback is None:
            fallback = overall_most_frequent_value
        imputed[column] = fallback
    else:
        imputed[column] = imputed[column].fillna(observed_modes.iloc[0])
    return imputed

# Step 3: Impute brand by brand and stitch the pieces back together.
# Iterating over the groups (instead of groupby.apply) hands the function each
# brand's full sub-frame, 'brand_name' included, on every pandas version;
# relying on apply to pass the grouping column is deprecated since pandas 2.2.
# As with the default groupby behaviour, rows whose brand_name is missing
# belong to no group and are therefore left out of the result.
data = pd.concat(
    [impute_most_frequent(brand_rows.copy()) for _, brand_rows in data.groupby('brand_name')],
    ignore_index=True,
)



In [None]:
# Counting the number of NaN for each column (re-check after the imputation)
data.isnull().sum().sort_values(ascending=False)

In [None]:
# Remove the two fee columns. Rebinding the result with errors='ignore' keeps
# the cell re-runnable: a second run is a no-op instead of raising KeyError
# because the columns are already gone.
data = data.drop(columns=['buyers_fees', 'has_cross_border_fees'], errors='ignore')

In [None]:
# Preview the frame after the fee columns were removed
data.head()

In [None]:
# Drop every row that still has at least one missing value, then confirm that
# no column reports any NaN.
data_cleaned = data.dropna(axis=0, how='any')
data_cleaned.isna().sum().sort_values(ascending=False)

In [None]:
# Column dtypes and non-null counts of the cleaned frame
data_cleaned.info()

## Preprocessing

### Product_like_count
#### As it is very skewed, I chose RobustScaler

#### Feature Transformation/Engineering

If your feature is extremely skewed  
→
  consider Feature Engineering first (e.g. log(feature))
Shall we do this?

# The following code is just how we would do it without brands

In [None]:
# Boxplot of product_like_count to inspect its spread and outliers
sns.boxplot(data=data_cleaned, x='product_like_count');
plt.show()

In [None]:
# Restrict the cleaned data to its 20 most frequent brands. filtered_data is
# rebound here and reused by the histogram cells further down.
top_brands = data_cleaned['brand_name'].value_counts().index[:20]
filtered_data = data_cleaned.loc[data_cleaned['brand_name'].isin(top_brands)]

# Per-brand histogram (with KDE overlay) of product_like_count
plt.figure(figsize=(12, 8))
likes_hist = sns.histplot(
    data=filtered_data,
    x='product_like_count',
    hue='brand_name',
    bins=200,
    kde=True,
)
likes_hist.set(
    title='Distribution of Product Like Count for Top 20 Brands',
    xlabel='Product Like Count',
    ylabel='Frequency',
)
plt.show()

In [None]:
from sklearn.preprocessing import RobustScaler

# Renumber the rows 0..n-1 and discard the previous index, so brand_name (and
# everything else) lives in columns only.
data_cleaned = data_cleaned.reset_index(drop=True)

# Function to apply RobustScaler to each group
def scale_group(group, column='product_like_count'):
    """Robust-scale one column of a group and return the result as a new frame.

    A fresh ``RobustScaler`` is fitted on ``group[column]`` alone, so the
    scaling statistics come from this group only.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group (one brand in this notebook).
    column : str, default 'product_like_count'
        Name of the column to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` whose ``column`` holds the scaled values; the
        frame passed in is left untouched.
    """
    scaled = group.copy()
    # Double brackets: the scaler is fitted on a 2-D, single-column selection.
    scaled[column] = RobustScaler().fit_transform(scaled[[column]])
    return scaled

# Scale product_like_count within each brand.
# transform hands scale_group one brand's values at a time and returns the
# result aligned to the original rows. Unlike groupby.apply it does not rely
# on the grouping column being passed to the function (deprecated since
# pandas 2.2), so 'brand_name' always stays in the frame.
data_cleaned = data_cleaned.assign(
    product_like_count=data_cleaned.groupby('brand_name')['product_like_count'].transform(
        lambda values: scale_group(values.to_frame('product_like_count'))['product_like_count']
    )
).reset_index(drop=True)

# Display the first few rows of the transformed data
data_cleaned.head()

### Seller_products_sold

In [None]:
# Per-brand histogram (with KDE overlay) of seller_products_sold, drawn from
# the top-brand subset in filtered_data.
plt.figure(figsize=(12, 8))
products_sold_hist = sns.histplot(
    data=filtered_data,
    x='seller_products_sold',
    hue='brand_name',
    bins=200,
    kde=True,
)
products_sold_hist.set(
    title='Distribution of Products sold per Seller for Top 20 Brands',
    xlabel='Products sold per Seller',
    ylabel='Frequency',
)
plt.show()

In [None]:
def scale_group(group, column='seller_products_sold'):
    """Robust-scale one column of a group and return the result as a new frame.

    A fresh ``RobustScaler`` is fitted on ``group[column]`` alone, so the
    scaling statistics come from this group only.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group (one brand in this notebook).
    column : str, default 'seller_products_sold'
        Name of the column to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` whose ``column`` holds the scaled values; the
        frame passed in is left untouched.
    """
    scaled = group.copy()
    # Double brackets: the scaler is fitted on a 2-D, single-column selection.
    scaled[column] = RobustScaler().fit_transform(scaled[[column]])
    return scaled

# Scale seller_products_sold within each brand.
# transform hands scale_group one brand's values at a time and returns the
# result aligned to the original rows. Unlike groupby.apply it does not rely
# on the grouping column being passed to the function (deprecated since
# pandas 2.2), so 'brand_name' always stays in the frame.
data_cleaned = data_cleaned.assign(
    seller_products_sold=data_cleaned.groupby('brand_name')['seller_products_sold'].transform(
        lambda values: scale_group(values.to_frame('seller_products_sold'))['seller_products_sold']
    )
).reset_index(drop=True)

# Display the first few rows of the transformed data
data_cleaned.head()

### Seller_num_products_listed 

In [None]:
# Per-brand histogram (with KDE overlay) of seller_num_products_listed, drawn
# from the top-brand subset in filtered_data.
plt.figure(figsize=(12, 8))
products_listed_hist = sns.histplot(
    data=filtered_data,
    x='seller_num_products_listed',
    hue='brand_name',
    bins=200,
    kde=True,
)
products_listed_hist.set(
    title='Distribution of Products listed per Seller for Top 20 Brands',
    xlabel='Products listed per Seller',
    ylabel='Frequency',
)
plt.show()

In [None]:
def scale_group(group, column='seller_num_products_listed'):
    """Robust-scale one column of a group and return the result as a new frame.

    A fresh ``RobustScaler`` is fitted on ``group[column]`` alone, so the
    scaling statistics come from this group only.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group (one brand in this notebook).
    column : str, default 'seller_num_products_listed'
        Name of the column to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` whose ``column`` holds the scaled values; the
        frame passed in is left untouched.
    """
    scaled = group.copy()
    # Double brackets: the scaler is fitted on a 2-D, single-column selection.
    scaled[column] = RobustScaler().fit_transform(scaled[[column]])
    return scaled

# Scale seller_num_products_listed within each brand.
# transform hands scale_group one brand's values at a time and returns the
# result aligned to the original rows. Unlike groupby.apply it does not rely
# on the grouping column being passed to the function (deprecated since
# pandas 2.2), so 'brand_name' always stays in the frame.
data_cleaned = data_cleaned.assign(
    seller_num_products_listed=data_cleaned.groupby('brand_name')['seller_num_products_listed'].transform(
        lambda values: scale_group(values.to_frame('seller_num_products_listed'))['seller_num_products_listed']
    )
).reset_index(drop=True)

# Display the first few rows of the transformed data
data_cleaned.head()

### Seller_community_rank

In [None]:
# Per-brand histogram (with KDE overlay) of seller_community_rank, drawn from
# the top-brand subset in filtered_data.
plt.figure(figsize=(12, 8))
community_rank_hist = sns.histplot(
    data=filtered_data,
    x='seller_community_rank',
    hue='brand_name',
    bins=200,
    kde=True,
)
community_rank_hist.set(
    title='Sellers community rank for Top 20 Brands',
    xlabel='Community Rank per Seller',
    ylabel='Frequency',
)
plt.show()

In [None]:
def scale_group(group, column='seller_community_rank'):
    """Robust-scale one column of a group and return the result as a new frame.

    A fresh ``RobustScaler`` is fitted on ``group[column]`` alone, so the
    scaling statistics come from this group only.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group (one brand in this notebook).
    column : str, default 'seller_community_rank'
        Name of the column to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` whose ``column`` holds the scaled values; the
        frame passed in is left untouched.
    """
    scaled = group.copy()
    # Double brackets: the scaler is fitted on a 2-D, single-column selection.
    scaled[column] = RobustScaler().fit_transform(scaled[[column]])
    return scaled

# Scale seller_community_rank within each brand.
# transform hands scale_group one brand's values at a time and returns the
# result aligned to the original rows. Unlike groupby.apply it does not rely
# on the grouping column being passed to the function (deprecated since
# pandas 2.2), so 'brand_name' always stays in the frame.
data_cleaned = data_cleaned.assign(
    seller_community_rank=data_cleaned.groupby('brand_name')['seller_community_rank'].transform(
        lambda values: scale_group(values.to_frame('seller_community_rank'))['seller_community_rank']
    )
).reset_index(drop=True)

# Display the first few rows of the transformed data
data_cleaned.head()

### Seller_num_followers

In [None]:
# Per-brand histogram (with KDE overlay) of seller_num_followers, drawn from
# the top-brand subset in filtered_data.
plt.figure(figsize=(12, 8))
num_followers_hist = sns.histplot(
    data=filtered_data,
    x='seller_num_followers',
    hue='brand_name',
    bins=200,
    kde=True,
)
num_followers_hist.set(
    title='Number of Followers per Seller for Top 20 Brands',
    xlabel='Follower per Seller',
    ylabel='Frequency',
)
plt.show()

In [None]:
def scale_group(group, column='seller_num_followers'):
    """Robust-scale one column of a group and return the result as a new frame.

    A fresh ``RobustScaler`` is fitted on ``group[column]`` alone, so the
    scaling statistics come from this group only.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group (one brand in this notebook).
    column : str, default 'seller_num_followers'
        Name of the column to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` whose ``column`` holds the scaled values; the
        frame passed in is left untouched.
    """
    scaled = group.copy()
    # Double brackets: the scaler is fitted on a 2-D, single-column selection.
    scaled[column] = RobustScaler().fit_transform(scaled[[column]])
    return scaled

# Scale seller_num_followers within each brand.
# transform hands scale_group one brand's values at a time and returns the
# result aligned to the original rows. Unlike groupby.apply it does not rely
# on the grouping column being passed to the function (deprecated since
# pandas 2.2), so 'brand_name' always stays in the frame.
data_cleaned = data_cleaned.assign(
    seller_num_followers=data_cleaned.groupby('brand_name')['seller_num_followers'].transform(
        lambda values: scale_group(values.to_frame('seller_num_followers'))['seller_num_followers']
    )
).reset_index(drop=True)

# Display the first few rows of the transformed data
data_cleaned.head()

### Seller_pass_rate

In [None]:
# Per-brand histogram (with KDE overlay) of seller_pass_rate, drawn from the
# top-brand subset in filtered_data.
plt.figure(figsize=(12, 8))
pass_rate_hist = sns.histplot(
    data=filtered_data,
    x='seller_pass_rate',
    hue='brand_name',
    bins=200,
    kde=True,
)
pass_rate_hist.set(
    title='Pass Rate per Seller for Top 20 Brands',
    xlabel='Pass Rate per Seller',
    ylabel='Frequency',
)
plt.show()

In [None]:
def scale_group(group, column='seller_pass_rate'):
    """Robust-scale one column of a group and return the result as a new frame.

    A fresh ``RobustScaler`` is fitted on ``group[column]`` alone, so the
    scaling statistics come from this group only.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group (one brand in this notebook).
    column : str, default 'seller_pass_rate'
        Name of the column to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` whose ``column`` holds the scaled values; the
        frame passed in is left untouched.
    """
    scaled = group.copy()
    # Double brackets: the scaler is fitted on a 2-D, single-column selection.
    scaled[column] = RobustScaler().fit_transform(scaled[[column]])
    return scaled

# Scale seller_pass_rate within each brand.
# transform hands scale_group one brand's values at a time and returns the
# result aligned to the original rows. Unlike groupby.apply it does not rely
# on the grouping column being passed to the function (deprecated since
# pandas 2.2), so 'brand_name' always stays in the frame.
data_cleaned = data_cleaned.assign(
    seller_pass_rate=data_cleaned.groupby('brand_name')['seller_pass_rate'].transform(
        lambda values: scale_group(values.to_frame('seller_pass_rate'))['seller_pass_rate']
    )
).reset_index(drop=True)

# Display the first few rows of the transformed data
data_cleaned.head()

In [None]:
# Preview the frame once more after the per-brand scaling cells above
data_cleaned.head()

### Product_gender_target

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Check the unique values of product_gender_target
print(f"The unique values for 'Product_gender_target' are {data_cleaned.product_gender_target.unique()}")

# Instantiate the OneHotEncoder; sparse_output=False makes it return a dense array
ohe = OneHotEncoder(sparse_output = False)

# Fit encoder
ohe.fit(data_cleaned[['product_gender_target']])

# Display the detected categories
print(f"The categories detected by the OneHotEncoder are {ohe.categories_}")

# Encode "product_gender_target" and add one new column per encoder feature name
data_cleaned[ohe.get_feature_names_out()] = ohe.transform(data_cleaned[['product_gender_target']])

# Drop the original "product_gender_target" column, which has now been encoded.
# NOTE(review): re-running this cell fails, because the column is gone afterwards.
data_cleaned = data_cleaned.drop(columns = ["product_gender_target"])

In [None]:
# Show the first three rows, including the new one-hot columns
data_cleaned.head(3)

## Observing the Correlation

In [None]:
%matplotlib inline

# Standard import for matplotlib
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric columns, drawn as a heatmap
correlation_matrix = data_cleaned.select_dtypes(include='number').corr()
column_names = correlation_matrix.columns
sns.heatmap(
    data=correlation_matrix,
    xticklabels=column_names,
    yticklabels=column_names,
    cmap="bwr",
)
plt.show()

In [None]:
# Long format: one row per (feature_1, feature_2) pair with its correlation
corr_df = correlation_matrix.stack().reset_index()
corr_df.columns = ['feature_1', 'feature_2', 'correlation']

# Discard the diagonal, i.e. each feature paired with itself
no_self_correlation = corr_df['feature_1'].ne(corr_df['feature_2'])
corr_df = corr_df[no_self_correlation]

In [None]:
# Compute the absolute correlation. assign builds a new frame, which avoids
# writing a column into the filtered slice that corr_df currently is (the
# source of a SettingWithCopyWarning).
corr_df = corr_df.assign(absolute_correlation=corr_df['correlation'].abs())

# Show the 15 most correlated feature pairs. Every pair appears twice in the
# stacked matrix, as (a, b) and as (b, a), hence 15 * 2 rows.
corr_df.sort_values(by="absolute_correlation", ascending=False).head(15*2)