In [1]:
import pandas as pd
import numpy as np
import sweetviz as sv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preparación de los datos

## Se carga el dataset

In [2]:
# Load the raw dataset from a CSV file addressed relative to the working directory.
original = pd.read_csv("../data/original.csv")

In [3]:
# Work on a copy so that `original` keeps the data exactly as it was read.
full_dataset = original.copy()

## Comprobación de los datos

In [27]:
# Number of missing values in each column.
full_dataset.isna().sum()

Order Date      0
Brand           0
Sneaker Name    0
Sale Price      0
Retail Price    0
Release Date    0
Shoe Size       0
Buyer Region    0
dtype: int64

In [28]:
# Dtype of each column as read from the CSV.
full_dataset.dtypes

Order Date       object
Brand            object
Sneaker Name     object
Sale Price       object
Retail Price     object
Release Date     object
Shoe Size       float64
Buyer Region     object
dtype: object

## Limpieza del dataset

### Limpieza de la variable "Sneaker Name"

In [4]:
def clean_sneakers(original_data):
    """Parse every "Sneaker Name" into structured attributes and join them back.

    Each distinct value of the "Sneaker Name" column is split on "-" and its
    tokens are classified against fixed vocabularies (brand, sub-brand,
    product line, model, version, height, collaboration, year, colour).
    Tokens that match no vocabulary are recorded as colour tokens.

    Parameters
    ----------
    original_data : pandas.DataFrame
        Frame with a "Sneaker Name" column of strings. An existing "Brand"
        column is replaced by the parsed one. The frame itself is left
        unmodified.

    Returns
    -------
    pandas.DataFrame
        All rows of ``original_data`` (left join, "Sneaker Name" values
        unchanged) without the original "Brand" column, followed by the parsed
        columns 'Brand', 'Sub-brand', 'Product Line', 'Model', 'Version',
        'Height', 'Collaboration', 'Color(s)' (a list of strings) and 'Year'.
    """
    # Fixed vocabularies used to classify the tokens of a name.
    brands = ['Nike', 'Adidas']
    sub_brands = ['Yeezy', 'Air Jordan']
    sub_brand_to_brand = {
        'Yeezy': 'Adidas',
        'Air Jordan': 'Nike'
    }
    collaborations = ['Off-White', 'Off White', 'Virgil Abloh']
    product_lines = [
        'Boost',
        'Air Max',
        'Air Presto',
        'Air VaporMax',
        'Air Force 1',
        'Blazer Mid',
        'Zoom Fly',
        'React Hyperdunk',
        'Hyperdunk',
        'Mercurial',
        'Retro',
        'Flyknit'
    ]
    models = ['350', '90', '97', '1']
    versions = ['V2', 'V3', '2pt0']
    heights = ['Low', 'Mid', 'High']
    years = ['2015', '2016', '2017', '2018']
    colors = [
        'Beluga', 'Core Black Copper', 'Core Black Green', 'Core Black Red', 'Core Black White',
        'Cream White', 'Zebra', 'Moonrock', 'Pirate Black', 'Oxford Tan', 'Turtledove',
        'Semi Frozen Yellow', 'Blue Tint', 'Black', 'Desert Ore', 'Elemental Rose Queen',
        'All Hallows Eve', 'Grim Reaper', 'Sesame', 'Wolf Grey', 'Menta', 'Black Silver',
        'Pink', 'Volt', 'Butter', 'Static', 'Static Reflective', 'Chicago', 'University Blue',
        'White', 'Black-White', 'Black-Silver', 'Total-Orange'
    ]

    # Column order of the parsed lookup table (and of the appended columns).
    output_columns = [
        'Sneaker Name', 'Brand', 'Sub-brand', 'Product Line', 'Model',
        'Version', 'Height', 'Collaboration', 'Color(s)', 'Year'
    ]

    def assign_tokens(tokens):
        """Classify the tokens of one sneaker name into a dict of attributes."""
        data = {
            'Sneaker Name': None,
            'Brand': None,
            'Sub-brand': None,
            'Product Line': None,
            'Model': None,
            'Version': None,
            'Height': None,
            'Collaboration': None,
            'Color(s)': [],
            'Year': None
        }
        i = 0
        while i < len(tokens):
            token = tokens[i]

            # Two-token look-ahead. Only pairs are tried, so vocabulary entries
            # made of three words (e.g. 'Air Force 1') never match as a unit.
            if i < len(tokens) - 1:
                next_token = tokens[i + 1]
                combined_token = f"{token} {next_token}"
                combined_token_hyphen = f"{token}-{next_token}"  # e.g. 'Black-White'

                # Hyphenated colour names are checked first.
                if combined_token_hyphen in colors:
                    data['Color(s)'].append(combined_token_hyphen)
                    i += 2
                    continue

                # Space-joined pairs, in a fixed priority order.
                if combined_token in sub_brands:
                    data['Sub-brand'] = combined_token
                    # A two-word sub-brand overrides any brand found so far.
                    data['Brand'] = sub_brand_to_brand.get(combined_token, data['Brand'])
                    i += 2
                    continue
                if combined_token in product_lines:
                    data['Product Line'] = combined_token
                    i += 2
                    continue
                if combined_token in collaborations:
                    data['Collaboration'] = combined_token
                    i += 2
                    continue
                if combined_token in versions:
                    data['Version'] = combined_token
                    i += 2
                    continue
                if combined_token in colors:
                    data['Color(s)'].append(combined_token)
                    i += 2
                    continue
                if combined_token in heights:
                    data['Height'] = combined_token
                    i += 2
                    continue

            # Single-token classification.
            if token in brands:
                data['Brand'] = token
            elif token in sub_brands:
                data['Sub-brand'] = token
                # Derive the brand only when none has been seen yet.
                if not data['Brand']:
                    data['Brand'] = sub_brand_to_brand.get(token, data['Brand'])
            elif token in product_lines:
                data['Product Line'] = token
            elif token in models:
                data['Model'] = token
            elif token in versions:
                data['Version'] = token
            elif token in heights:
                data['Height'] = token
            elif token in collaborations:
                data['Collaboration'] = token
            elif token in years:
                data['Year'] = token
            else:
                # Known colours and unrecognised tokens alike are stored as colour.
                data['Color(s)'].append(token)
            i += 1

        # Fall back to the sub-brand's owner when no brand token was present.
        if not data['Brand'] and data['Sub-brand']:
            data['Brand'] = sub_brand_to_brand.get(data['Sub-brand'], None)

        return data

    # Parse each distinct name once. The name is normalised ('adidas' ->
    # 'Adidas') only for parsing; the lookup table is keyed by the ORIGINAL
    # spelling so that the merge below matches every row. Keying by the
    # normalised spelling left all lowercase-'adidas' rows without a match.
    parsed_data = []
    for product in original_data["Sneaker Name"].dropna().unique():
        parsed = assign_tokens(product.replace('adidas', 'Adidas').split('-'))
        parsed['Sneaker Name'] = product
        parsed_data.append(parsed)

    df = pd.DataFrame(parsed_data, columns=output_columns)

    # Replace the raw "Brand" column without mutating the caller's frame.
    base = original_data.drop(columns=['Brand'], errors='ignore')

    # Attach the parsed attributes to every row by "Sneaker Name".
    merged_data = pd.merge(base, df, on="Sneaker Name", how="left")

    return merged_data

### Preparación del dataset

In [5]:
def prepare_dataset(data):
    """Convert the price and date columns to numeric and datetime dtypes.

    "Sale Price" and "Retail Price" are turned into floats after removing
    "$" and "," characters. "Order Date" and "Release Date" are parsed with
    the format ``%m/%d/%y``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the four columns named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the four columns converted.
    """
    # Work on a copy so the caller's frame keeps its raw values.
    prepared = data.copy()

    for column in ('Sale Price', 'Retail Price'):
        prepared[column] = (
            prepared[column]
            .astype(str)
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
            .astype(float)
        )

    for column in ('Order Date', 'Release Date'):
        prepared[column] = pd.to_datetime(prepared[column], format='%m/%d/%y')

    return prepared

### Análisis de los datos usando Sweetviz

In [7]:
# Parsed dataset used for the exploratory report.
full_dataset_sv = clean_sneakers(prepare_dataset(full_dataset.copy()))

# Flatten each colour list into one comma-separated string; any value that is
# not a list becomes an empty string.
full_dataset_sv['Color(s)'] = full_dataset_sv['Color(s)'].map(
    lambda colors: ', '.join(map(str, colors)) if isinstance(colors, list) else ''
)

# Analyse every feature against the sale price and write the HTML report.
my_report = sv.analyze(full_dataset_sv, target_feat="Sale Price")
my_report.show_html("reports/report_general.html")

                                             |                                             | [  0%]   00:00 ->…

Report reports/report_general.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


#### Se guarda el dataset para usarlo en los gráficos

In [58]:
# Independent copy of the report dataset, exported in the next cell.
df = full_dataset_sv.copy()

In [59]:
# Write the dataset to an Excel file, leaving out the index column.
df.to_excel('../data/data_graphics.xlsx', index=False)

### Limpieza del dataset simple

In [12]:
def clean_dataset_simple(data, data_scaler, dtype):
    """Build the "simple" feature matrix from a prepared dataset.

    Steps: keyword flags derived from "Sneaker Name", calendar parts and epoch
    seconds from the two date columns, one-hot encoding of the categorical
    columns, and standardisation of the numerical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Prepared frame to transform. "Order Date" and "Release Date" must be
        datetime columns. The frame is not modified.
    data_scaler : pandas.DataFrame
        Frame whose numerical columns are used to fit the ``StandardScaler``.
    dtype : str
        When equal to ``"test"`` the "Sale Price" column is standardised as
        well; for any other value it is left untouched.
        NOTE(review): confirm that scaling the target only for "test" is the
        intended behaviour.

    Returns
    -------
    pandas.DataFrame
        The transformed frame, without the "Sneaker Name" column.
    """
    categorical_cols = ['Brand', 'Buyer Region']
    numerical_cols = ['Sale Price', 'Retail Price', 'Shoe Size']

    if dtype != "test":
        numerical_cols.remove("Sale Price")

    # One 0/1 column per keyword; the column name is the keyword itself.
    keywords = [
        "Boost", "Jordan", "V2", "Core", "Low", "350", "Adidas", "Nike",
        "Air", "Zoom", "Blazer", "Retro", "Force", "Max", "Black", "Vapormax",
    ]

    # Copy first so that the caller's frame is never modified.
    data = data.copy()

    # Split every name once. Tokens are compared case-insensitively, which
    # makes 'adidas'/'Adidas' equivalent and lets the "Vapormax" flag match
    # the 'VaporMax' spelling (a case-sensitive test could never match it).
    token_sets = (
        data["Sneaker Name"]
        .str.lower()
        .str.split("-")
        .map(lambda parts: set(parts) if isinstance(parts, list) else set())
    )
    for keyword in keywords:
        data[keyword] = token_sets.map(
            lambda tokens, key=keyword.lower(): int(key in tokens)
        )

    data = data.drop(columns="Sneaker Name")

    # Dates: calendar parts plus seconds since the Unix epoch. The epoch value
    # is computed by subtraction so it does not depend on the resolution
    # (ns/us/s) of the datetime column.
    epoch = pd.Timestamp("1970-01-01")
    one_second = pd.Timedelta(seconds=1)
    for prefix in ("Order", "Release"):
        stamps = data[f"{prefix} Date"]
        data[f"{prefix} Year"] = stamps.dt.year
        data[f"{prefix} Month"] = stamps.dt.month
        data[f"{prefix} Day"] = stamps.dt.day
        data[f"{prefix} Day of Week"] = stamps.dt.dayofweek
        data[f"{prefix} Date"] = (stamps - epoch) // one_second

    # Categorical variables
    data = pd.get_dummies(data, columns=categorical_cols)

    # Numerical variables: statistics come from `data_scaler`, not from `data`.
    scaler = StandardScaler()
    scaler.fit(data_scaler[numerical_cols])
    data[numerical_cols] = scaler.transform(data[numerical_cols])
    return data

In [13]:
# Prepare the raw data, then build the simple feature matrix; the scaler is
# fitted on a separate copy of the same prepared frame.
full_dataset0 = prepare_dataset(full_dataset.copy())
full_dataset1 = clean_dataset_simple(full_dataset0.copy(), full_dataset0.copy(), "train")

# Target is the sale price; every other column is a feature.
y = full_dataset1["Sale Price"]
X = full_dataset1.drop(columns="Sale Price")

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

### Limpieza del dataset complejo

In [56]:
def clean_dataset(data):
    """Build the "complex" feature matrix from the output of ``clean_sneakers``.

    Steps: drop "Buyer Region", "Sneaker Name" and "Year"; derive calendar
    parts and epoch seconds from the two date columns; one-hot encode the
    parsed categorical columns; expand the "Color(s)" lists into one 0/1
    column per colour; standardise "Retail Price" and "Shoe Size".
    "Sale Price" is passed through unscaled.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with datetime "Order Date" / "Release Date" columns, the parsed
        categorical columns and a "Color(s)" column of lists (values that are
        not lists count as "no colour"). The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The transformed frame. Colour columns are named ``Color_<colour>``,
        hold int64 0/1 values and are appended in alphabetical order.
    """
    categorical_cols = ['Brand', 'Sub-brand', 'Product Line', 'Model', 'Version', 'Height', 'Collaboration']
    numerical_cols = ['Retail Price', 'Shoe Size']

    # Drop unused columns (returns a new frame; the input stays untouched).
    data = data.drop(columns=["Buyer Region", "Sneaker Name", "Year"])

    # Dates: calendar parts plus seconds since the Unix epoch. The epoch value
    # is computed by subtraction so it does not depend on the resolution
    # (ns/us/s) of the datetime column.
    epoch = pd.Timestamp("1970-01-01")
    one_second = pd.Timedelta(seconds=1)
    for prefix in ("Order", "Release"):
        stamps = data[f"{prefix} Date"]
        data[f"{prefix} Year"] = stamps.dt.year
        data[f"{prefix} Month"] = stamps.dt.month
        data[f"{prefix} Day"] = stamps.dt.day
        data[f"{prefix} Day of Week"] = stamps.dt.dayofweek
        data[f"{prefix} Date"] = (stamps - epoch) // one_second

    # Categorical variables
    data = pd.get_dummies(data, columns=categorical_cols)

    # Colours: build the whole indicator matrix at once instead of writing
    # cell by cell through iterrows(). Sorting the colour names makes the
    # column order reproducible (set iteration order changes between kernels).
    color_lists = [
        colors if isinstance(colors, list) else []
        for colors in data['Color(s)']
    ]
    unique_colors = sorted({color for colors in color_lists for color in colors})
    column_of = {color: position for position, color in enumerate(unique_colors)}

    indicators = np.zeros((len(data), len(unique_colors)), dtype=np.int64)
    for row, colors in enumerate(color_lists):
        for color in colors:
            indicators[row, column_of[color]] = 1

    color_frame = pd.DataFrame(
        indicators,
        index=data.index,
        columns=['Color_' + color for color in unique_colors],
    )
    data = pd.concat([data.drop(columns='Color(s)'), color_frame], axis=1)

    # Numerical variables: these columns have not been altered above, so the
    # scaler can be fitted and applied on the same values in one step.
    data[numerical_cols] = StandardScaler().fit_transform(data[numerical_cols])
    return data

In [57]:
# Prepare and parse the raw data, then build the complex feature matrix.
full_dataset0 = clean_sneakers(prepare_dataset(full_dataset.copy()))
full_dataset1 = clean_dataset(full_dataset0.copy())

# Target is the sale price; every other column is a feature.
y = full_dataset1["Sale Price"]
X = full_dataset1.drop(columns="Sale Price")

# First hold out 20% as the test set, then 20% of the remainder for validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, random_state=42, test_size=0.2
)

In [25]:
# Summary statistics of the sale price before feature engineering.
full_dataset0["Sale Price"].describe()

count    99956.000000
mean       446.634719
std        255.982969
min        186.000000
25%        275.000000
50%        370.000000
75%        540.000000
max       4050.000000
Name: Sale Price, dtype: float64

In [35]:
# Summary statistics of the target in the full training split.
y_train_full.describe()

count    79964.000000
mean        -0.002639
std          1.001645
min         -1.018177
25%         -0.670496
50%         -0.299376
75%          0.364734
max         14.076653
Name: Sale Price, dtype: float64

## Comprobar la distribución de los datasets

In [10]:
# Re-attach the target to each split and keep only int64/float64 columns.
train_sv = pd.concat([X_train_full, y_train_full], axis=1).select_dtypes(include=['int64', 'float64'])
test_sv = pd.concat([X_test, y_test], axis=1).select_dtypes(include=['int64', 'float64'])

# Compare the train and test distributions, using the sale price as target.
my_report = sv.compare([train_sv, "Train"], [test_sv, "Test"], target_feat="Sale Price")
my_report.show_html("reports/report_train_test.html")

                                             |                                             | [  0%]   00:00 ->…

Report reports/report_train_test.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [11]:
# Re-attach the target to each split and keep only int64/float64 columns.
train_sv = pd.concat([X_train, y_train], axis=1).select_dtypes(include=['int64', 'float64'])
val_sv = pd.concat([X_val, y_val], axis=1).select_dtypes(include=['int64', 'float64'])

# Compare the train and validation distributions, using the sale price as target.
my_report = sv.compare([train_sv, "Train"], [val_sv, "Val"], target_feat="Sale Price")
my_report.show_html("reports/report_train_val.html")

                                             |                                             | [  0%]   00:00 ->…

Report reports/report_train_val.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Guardar Datasets

In [58]:
# Write every split to its own CSV file, without the index column.
for frame, path in [
    (X_train_full, '../data/X_train_full.csv'),
    (y_train_full, '../data/y_train_full.csv'),
    (X_test, '../data/X_test.csv'),
    (y_test, '../data/y_test.csv'),
    (X_train, '../data/X_train.csv'),
    (y_train, '../data/y_train.csv'),
    (X_val, '../data/X_val.csv'),
    (y_val, '../data/y_val.csv'),
]:
    frame.to_csv(path, index=False)