In [1]:
import pandas as pd
import numpy as np
import sweetviz as sv
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preparación de los datos

## Se carga el dataset

In [2]:
# Load the source CSV; the path is relative to the notebook's working directory.
original = pd.read_csv("../data/original.csv")

In [3]:
# Work on a copy so `original` keeps the data exactly as it was loaded.
full_dataset = original.copy()

## Comprobación de los datos

In [4]:
# Count missing values per column.
full_dataset.isna().sum()

Order Date      0
Brand           0
Sneaker Name    0
Sale Price      0
Retail Price    0
Release Date    0
Shoe Size       0
Buyer Region    0
dtype: int64

In [5]:
# Show the dtype pandas assigned to each column on load.
full_dataset.dtypes

Order Date       object
Brand            object
Sneaker Name     object
Sale Price       object
Retail Price     object
Release Date     object
Shoe Size       float64
Buyer Region     object
dtype: object

In [6]:
# Preview the first rows of the working copy.
full_dataset.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,"$1,097",$220,9/24/16,11.0,California
1,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,$685,$220,11/23/16,11.0,California
2,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,$690,$220,11/23/16,11.0,California
3,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,"$1,075",$220,11/23/16,11.5,Kentucky
4,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,$828,$220,2/11/17,11.0,Rhode Island


## Limpieza del dataset

### Limpieza de la variable "Sneaker Name"

In [7]:
def clean_sneakers(original_data):
    """Parse each "Sneaker Name" into structured attributes and join them onto the data.

    A sneaker name is a hyphen-separated token string, e.g.
    ``Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017``. Every distinct name is
    tokenised and each token (or run of two or three adjacent tokens) is looked
    up in fixed vocabularies of brands, sub-brands, product lines, models,
    versions, heights, collaborations, years and colours. Tokens that match
    nothing are treated as colour words.

    Parameters
    ----------
    original_data : pandas.DataFrame
        Must contain a "Sneaker Name" column. If a "Brand" column is present it
        is replaced by the parsed brand. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``original_data`` (without its own "Brand" column) left-joined with the
        parsed columns 'Brand', 'Sub-brand', 'Product Line', 'Model',
        'Version', 'Height', 'Collaboration', 'Color(s)' (a list of strings)
        and 'Year'. One output row per input row.
    """
    # Known vocabulary.
    brands = ['Nike', 'Adidas']
    sub_brands = ['Yeezy', 'Air Jordan']
    sub_brand_to_brand = {
        'Yeezy': 'Adidas',
        'Air Jordan': 'Nike'
    }
    collaborations = ['Off-White', 'Off White', 'Virgil Abloh']
    product_lines = [
        'Boost',
        'Air Max',
        'Air Presto',
        'Air VaporMax',
        'Air Force 1',
        'Blazer Mid',
        'Zoom Fly',
        'React Hyperdunk',
        'Hyperdunk',
        'Mercurial',
        'Retro',
        'Flyknit'
    ]
    models = ['350', '90', '97', '1']
    versions = ['V2', 'V3', '2pt0']
    heights = ['Low', 'Mid', 'High']
    years = ['2015', '2016', '2017', '2018']
    colors = [
        'Beluga', 'Core Black Copper', 'Core Black Green', 'Core Black Red', 'Core Black White',
        'Cream White', 'Zebra', 'Moonrock', 'Pirate Black', 'Oxford Tan', 'Turtledove',
        'Semi Frozen Yellow', 'Blue Tint', 'Black', 'Desert Ore', 'Elemental Rose Queen',
        'All Hallows Eve', 'Grim Reaper', 'Sesame', 'Wolf Grey', 'Menta', 'Black Silver',
        'Pink', 'Volt', 'Butter', 'Static', 'Static Reflective', 'Chicago', 'University Blue',
        'White', 'Black-White', 'Black-Silver', 'Total-Orange'
    ]

    column_order = ['Sneaker Name', 'Brand', 'Sub-brand', 'Product Line', 'Model',
                    'Version', 'Height', 'Collaboration', 'Color(s)', 'Year']

    def match_phrase(tokens, start, data):
        """Consume a multi-token phrase at ``start``; return how many tokens it used (0 if none)."""
        # Longest phrase first, so 'Air Force 1' or 'Core Black Red' is taken
        # whole instead of being split into unrelated single tokens.
        for width in (3, 2):
            window = tokens[start:start + width]
            if len(window) < width:
                continue
            hyphenated = '-'.join(window)  # colours such as 'Black-White'
            spaced = ' '.join(window)

            if hyphenated in colors:
                data['Color(s)'].append(hyphenated)
                return width
            if spaced in sub_brands:
                data['Sub-brand'] = spaced
                # A multi-word sub-brand determines the brand.
                data['Brand'] = sub_brand_to_brand.get(spaced, data['Brand'])
                return width
            if spaced in product_lines:
                data['Product Line'] = spaced
                return width
            if spaced in collaborations:
                data['Collaboration'] = spaced
                return width
            if spaced in versions:
                data['Version'] = spaced
                return width
            if spaced in colors:
                data['Color(s)'].append(spaced)
                return width
            if spaced in heights:
                data['Height'] = spaced
                return width
        return 0

    def assign_tokens(tokens):
        """Classify the tokens of one sneaker name into a record dict."""
        data = {
            'Sneaker Name': None,
            'Brand': None,
            'Sub-brand': None,
            'Product Line': None,
            'Model': None,
            'Version': None,
            'Height': None,
            'Collaboration': None,
            'Color(s)': [],
            'Year': None
        }
        i = 0
        while i < len(tokens):
            consumed = match_phrase(tokens, i, data)
            if consumed:
                i += consumed
                continue

            token = tokens[i]
            if token in brands:
                data['Brand'] = token
            elif token in sub_brands:
                data['Sub-brand'] = token
                # Only infer the brand when the name did not state one explicitly.
                if not data['Brand']:
                    data['Brand'] = sub_brand_to_brand.get(token, data['Brand'])
            elif token in product_lines:
                data['Product Line'] = token
            elif token in models:
                data['Model'] = token
            elif token in versions:
                data['Version'] = token
            elif token in heights:
                data['Height'] = token
            elif token in collaborations:
                data['Collaboration'] = token
            elif token in years:
                data['Year'] = token
            else:
                # Known colours and unrecognised tokens alike are recorded as colour words.
                data['Color(s)'].append(token)
            i += 1

        return data

    # Parse each distinct name once. Brand capitalisation is normalised only
    # for parsing; the record keeps the ORIGINAL spelling as its key so the
    # merge below matches every row exactly once, including names written with
    # a lowercase 'adidas'.
    parsed_data = []
    for product in original_data["Sneaker Name"].dropna().unique():
        normalized = product.replace('adidas', 'Adidas')
        parsed = assign_tokens(normalized.split('-'))
        parsed['Sneaker Name'] = product
        parsed_data.append(parsed)

    # Passing `columns` fixes the column order and keeps the frame well-formed
    # even when there are no names to parse.
    df = pd.DataFrame(parsed_data, columns=column_order)

    # Drop the source 'Brand' column on a new frame so the caller's data is untouched.
    base = original_data.drop(columns=['Brand'], errors='ignore')

    return pd.merge(base, df, on="Sneaker Name", how="left")

### Preparación del dataset

In [8]:
def prepare_dataset(data):
    """Cast the price columns to float and the date columns to datetime.

    ``data`` is modified in place and also returned. 'Sale Price' and
    'Retail Price' have '$' and ',' removed before the cast; 'Order Date' and
    'Release Date' are parsed as month/day/two-digit-year.
    """
    # Prices arrive as text such as "$1,097".
    for price_col in ('Sale Price', 'Retail Price'):
        as_text = data[price_col].astype(str)
        digits_only = as_text.str.replace('$', '', regex=False).str.replace(',', '', regex=False)
        data[price_col] = digits_only.astype(float)

    for date_col in ('Order Date', 'Release Date'):
        data[date_col] = pd.to_datetime(data[date_col], format='%m/%d/%y')

    return data

### Análisis de los datos usando Sweetviz

In [9]:
# Typed + parsed frame for reporting. The colour lists are flattened to a
# comma-separated string; any non-list value becomes an empty string.
full_dataset_sv = clean_sneakers(prepare_dataset(full_dataset.copy()))
full_dataset_sv['Color(s)'] = full_dataset_sv['Color(s)'].apply(
    lambda x: ', '.join(map(str, x)) if isinstance(x, list) else ''
)

In [10]:
# Build a Sweetviz report over the whole frame with "Sale Price" as the target
# feature, and write it as HTML under reports/ (relative to the working directory).
my_report = sv.analyze(full_dataset_sv, target_feat="Sale Price")
my_report.show_html("reports/report_general.html")

                                             |                                             | [  0%]   00:00 ->…

Report reports/report_general.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


#### Se guarda el dataset para generar los gráficos

In [11]:
# Independent copy of the report-ready frame, kept under its own name.
df = full_dataset_sv.copy()

In [12]:
# Write the frame to an Excel workbook, omitting the index column.
df.to_excel('../data/data_graphics.xlsx', index=False)

### Limpieza del dataset complejo

In [13]:
def clean_dataset(data, scaler_path='../data/scaler.pkl'):
    """Turn the parsed sneaker frame into a fully numeric feature table.

    Steps, in order:
      1. drop 'Buyer Region', 'Sneaker Name' and 'Year';
      2. expand 'Order Date' / 'Release Date' into year, month, day and
         day-of-week columns, then replace each date with whole seconds since
         1970-01-01;
      3. one-hot encode the categorical columns with ``pd.get_dummies``;
      4. replace the 'Color(s)' list column with one 0/1 ``Color_<name>``
         column per distinct colour, in sorted order;
      5. standardise 'Retail Price' and 'Shoe Size' with a ``StandardScaler``
         and pickle the fitted scaler.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with datetime 'Order Date' / 'Release Date' columns and the
        parsed sneaker columns. It is not modified.
    scaler_path : str or path-like or None, default '../data/scaler.pkl'
        Where to pickle the fitted scaler. Pass ``None`` to skip saving.

    Returns
    -------
    pandas.DataFrame
        The transformed copy of ``data``.

    Notes
    -----
    NOTE(review): the scaler is fit on every row passed in. If this is called
    before a train/test split, the held-out rows contribute to the scaling
    statistics - confirm whether that is acceptable.
    """
    categorical_cols = ['Brand', 'Sub-brand', 'Product Line', 'Model', 'Version', 'Height', 'Collaboration']
    numerical_cols = ['Retail Price', 'Shoe Size']

    # `drop` returns a new frame, so everything below leaves the caller's data alone.
    data = data.drop(columns=["Buyer Region", "Sneaker Name", "Year"])

    # Calendar parts are extracted while the columns are still datetimes.
    for prefix in ('Order', 'Release'):
        stamps = data[f'{prefix} Date'].dt
        data[f'{prefix} Year'] = stamps.year
        data[f'{prefix} Month'] = stamps.month
        data[f'{prefix} Day'] = stamps.day
        data[f'{prefix} Day of Week'] = stamps.dayofweek

    # Epoch seconds via timedelta arithmetic: unlike casting to int64 and
    # dividing by 10**9, this does not depend on the column's datetime resolution.
    epoch = pd.Timestamp("1970-01-01")
    one_second = pd.Timedelta(seconds=1)
    for date_col in ('Order Date', 'Release Date'):
        data[date_col] = (data[date_col] - epoch) // one_second

    # Categorical variables.
    data = pd.get_dummies(data, columns=categorical_cols)

    # Colours: rows whose value is not a list get all-zero indicators.
    color_lists = [value if isinstance(value, list) else [] for value in data['Color(s)']]
    # Sorted so the column order is the same on every run (set order is not).
    unique_colors = sorted({color for colors in color_lists for color in colors})
    # Built as one frame and concatenated once: no per-row writes, and no
    # dependence on the index being unique.
    color_flags = pd.DataFrame(
        {'Color_' + color: [int(color in colors) for colors in color_lists] for color in unique_colors},
        index=data.index,
    )
    data = pd.concat([data.drop(columns='Color(s)'), color_flags], axis=1)

    # Numerical variables.
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Persist the fitted scaler.
    if scaler_path is not None:
        with open(scaler_path, 'wb') as file:
            pickle.dump(scaler, file)
    return data

In [14]:
# Rebuild the parsed frame from the working data and derive the numeric features.
# NOTE(review): clean_dataset fits its scaler on all rows before the split below,
# so validation/test rows influence the scaling statistics - confirm this is intended.
full_dataset0 = clean_sneakers(prepare_dataset(full_dataset.copy()))
full_dataset1 = clean_dataset(full_dataset0.copy())

# Target vs. predictors.
y = full_dataset1["Sale Price"]
X = full_dataset1.drop(columns="Sale Price")

# 80/20 train_full/test, then 80/20 train/val inside train_full; fixed seed for repeatability.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

## Comprobar la distribución de los datasets

In [15]:
# Re-attach the target to each split and keep only int64/float64 columns.
# NOTE(review): columns of any other dtype are left out of the comparison - confirm intended.
train_sv = pd.concat([X_train_full, y_train_full], axis=1).select_dtypes(include=['int64', 'float64'])
test_sv = pd.concat([X_test, y_test], axis=1).select_dtypes(include=['int64', 'float64'])

# Side-by-side Sweetviz comparison of the two splits.
my_report = sv.compare([train_sv, "Train"], [test_sv, "Test"], target_feat="Sale Price")
my_report.show_html("reports/report_train_test.html")

                                             |                                             | [  0%]   00:00 ->…

Report reports/report_train_test.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [16]:
# Re-attach the target to each split and keep only int64/float64 columns.
# NOTE(review): columns of any other dtype are left out of the comparison - confirm intended.
train_sv = pd.concat([X_train, y_train], axis=1).select_dtypes(include=['int64', 'float64'])
val_sv = pd.concat([X_val, y_val], axis=1).select_dtypes(include=['int64', 'float64'])

# Side-by-side Sweetviz comparison of the two splits.
my_report = sv.compare([train_sv, "Train"], [val_sv, "Val"], target_feat="Sale Price")
my_report.show_html("reports/report_train_val.html")

                                             |                                             | [  0%]   00:00 ->…

Report reports/report_train_val.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Guardar Datasets

In [17]:
# Every split is written to its own CSV, without the index column.
split_outputs = [
    (X_train_full, '../data/X_train_full.csv'),
    (y_train_full, '../data/y_train_full.csv'),
    (X_test, '../data/X_test.csv'),
    (y_test, '../data/y_test.csv'),
    (X_train, '../data/X_train.csv'),
    (y_train, '../data/y_train.csv'),
    (X_val, '../data/X_val.csv'),
    (y_val, '../data/y_val.csv'),
]
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)