**Parte 2: Preprocesamiento de Datos**

**Transformación de Columnas:**

Utilizar ColumnTransformer para aplicar transformaciones específicas a diferentes columnas.

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
import calendar


In [4]:
# Load the retail sales dataset (path is relative to the notebook's directory).
df = pd.read_csv('../data/retail_sales_dataset.csv')

# Data exploration.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB
None
   Transaction ID        Date Customer ID  Gender  Age Product Category  \
0               1  2023-11-24     CUST001    Male   34           Beauty   
1               2  2023-02-27     CUST002  Female   26         Clothing   
2               3  2023-01-13     CUST003    Male   50      Electronics   
3               4  2023-05-21    

In [8]:
# 5. Temporal analysis: sales by month.
# errors='coerce' turns unparseable dates into NaT instead of raising.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Derive the month number and the month name.
# Both use the NaT-aware .dt accessors: when coercion produced NaT, .dt.month
# becomes a float column containing NaN, and indexing calendar.month_name with
# a float raises TypeError. .dt.month_name() returns the English month name and
# propagates missing values, which the downstream imputer can then fill.
df['Month'] = df['Date'].dt.month
df['Month Name'] = df['Date'].dt.month_name()

# Assumes the DataFrame df is already loaded.

# Columns handled by each branch of the preprocessor.
categorical_columns = ['Gender', 'Product Category', 'Month Name']
# NOTE(review): 'Total Amount' is scaled as a feature here; if it is the
# prediction target of a later model it should be removed from this list to
# avoid target leakage — confirm against the modelling cells.
numerical_columns = ['Age', 'Quantity', 'Price per Unit', 'Total Amount']
date_columns = ['Date']  # Not used by the preprocessor below; more features could be extracted from this column if needed

# Pipeline for numerical variables
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the column mean
    ('scaler', StandardScaler())  # Standardize the numerical variables
])

# Pipeline for categorical variables
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the mode
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding; unseen categories encode as all zeros
])

# Build the ColumnTransformer.
# sparse_threshold=0 forces a dense ndarray result. With the default (0.3) the
# output silently becomes a SciPy sparse matrix whenever the stacked data is
# sparse enough (e.g. after adding more one-hot columns), which cannot be
# wrapped directly with pd.DataFrame(..., columns=...).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_columns),  # Numerical branch
        ('cat', categorical_pipeline, categorical_columns)  # Categorical branch
    ],
    sparse_threshold=0
)

# Final pipeline; currently it only wraps the preprocessing step.
modeling_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit the preprocessing pipeline on df and transform it in one pass.
df_transformed = modeling_pipeline.fit_transform(df)

# Rebuild a labelled DataFrame from the transformed array.
# Output columns follow the ColumnTransformer order: the numerical columns
# first, then the one-hot columns generated by the categorical encoder.
# The fitted encoder is looked up by transformer name ('cat') rather than by
# position, so reordering the transformers cannot silently pick the wrong step.
fitted_preprocessor = modeling_pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
columns_transformed = numerical_columns + list(
    fitted_encoder.get_feature_names_out(categorical_columns)
)

# Keep df's index so the transformed rows stay aligned with the source rows
# even when df does not carry a default RangeIndex (e.g. after filtering).
df_transformed_df = pd.DataFrame(
    df_transformed,
    columns=columns_transformed,
    index=df.index,
)

# Show the first records of the transformed dataframe
print(df_transformed_df.head())


        Age  Quantity  Price per Unit  Total Amount  Gender_Female  \
0 -0.540565  0.429265       -0.685123     -0.546704            0.0   
1 -1.125592 -0.453996        1.688464      0.971919            1.0   
2  0.629489 -1.337258       -0.790615     -0.761098            0.0   
3 -0.321180 -1.337258        1.688464      0.078611            0.0   
4 -0.833078 -0.453996       -0.685123     -0.636035            0.0   

   Gender_Male  Product Category_Beauty  Product Category_Clothing  \
0          1.0                      1.0                        0.0   
1          0.0                      0.0                        1.0   
2          1.0                      0.0                        0.0   
3          1.0                      0.0                        1.0   
4          1.0                      1.0                        0.0   

   Product Category_Electronics  Month Name_April  ...  Month Name_December  \
0                           0.0               0.0  ...                  0.0   
