In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [83]:
df_delivery = pd.read_csv('../01_CSV Trabajo/df_delivery_limpio.csv')

In [84]:
# Comprobar los valores nulos en el dataframe
nulos = df_delivery.isnull().sum()

# Verificar los tipos de datos de cada columna
tipos_datos = df_delivery.dtypes

# Mostrar los resultados
nulos, tipos_datos

(store_primary_category      0
 total_items                 0
 subtotal                    0
 num_distinct_items          0
 min_item_price              0
 max_item_price              0
 total_onshift_partners      0
 total_busy_partners         0
 total_outstanding_orders    0
 delivery_duration           0
 partner_density             0
 order_day                   0
 order_hour                  0
 order_period                0
 busy_ratio                  0
 avg_item_price              0
 order_size                  0
 grouped_category            0
 dtype: int64,
 store_primary_category       object
 total_items                   int64
 subtotal                      int64
 num_distinct_items            int64
 min_item_price                int64
 max_item_price                int64
 total_onshift_partners      float64
 total_busy_partners         float64
 total_outstanding_orders    float64
 delivery_duration           float64
 partner_density             float64
 order_day          

In [85]:
# Eliminar la columna irrelevante 'store_primary_category'
df_delivery_cleaned = df_delivery.drop(columns=['store_primary_category'])

# Convertir las columnas categóricas a tipo categoría
df_delivery_cleaned['order_day'] = df_delivery_cleaned['order_day'].astype('category')
df_delivery_cleaned['order_period'] = df_delivery_cleaned['order_period'].astype('category')
df_delivery_cleaned['order_size'] = df_delivery_cleaned['order_size'].astype('category')
df_delivery_cleaned['grouped_category'] = df_delivery_cleaned['grouped_category'].astype('category')

# Verificar los cambios en los tipos de datos
df_delivery_cleaned.dtypes


total_items                    int64
subtotal                       int64
num_distinct_items             int64
min_item_price                 int64
max_item_price                 int64
total_onshift_partners       float64
total_busy_partners          float64
total_outstanding_orders     float64
delivery_duration            float64
partner_density              float64
order_day                   category
order_hour                     int64
order_period                category
busy_ratio                   float64
avg_item_price               float64
order_size                  category
grouped_category            category
dtype: object

In [86]:
# Corregir el uso de OneHotEncoder para versiones compatibles de scikit-learn
encoder = OneHotEncoder(sparse=False)

# Transformar las columnas categóricas
order_day_encoded = encoder.fit_transform(df_delivery_cleaned[['order_day']])
order_period_encoded = encoder.fit_transform(df_delivery_cleaned[['order_period']])
order_size_encoded = encoder.fit_transform(df_delivery_cleaned[['order_size']])
grouped_category_encoded = encoder.fit_transform(df_delivery_cleaned[['grouped_category']])

# Normalizar las columnas numéricas
numerical_columns = ['total_items', 'subtotal', 'num_distinct_items', 'min_item_price', 'max_item_price',
                     'total_onshift_partners', 'total_busy_partners', 'total_outstanding_orders', 
                     'partner_density', 'order_hour', 'busy_ratio', 'avg_item_price']
scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(df_delivery_cleaned[numerical_columns])

# Concatenar todos los datos preprocesados
X_processed = pd.concat([pd.DataFrame(order_day_encoded), 
                         pd.DataFrame(order_period_encoded),
                         pd.DataFrame(order_size_encoded), 
                         pd.DataFrame(grouped_category_encoded), 
                         pd.DataFrame(numerical_data_scaled)], axis=1)

# Variable objetivo (delivery_duration)
y_processed = df_delivery_cleaned['delivery_duration']

# Mostrar el dataframe preprocesado
import ace_tools as tools; tools.display_dataframe_to_user(name="Datos Preprocesados para Análisis", dataframe=df_delivery_cleaned)


TypeError: __init__() got an unexpected keyword argument 'sparse'

In [87]:
import numpy as np

# Verificar si hay valores infinitos o extremadamente grandes en las columnas numéricas
infinity_check = np.isinf(df_delivery_cleaned[numerical_columns]).sum()
max_values = df_delivery_cleaned[numerical_columns].max()
min_values = df_delivery_cleaned[numerical_columns].min()

infinity_check, max_values, min_values


(total_items                  0
 subtotal                     0
 num_distinct_items           0
 min_item_price               0
 max_item_price               0
 total_onshift_partners       0
 total_busy_partners          0
 total_outstanding_orders     0
 partner_density             14
 order_hour                   0
 busy_ratio                   6
 avg_item_price               0
 dtype: int64,
 total_items                  411.0
 subtotal                    6405.0
 num_distinct_items            15.0
 min_item_price              6400.0
 max_item_price              6400.0
 total_onshift_partners       171.0
 total_busy_partners          154.0
 total_outstanding_orders     278.0
 partner_density                inf
 order_hour                    23.0
 busy_ratio                     inf
 avg_item_price              6400.0
 dtype: float64,
 total_items                  1.0
 subtotal                     0.0
 num_distinct_items           1.0
 min_item_price             -86.0
 max_item_price 

In [88]:
# Reemplazar los valores extremos o cercanos a cero por la mediana de cada columna correspondiente
df_delivery_cleaned['min_item_price'] = df_delivery_cleaned['min_item_price'].apply(lambda x: df_delivery_cleaned['min_item_price'].median() if x <= 0 else x)
df_delivery_cleaned['max_item_price'] = df_delivery_cleaned['max_item_price'].apply(lambda x: df_delivery_cleaned['max_item_price'].median() if x <= 0 else x)
df_delivery_cleaned['total_onshift_partners'] = df_delivery_cleaned['total_onshift_partners'].apply(lambda x: df_delivery_cleaned['total_onshift_partners'].median() if x <= 0 else x)
df_delivery_cleaned['total_busy_partners'] = df_delivery_cleaned['total_busy_partners'].apply(lambda x: df_delivery_cleaned['total_busy_partners'].median() if x <= 0 else x)
df_delivery_cleaned['total_outstanding_orders'] = df_delivery_cleaned['total_outstanding_orders'].apply(lambda x: df_delivery_cleaned['total_outstanding_orders'].median() if x <= 0 else x)
df_delivery_cleaned['partner_density'] = df_delivery_cleaned['partner_density'].apply(lambda x: df_delivery_cleaned['partner_density'].median() if x <= 0 else x)
df_delivery_cleaned['avg_item_price'] = df_delivery_cleaned['avg_item_price'].apply(lambda x: df_delivery_cleaned['avg_item_price'].median() if x <= 0 else x)

# Verificar los valores máximos y mínimos después de reemplazar
max_values = df_delivery_cleaned[numerical_columns].max()
min_values = df_delivery_cleaned[numerical_columns].min()

max_values, min_values


(total_items                  411.0
 subtotal                    6405.0
 num_distinct_items            15.0
 min_item_price              6400.0
 max_item_price              6400.0
 total_onshift_partners       171.0
 total_busy_partners          154.0
 total_outstanding_orders     278.0
 partner_density                inf
 order_hour                    23.0
 busy_ratio                     inf
 avg_item_price              6400.0
 dtype: float64,
 total_items                  1.000000
 subtotal                     0.000000
 num_distinct_items           1.000000
 min_item_price               1.000000
 max_item_price              52.000000
 total_onshift_partners       1.000000
 total_busy_partners          1.000000
 total_outstanding_orders     1.000000
 partner_density              0.020833
 order_hour                   0.000000
 busy_ratio                  -8.000000
 avg_item_price               7.579075
 dtype: float64)

In [89]:
# Reemplazar valores negativos con la mediana de cada columna correspondiente
df_delivery_cleaned['total_items'] = df_delivery_cleaned['total_items'].apply(lambda x: df_delivery_cleaned['total_items'].median() if x < 0 else x)
df_delivery_cleaned['subtotal'] = df_delivery_cleaned['subtotal'].apply(lambda x: df_delivery_cleaned['subtotal'].median() if x < 0 else x)
df_delivery_cleaned['num_distinct_items'] = df_delivery_cleaned['num_distinct_items'].apply(lambda x: df_delivery_cleaned['num_distinct_items'].median() if x < 0 else x)
df_delivery_cleaned['min_item_price'] = df_delivery_cleaned['min_item_price'].apply(lambda x: df_delivery_cleaned['min_item_price'].median() if x < 0 else x)
df_delivery_cleaned['max_item_price'] = df_delivery_cleaned['max_item_price'].apply(lambda x: df_delivery_cleaned['max_item_price'].median() if x < 0 else x)
df_delivery_cleaned['total_onshift_partners'] = df_delivery_cleaned['total_onshift_partners'].apply(lambda x: df_delivery_cleaned['total_onshift_partners'].median() if x < 0 else x)
df_delivery_cleaned['total_busy_partners'] = df_delivery_cleaned['total_busy_partners'].apply(lambda x: df_delivery_cleaned['total_busy_partners'].median() if x < 0 else x)
df_delivery_cleaned['total_outstanding_orders'] = df_delivery_cleaned['total_outstanding_orders'].apply(lambda x: df_delivery_cleaned['total_outstanding_orders'].median() if x < 0 else x)
df_delivery_cleaned['partner_density'] = df_delivery_cleaned['partner_density'].apply(lambda x: df_delivery_cleaned['partner_density'].median() if x < 0 or np.isinf(x) else x)
df_delivery_cleaned['busy_ratio'] = df_delivery_cleaned['busy_ratio'].apply(lambda x: df_delivery_cleaned['busy_ratio'].median() if x < 0 or np.isinf(x) else x)
df_delivery_cleaned['avg_item_price'] = df_delivery_cleaned['avg_item_price'].apply(lambda x: df_delivery_cleaned['avg_item_price'].median() if x < 0 else x)

# Verificar si ahora no hay valores negativos ni infinitos
max_values = df_delivery_cleaned[numerical_columns].max()
min_values = df_delivery_cleaned[numerical_columns].min()

max_values, min_values


(total_items                  411.0
 subtotal                    6405.0
 num_distinct_items            15.0
 min_item_price              6400.0
 max_item_price              6400.0
 total_onshift_partners       171.0
 total_busy_partners          154.0
 total_outstanding_orders     278.0
 partner_density               34.0
 order_hour                    23.0
 busy_ratio                    29.0
 avg_item_price              6400.0
 dtype: float64,
 total_items                  1.000000
 subtotal                     0.000000
 num_distinct_items           1.000000
 min_item_price               1.000000
 max_item_price              52.000000
 total_onshift_partners       1.000000
 total_busy_partners          1.000000
 total_outstanding_orders     1.000000
 partner_density              0.020833
 order_hour                   0.000000
 busy_ratio                  -0.000000
 avg_item_price               7.579075
 dtype: float64)

In [96]:
from sklearn.ensemble import RandomForestRegressor

# Inicializar el modelo de Random Forest
modelo_rf = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=5)

# Entrenar el modelo
modelo_rf.fit(X_train, y_train)

# Realizar predicciones
y_pred_rf = modelo_rf.predict(X_test)

# Calcular MSE, MAE y R² para el modelo de Random Forest
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mse_rf, mae_rf, r2_rf


(798844.4922242219, 696.4162912180512, 0.14438267690426454)

In [101]:

# Cargo datos nuevos el archivo con los datos
df_delivery = pd.read_csv('../01_CSV Trabajo/data_analisis.csv')

# Transformar las columnas categóricas
order_day_encoded = encoder.fit_transform(df_delivery[['order_day']])
order_period_encoded = encoder.fit_transform(df_delivery[['order_period']])
order_size_encoded = encoder.fit_transform(df_delivery[['order_size']])
grouped_category_encoded = encoder.fit_transform(df_delivery[['grouped_category']])

# Normalizar las columnas numéricas
numerical_data_scaled = scaler.fit_transform(df_delivery[numerical_columns])

# Concatenar todos los datos preprocesados
X_processed = pd.concat([pd.DataFrame(order_day_encoded), 
                         pd.DataFrame(order_period_encoded),
                         pd.DataFrame(order_size_encoded), 
                         pd.DataFrame(grouped_category_encoded), 
                         pd.DataFrame(numerical_data_scaled)], axis=1)

# Variable objetivo (delivery_duration)
y_processed = df_delivery['delivery_duration']

# Dividir los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42)

# Volver a realizar el ajuste de hiperparámetros

# Definir el espacio de búsqueda para los hiperparámetros
parametros = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Inicializar el modelo de Random Forest
modelo_rf = RandomForestRegressor(random_state=42)

# Inicializar GridSearchCV con validación cruzada
grid_search = GridSearchCV(estimator=modelo_rf, param_grid=parametros, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Entrenar el modelo con GridSearch
grid_search.fit(X_train, y_train)

# Mostrar los mejores parámetros encontrados
mejores_parametros = grid_search.best_params_

# Evaluar el modelo con los mejores parámetros
mejor_modelo = grid_search.best_estimator_
y_pred_rf_ajustado = mejor_modelo.predict(X_test)

# Calcular MSE, MAE y R² para el modelo ajustado

mse_rf_ajustado = mean_squared_error(y_test, y_pred_rf_ajustado)
mae_rf_ajustado = mean_absolute_error(y_test, y_pred_rf_ajustado)
r2_rf_ajustado = r2_score(y_test, y_pred_rf_ajustado)

mejores_parametros, mse_rf_ajustado, mae_rf_ajustado, r2_rf_ajustado


405 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
219 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/luismgl/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/luismgl/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/luismgl/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/luismgl/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_param_validation.py", l

({'max_depth': 15,
  'max_features': 'sqrt',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 200},
 679652.905898803,
 638.5535686208785,
 0.2720450530237265)

In [145]:
# Cargar el archivo con los datos
df_analisis = pd.read_csv('../01_CSV Trabajo/data_analisis.csv')

# Mostrar las primeras filas del dataframe para revisar su estructura
df_analisis.head()

Unnamed: 0,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_partners,total_busy_partners,total_outstanding_orders,delivery_duration,partner_density,order_day,order_hour,order_period,busy_ratio,avg_item_price,order_size,grouped_category
0,4,3441,4,557.0,1239,33.0,14.0,21.0,3779.0,1.5,Friday,22,Night,0.411765,860.25,Medium,American
1,1,1900,1,1400.0,1400,1.0,2.0,2.0,4024.0,0.333333,Tuesday,21,Evening,1.0,1900.0,Small,Mexican
2,4,4771,3,820.0,1604,8.0,6.0,18.0,1586.0,0.421053,Monday,0,Night,0.666667,1192.75,Medium,Indian
3,1,1525,1,1525.0,1525,5.0,6.0,8.0,2273.0,0.555556,Thursday,3,Night,1.0,1525.0,Small,Italian
4,2,3620,2,1425.0,2195,5.0,5.0,7.0,2988.0,0.625,Tuesday,2,Night,0.833333,1810.0,Small,Italian


In [146]:
# Tomar una muestra más pequeña de los datos para evitar problemas de memoria
df_sample = df_analisis.sample(frac=0.1, random_state=42)  # Tomamos el 10% de los datos

# Mostrar las primeras filas del dataframe con la muestra más pequeña
df_sample.head()

Unnamed: 0,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_partners,total_busy_partners,total_outstanding_orders,delivery_duration,partner_density,order_day,order_hour,order_period,busy_ratio,avg_item_price,order_size,grouped_category
45222,3,4627,3,1029.0,1799,34.0,33.0,38.0,3706.0,0.871795,Thursday,2,Night,0.942857,1542.333333,Medium,Italian
100397,3,3275,3,225.0,1500,123.0,101.0,157.0,2994.0,0.778481,Friday,3,Night,0.814516,1091.666667,Medium,Other
63589,1,1070,1,695.0,695,15.0,16.0,19.0,2976.0,0.75,Wednesday,22,Night,1.0,1070.0,Small,Other
118513,3,4240,3,1095.0,1650,107.0,111.0,196.0,4313.0,0.543147,Sunday,3,Night,1.027778,1413.333333,Medium,Desserts
48796,8,5400,4,250.0,2200,36.0,27.0,27.0,2480.0,1.285714,Wednesday,0,Night,0.72973,675.0,Medium,Italian


In [149]:
df_sample.dtypes

total_items                   int64
subtotal                      int64
num_distinct_items            int64
min_item_price              float64
max_item_price                int64
total_onshift_partners      float64
total_busy_partners         float64
total_outstanding_orders    float64
delivery_duration           float64
partner_density             float64
order_day                    object
order_hour                    int64
order_period                 object
busy_ratio                  float64
avg_item_price              float64
order_size                   object
grouped_category             object
dtype: object

In [157]:
# Tomar una muestra más pequeña del conjunto de datos para evitar problemas de memoria
df_sample_small = df_analisis.sample(frac=0.1, random_state=42)  # Tomar el 10% de los datos

# Verificar la estructura de la muestra
df_sample_small.head()

Unnamed: 0,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_partners,total_busy_partners,total_outstanding_orders,delivery_duration,partner_density,order_day,order_hour,order_period,busy_ratio,avg_item_price,order_size,grouped_category
45222,3,4627,3,1029.0,1799,34.0,33.0,38.0,3706.0,0.871795,Thursday,2,Night,0.942857,1542.333333,Medium,Italian
100397,3,3275,3,225.0,1500,123.0,101.0,157.0,2994.0,0.778481,Friday,3,Night,0.814516,1091.666667,Medium,Other
63589,1,1070,1,695.0,695,15.0,16.0,19.0,2976.0,0.75,Wednesday,22,Night,1.0,1070.0,Small,Other
118513,3,4240,3,1095.0,1650,107.0,111.0,196.0,4313.0,0.543147,Sunday,3,Night,1.027778,1413.333333,Medium,Desserts
48796,8,5400,4,250.0,2200,36.0,27.0,27.0,2480.0,1.285714,Wednesday,0,Night,0.72973,675.0,Medium,Italian


In [158]:
# Codificar las columnas categóricas (order_day, order_period, grouped_category) utilizando OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Transformar las columnas categóricas
order_day_encoded = encoder.fit_transform(df_sample[['order_day']])
order_period_encoded = encoder.fit_transform(df_sample[['order_period']])
grouped_category_encoded = encoder.fit_transform(df_sample[['grouped_category']])

# Normalizar las columnas numéricas
numerical_columns = ['total_items', 'subtotal', 'num_distinct_items', 'min_item_price', 'max_item_price',
                     'total_onshift_partners', 'total_busy_partners', 'total_outstanding_orders', 
                     'order_hour', 'busy_ratio', 'avg_item_price']

scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(df_sample[numerical_columns])

# Concatenar todas las características preprocesadas
X_processed = pd.concat([pd.DataFrame(order_day_encoded), 
                         pd.DataFrame(order_period_encoded), 
                         pd.DataFrame(grouped_category_encoded), 
                         pd.DataFrame(numerical_data_scaled)], axis=1)

# Variable objetivo (delivery_duration)
y_processed = df_sample['delivery_duration']

# Dividir los datos en conjunto de entrenamiento y prueba
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42)

# Inicializar el modelo de Random Forest
modelo_rf = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)

# Entrenar el modelo
modelo_rf.fit(X_train, y_train)

# Realizar predicciones
y_pred_rf = modelo_rf.predict(X_test)

# Calcular MSE, MAE y R² para el modelo de Random Forest
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mse_rf, mae_rf, r2_rf


(715594.0172925747, 661.4014816703027, 0.23539837592006319)

In [154]:
df_analisis.columns

Index(['total_items', 'subtotal', 'num_distinct_items', 'min_item_price',
       'max_item_price', 'total_onshift_partners', 'total_busy_partners',
       'total_outstanding_orders', 'delivery_duration', 'partner_density',
       'order_day', 'order_hour', 'order_period', 'busy_ratio',
       'avg_item_price', 'order_size', 'grouped_category'],
      dtype='object')

In [159]:
# Filtrar las columnas necesarias para el modelo
caracteristicas_relevantes = [
    'grouped_category', 'total_onshift_partners', 'total_busy_partners', 'total_outstanding_orders', 
    'order_hour', 'order_day'
]

# Filtrar el dataframe para que solo contenga las columnas necesarias
df_relevantes = df_analisis[caracteristicas_relevantes + ['delivery_duration']]

# Comprobar si hay valores nulos o atípicos en los datos
df_relevantes.isnull().sum()


grouped_category            0
total_onshift_partners      0
total_busy_partners         0
total_outstanding_orders    0
order_hour                  0
order_day                   0
delivery_duration           0
dtype: int64

In [160]:
# Identificar y eliminar outliers en 'delivery_duration' usando el método de IQR
Q1 = df_relevantes['delivery_duration'].quantile(0.25)
Q3 = df_relevantes['delivery_duration'].quantile(0.75)
IQR = Q3 - Q1

# Calcular los límites inferior y superior para los outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filtrar los datos eliminando los outliers
df_cleaned = df_relevantes[(df_relevantes['delivery_duration'] >= lower_bound) & (df_relevantes['delivery_duration'] <= upper_bound)]

# Verificar que los outliers han sido eliminados
df_cleaned['delivery_duration'].describe()


count    162381.000000
mean       2704.661309
std         862.780617
min         297.000000
25%        2065.000000
50%        2592.000000
75%        3244.000000
max        5154.000000
Name: delivery_duration, dtype: float64

In [162]:
# Codificar las columnas categóricas (order_day, grouped_category) utilizando OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Transformar las columnas categóricas
order_day_encoded = encoder.fit_transform(df_cleaned[['order_day']])
grouped_category_encoded = encoder.fit_transform(df_cleaned[['grouped_category']])

# Normalizar las columnas numéricas
numerical_columns = ['total_onshift_partners', 'total_busy_partners', 'total_outstanding_orders', 'order_hour']

scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(df_cleaned[numerical_columns])

# Concatenar todas las características preprocesadas
X_processed = pd.concat([pd.DataFrame(order_day_encoded), 
                         pd.DataFrame(grouped_category_encoded), 
                         pd.DataFrame(numerical_data_scaled)], axis=1)

# Variable objetivo (delivery_duration)
y_processed = df_cleaned['delivery_duration']

# Dividir los datos en conjunto de entrenamiento y prueba
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42)

# Inicializar el modelo de Random Forest
modelo_rf = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)

# Entrenar el modelo
modelo_rf.fit(X_train, y_train)

# Realizar predicciones
y_pred_rf = modelo_rf.predict(X_test)

# Calcular MSE, MAE y R² para el modelo de Random Forest
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mse_rf, mae_rf, r2_rf


(577283.4462228369, 605.0270788340529, 0.2242878209817858)

In [164]:
# Codificar las columnas categóricas (order_day, grouped_category) utilizando OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler

encoder = OneHotEncoder(sparse_output=False)

# Transformar las columnas categóricas
order_day_encoded = encoder.fit_transform(df_cleaned[['order_day']])
grouped_category_encoded = encoder.fit_transform(df_cleaned[['grouped_category']])

# Normalizar las columnas numéricas
numerical_columns = ['total_onshift_partners', 'total_busy_partners', 'total_outstanding_orders', 'order_hour']

scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(df_cleaned[numerical_columns])

# Concatenar todas las características preprocesadas
X_processed = pd.concat([pd.DataFrame(order_day_encoded), 
                         pd.DataFrame(grouped_category_encoded), 
                         pd.DataFrame(numerical_data_scaled)], axis=1)

# Variable objetivo (delivery_duration)
y_processed = df_cleaned['delivery_duration']

# Dividir los datos en conjunto de entrenamiento y prueba
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42)

# Inicializar el modelo de Random Forest
from sklearn.ensemble import RandomForestRegressor

modelo_rf = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)

# Entrenar el modelo
modelo_rf.fit(X_train, y_train)

# Realizar predicciones
y_pred_rf = modelo_rf.predict(X_test)

# Calcular MSE, MAE y R² para el modelo de Random Forest
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mse_rf, mae_rf, r2_rf


(577283.4462228369, 605.0270788340529, 0.2242878209817858)

In [165]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Definir las columnas categóricas y numéricas
columnas_categoricas = ['order_day', 'grouped_category']
columnas_numericas = ['order_hour', 'total_onshift_partners', 'total_busy_partners', 'total_outstanding_orders']

# Preprocesamiento
preprocesador = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), columnas_categoricas),
        ('num', StandardScaler(), columnas_numericas)
    ])

# Definir el modelo
modelo = GradientBoostingRegressor(random_state=42)

# Crear el pipeline
pipeline = Pipeline(steps=[('preprocesamiento', preprocesador),
                           ('modelo', modelo)])

# Definir el espacio de búsqueda para los hiperparámetros
parametros = {
    'modelo__n_estimators': [100, 200, 300],
    'modelo__learning_rate': [0.01, 0.1, 0.2],
    'modelo__max_depth': [3, 5, 7],
    'modelo__subsample': [0.8, 1.0]
}

# Dividir los datos en conjunto de entrenamiento y prueba
X = df_delivery[['order_day', 'grouped_category', 'order_hour', 'total_onshift_partners', 'total_busy_partners', 'total_outstanding_orders']]
y = df_delivery['delivery_duration']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inicializar GridSearchCV con validación cruzada
grid_search = GridSearchCV(estimator=pipeline, param_grid=parametros, cv=5, n_jobs=-1, scoring='r2')

# Entrenar el modelo con GridSearch
grid_search.fit(X_train, y_train)

# Mostrar los mejores parámetros encontrados
mejores_parametros = grid_search.best_params_
print("Mejores parámetros:", mejores_parametros)

# Evaluar el modelo con los mejores parámetros
mejor_modelo = grid_search.best_estimator_
y_pred = mejor_modelo.predict(X_test)

# Calcular MSE, MAE y R² para el modelo ajustado
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("MAE:", mae)
print("R²:", r2)



Mejores parámetros: {'modelo__learning_rate': 0.1, 'modelo__max_depth': 7, 'modelo__n_estimators': 200, 'modelo__subsample': 0.8}
MSE: 685868.0877393779
MAE: 640.785326836029
R²: 0.26538816635711193


In [166]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Definir el espacio de búsqueda para los hiperparámetros
parametros = {
    'modelo__n_estimators': [100, 200, 300, 500],
    'modelo__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'modelo__max_depth': [3, 5, 7, 9],
    'modelo__subsample': [0.8, 0.9, 1.0]
}

# Inicializar el modelo de GradientBoostingRegressor con los mismos parámetros previos
modelo = GradientBoostingRegressor(random_state=42)

# Crear el pipeline
pipeline = Pipeline(steps=[('preprocesamiento', preprocesador),
                           ('modelo', modelo)])

# Inicializar RandomizedSearchCV con validación cruzada
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=parametros, n_iter=10, cv=5, verbose=2, n_jobs=-1, scoring='r2', random_state=42)

# Entrenar el modelo con RandomizedSearchCV
random_search.fit(X_train, y_train)

# Mostrar los mejores parámetros encontrados
mejores_parametros = random_search.best_params_
print("Mejores parámetros:", mejores_parametros)

# Evaluar el modelo con los mejores parámetros
mejor_modelo = random_search.best_estimator_
y_pred = mejor_modelo.predict(X_test)

# Calcular MSE, MAE y R² para el modelo ajustado
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("MAE:", mae)
print("R²:", r2)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END modelo__learning_rate=0.05, modelo__max_depth=7, modelo__n_estimators=200, modelo__subsample=0.9; total time=  33.9s
[CV] END modelo__learning_rate=0.05, modelo__max_depth=7, modelo__n_estimators=200, modelo__subsample=0.9; total time=  34.3s
[CV] END modelo__learning_rate=0.05, modelo__max_depth=7, modelo__n_estimators=200, modelo__subsample=0.9; total time=  34.3s
[CV] END modelo__learning_rate=0.05, modelo__max_depth=7, modelo__n_estimators=200, modelo__subsample=0.9; total time=  34.3s
[CV] END modelo__learning_rate=0.1, modelo__max_depth=9, modelo__n_estimators=200, modelo__subsample=0.9; total time=  48.1s
[CV] END modelo__learning_rate=0.1, modelo__max_depth=9, modelo__n_estimators=200, modelo__subsample=0.9; total time=  48.1s
[CV] END modelo__learning_rate=0.1, modelo__max_depth=9, modelo__n_estimators=200, modelo__subsample=0.9; total time=  48.4s
[CV] END modelo__learning_rate=0.1, modelo__max_depth=9, mod

In [167]:
# Seleccionar solo las características relevantes para el modelo
caracteristicas_relevantes = [
    'grouped_category', 'total_onshift_partners', 'total_busy_partners', 'total_outstanding_orders', 
    'order_hour', 'order_day'
]

# Filtrar el dataframe para que solo contenga las columnas necesarias
df_relevantes = df_analisis[caracteristicas_relevantes + ['delivery_duration']]

# Verificar si hay valores nulos o atípicos en los datos
df_relevantes.isnull().sum()


grouped_category            0
total_onshift_partners      0
total_busy_partners         0
total_outstanding_orders    0
order_hour                  0
order_day                   0
delivery_duration           0
dtype: int64

In [168]:
# Filtrar el dataframe para que solo contenga las columnas necesarias
df_final = df_analisis[['grouped_category', 'total_onshift_partners', 'total_busy_partners', 
                        'total_outstanding_orders', 'order_hour', 'order_day', 'delivery_duration']]

# Verificar la estructura de las columnas seleccionadas
df_final.head()

Unnamed: 0,grouped_category,total_onshift_partners,total_busy_partners,total_outstanding_orders,order_hour,order_day,delivery_duration
0,American,33.0,14.0,21.0,22,Friday,3779.0
1,Mexican,1.0,2.0,2.0,21,Tuesday,4024.0
2,Indian,8.0,6.0,18.0,0,Monday,1586.0
3,Italian,5.0,6.0,8.0,3,Thursday,2273.0
4,Italian,5.0,5.0,7.0,2,Tuesday,2988.0


In [169]:
df_final.to_csv('df_final.csv', index=False)