In [180]:
# Importado de librerías

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [181]:
# Lectura de fichero

df_1 = pd.read_csv('../data/input/train_2_pr.csv')
df_2 = pd.read_csv('../data/input/shipping_companies_details_1.csv')

In [182]:
# Mostrado de datos

df_1.head()

Unnamed: 0.1,Unnamed: 0,shipment_id,send_timestamp,pick_up_point,drop_off_point,source_country,destination_country,freight_cost,gross_weight,shipment_charges,shipment_mode,shipping_company,selected,shipping_time
0,0,S000720,2019-06-08 07:17:51,A,Y,GB,IN,88.61,355.0,0.75,Air,SC3,Y,5.00741
1,1,S000725,2019-07-12 15:23:21,A,Y,GB,IN,85.65,105.0,0.9,Ocean,SC1,Y,21.41215
2,2,S000736,2019-10-04 14:23:29,A,Y,GB,IN,86.22,100.0,0.75,Air,SC3,Y,5.33692
3,3,S000738,2020-01-07 09:19:50,A,Y,GB,IN,94.43,1071.0,1.05,Air,SC2,Y,5.14792
4,4,S000739,2020-04-11 06:36:03,A,Y,GB,IN,94.24,2007.0,0.75,Air,SC3,Y,5.03067


In [183]:
# Eliminado de variable `Unnamed: 0` por ser un índice sin uso

df_1 = df_1.drop(['Unnamed: 0'], axis=1, errors='ignore')

In [184]:
# Eliminado de variable `shipment_id`

df_1 = df_1.drop(['shipment_id'], axis=1, errors='ignore')

In [185]:
# Eliminado de variables sobrantes `pick_up_point` y `drop_off_point`

df_1 = df_1.drop(['pick_up_point', 'drop_off_point'], axis=1, errors='ignore')

In [186]:
# Eliminado de variable `selected` por tener un único valor

df_1 = df_1.drop(['selected'], axis=1, errors='ignore')

In [187]:
# Labelizado de variables `source_country`, `destination_country`, `shipment_mode` y `shipping_company`

from sklearn.preprocessing import LabelEncoder

df_1['source_country'] = LabelEncoder().fit_transform(df_1['source_country'])
df_1['destination_country'] = LabelEncoder().fit_transform(df_1['destination_country'])
df_1['shipment_mode'] = LabelEncoder().fit_transform(df_1['shipment_mode'])
df_1['shipping_company'] = LabelEncoder().fit_transform(df_1['shipping_company'])

In [188]:
# Conversión de timestamp `send_timestamp` a segundos desde Epoch

df_1['send_timestamp'] = pd.to_datetime(df_1['send_timestamp']).astype(np.int64) // 10**9

In [189]:
# Mostrado de datos tras preprocesado

df_1.head()

Unnamed: 0,send_timestamp,source_country,destination_country,freight_cost,gross_weight,shipment_charges,shipment_mode,shipping_company,shipping_time
0,1559978271,0,1,88.61,355.0,0.75,0,2,5.00741
1,1562945001,0,1,85.65,105.0,0.9,1,0,21.41215
2,1570199009,0,1,86.22,100.0,0.75,0,2,5.33692
3,1578388790,0,1,94.43,1071.0,1.05,0,1,5.14792
4,1586586963,0,1,94.24,2007.0,0.75,0,2,5.03067


In [190]:
# Generacion de subconjunto de datos para entrenamiento y prueba

from sklearn.model_selection import train_test_split

X = df_1.drop(['shipping_time'], axis=1)
y = df_1['shipping_time']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [191]:
# Creacion de tabla de resultados

models_results = pd.DataFrame(columns = [
    'Model',
    'Mean Squared Error',
    'Training time',
    'Prediction time'
])

In [192]:
# Entrenamiento y evaluación de SVM de regresion

from sklearn.svm import SVR
import time
from sklearn.metrics import mean_squared_error

kernel_list = ['poly', 'rbf', 'sigmoid']

for kernel in kernel_list:
    model = SVR(kernel=kernel)

    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    start_time = time.time()
    y_pred = model.predict(X_test)
    prediction_time = time.time() - start_time

    actual_results =  pd.DataFrame([[
            f'SVR ({kernel})',
            mean_squared_error(y_test, y_pred),
            training_time,
            prediction_time
        ]],
        columns = models_results.columns
    )

    if models_results.empty:
        models_results = actual_results
    else:
        models_results = pd.concat([models_results, actual_results], ignore_index=True)

In [193]:
# Entrenamiento y evaluación de Árbol de regresión

from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()

start_time = time.time()
model.fit(X_train, y_train)
training_time = time.time() - start_time

start_time = time.time()
y_pred = model.predict(X_test)
prediction_time = time.time() - start_time

actual_results =  pd.DataFrame([[
        'Decision Tree Regressor',
        mean_squared_error(y_test, y_pred),
        training_time,
        prediction_time
    ]],
    columns = models_results.columns
)

if models_results.empty:
    models_results = actual_results
else:
    models_results = pd.concat([models_results, actual_results], ignore_index=True)

In [194]:
# Entrenamiento y evaluación de Random Forest de regresión

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

start_time = time.time()
model.fit(X_train, y_train)
training_time = time.time() - start_time

start_time = time.time()
y_pred = model.predict(X_test)
prediction_time = time.time() - start_time

actual_results =  pd.DataFrame([[
        'Random Forest Regressor',
        mean_squared_error(y_test, y_pred),
        training_time,
        prediction_time
    ]],
    columns = models_results.columns
)

if models_results.empty:
    models_results = actual_results
else:
    models_results = pd.concat([models_results, actual_results], ignore_index=True)

In [195]:
# Entrenamiento y evaluación de Ada Boost de regresión

from sklearn.ensemble import AdaBoostRegressor

model = AdaBoostRegressor()

start_time = time.time()
model.fit(X_train, y_train)
training_time = time.time() - start_time

start_time = time.time()
y_pred = model.predict(X_test)
prediction_time = time.time() - start_time

actual_results =  pd.DataFrame([[
        'Ada Boost Regressor',
        mean_squared_error(y_test, y_pred),
        training_time,
        prediction_time
    ]],
    columns = models_results.columns
)

if models_results.empty:
    models_results = actual_results
else:
    models_results = pd.concat([models_results, actual_results], ignore_index=True)

In [196]:
# Mostrado de resultados

display(models_results)

Unnamed: 0,Model,Mean Squared Error,Training time,Prediction time
0,SVR (poly),145.641919,0.616025,0.076967
1,SVR (rbf),149.987031,0.699035,0.406
2,SVR (sigmoid),150.672898,0.916037,0.184999
3,Decision Tree Regressor,92.197683,0.075997,0.001036
4,Random Forest Regressor,52.194069,4.668031,0.026969
5,Ada Boost Regressor,46.001172,0.090035,0.003004


In [197]:
# Mostrado de resultados sin cuadrado (Absoluto para mejor comprensión)

models_results['Mean Squared Error'] = models_results['Mean Squared Error'].apply(lambda x: np.sqrt(x))

display(models_results)

Unnamed: 0,Model,Mean Squared Error,Training time,Prediction time
0,SVR (poly),12.068219,0.616025,0.076967
1,SVR (rbf),12.246919,0.699035,0.406
2,SVR (sigmoid),12.274889,0.916037,0.184999
3,Decision Tree Regressor,9.601962,0.075997,0.001036
4,Random Forest Regressor,7.224546,4.668031,0.026969
5,Ada Boost Regressor,6.782416,0.090035,0.003004


Estoy calentito ahora mismo, será a lo mejor del catarro-gripe