In [42]:
# Proceso secuencial en python
# Procedemos a importar las librerias de scikit learn
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Se cargan los datos
trips_df_sk = pd.read_csv("/data/trips.csv")
trips_df_sk = pd.DataFrame(trips_df_sk)

# Se calcula el porcentaje de la propina
trips_df_sk['tip_pct'] = trips_df_sk.tip_amount / trips_df_sk.fare_amount
trips_df_sk['tip_pct'] = trips_df_sk['tip_pct'] * 10

# Preparamos los datos
trips_df_sk = trips_df_sk[['tip_pct','trip_distance',"car_type"]]
trips_df_sk = trips_df_sk.dropna(axis = 0, how = 'any')
label_encoder = LabelEncoder()
trips_df_sk["car_type"] = label_encoder.fit_transform(trips_df_sk["car_type"])
y = trips_df_sk.tip_pct

# Separamos los datos en entrenamiento y prueba, se utilizará un 70%/30%
X_train, X_test, y_train, y_test = train_test_split(trips_df_sk, y, test_size = 0.3)

# Ajustamos un modelo de regresión lineal
lm = linear_model.LinearRegression(fit_intercept = False)
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [47]:
%%time

# Cargamos las librerias para el grid search
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# Configuramos el grid search
X_train, y_train = make_regression(n_features = 3)
model = RandomForestRegressor()
parameters = {'n_estimators':[2, 5, 10, 15, 20], 'max_depth':[10, 50, 100, 500, None]}
grid = GridSearchCV(estimator = model, param_grid = parameters, cv = 15)

# Comenzamos a ejecutar el grid search
grid.fit(X_train, y_train)
"r2 / variance : ", grid.best_score_
grid.best_score_
"Residual sum of squares: %.2f"% np.mean((grid.predict(X_test) - y_test) ** 2)

CPU times: user 11 s, sys: 27.7 s, total: 38.6 s
Wall time: 15.9 s


In [21]:
"Mejores parámetros  : ", grid.best_params_

('Mejores parámetros  : ',
 {'copy_X': True, 'fit_intercept': True, 'n_jobs': 100, 'normalize': True})

In [48]:
%%time

# Cargamos las librerias para usar dask
from dask.distributed import Client
import dask_ml.joblib
from sklearn.externals.joblib import parallel_backend

# start a local Dask client
client = Client()  

# Configuramos los parametros
with parallel_backend('dask'):
    from sklearn.grid_search import GridSearchCV
    X_train, y_train = make_regression(n_features = 3)
    model = RandomForestRegressor()
    parameters = {'n_estimators':[2, 5, 10, 15, 20], 'max_depth':[10, 50, 100, 500, None]}
    grid = GridSearchCV(estimator = model, param_grid = parameters, cv = 15)
    grid_dask.fit(X_train, y_train)
    "Residual sum of squares: %.2f"% np.mean((grid_dask.predict(X_test) - y_test) ** 2)
    grid_dask.best_params_

CPU times: user 29.8 s, sys: 2min 12s, total: 2min 42s
Wall time: 57.2 s


In [23]:
"Mejores parámetros con Dask : ",grid_dask.best_params_

('Mejores parámetros con Dask : ',
 {'copy_X': True, 'fit_intercept': True, 'n_jobs': 100, 'normalize': True})

In [49]:
# Repetimos para delayed
def paralelo(A):
    from sklearn.grid_search import GridSearchCV
    X_train, y_train = make_regression(n_features = 3)
    model = RandomForestRegressor()
    parameters = {'n_estimators':[2, 5, 10, 15, 20], 'max_depth':[10, 50, 100, 500, None]}
    grid = GridSearchCV(estimator = model, param_grid = parameters, cv = 15)
    grid.fit(X_train, y_train)
    return grid.best_params_

delayed_grid = delayed(grid)
print("Paralelo:")
%time paralelo(trips_df_sk)

Paralelo:
CPU times: user 12.2 s, sys: 33.7 s, total: 45.9 s
Wall time: 18.1 s


{'max_depth': 100, 'n_estimators': 20}

In [26]:
#  Con esta configuracion, se observa que skitlear gana con 38.6 s, luego delayed con 45.9 s y por ultimo dask_ml con 2min 42s