# Python vs DASK

Importamos las librerías necesarias y abrimos la conexión

In [181]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline

In [182]:
from dask.distributed import Client

client = Client("scheduler:8786")

Ahora cargamos la base de datos en Pandas

In [183]:
data = pd.read_csv("/data/trips.csv")

In [184]:
data.head()

Unnamed: 0,car_type,fare_amount,passenger_count,taxi_id,tip_amount,tpep_dropoff_datetime,tpep_pickup_datetime,trip_distance
0,A,22.0,1,1,4.6,2015-01-03 01:37:02,2015-01-03 01:17:32,6.9
1,A,9.0,1,1,0.0,2015-01-05 23:35:02,2015-01-05 23:25:15,1.81
2,A,7.5,1,1,1.0,2015-01-06 15:22:12,2015-01-06 15:11:45,0.96
3,A,8.5,1,1,1.0,2015-01-08 08:31:23,2015-01-08 08:22:12,1.9
4,A,7.5,1,1,1.66,2015-01-08 12:35:54,2015-01-08 12:26:26,1.0


Ahora generamos una nueva columna con la proporción de propina de cada viajem

In [185]:
data['prop_tip_amount'] =  data['tip_amount'] / data['fare_amount']

y revisamos si generamos valores faltantes

In [186]:
data[pd.isna(data['prop_tip_amount'])]

Unnamed: 0,car_type,fare_amount,passenger_count,taxi_id,tip_amount,tpep_dropoff_datetime,tpep_pickup_datetime,trip_distance,prop_tip_amount
3276,A,0.0,5,177,0.0,2015-01-06 12:46:07,2015-01-06 12:43:31,0.23,
4050,B,0.0,2,218,0.0,2015-01-24 00:35:26,2015-01-23 23:57:43,13.4,
5739,B,0.0,2,313,0.0,2015-01-28 20:23:19,2015-01-28 20:22:19,4.8,


Dado que estos casos son anómalos y no tiene sentido intentarlos predecir ya que no se le cobró a los pasajeros nos deshacemos de estas tres obsevaciones para analizar el resto de la base de datos

In [187]:
data = data.dropna()

Verificamos que ya no contenga NAs nuestra base

In [188]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9195 entries, 0 to 9197
Data columns (total 9 columns):
car_type                 9195 non-null object
fare_amount              9195 non-null float64
passenger_count          9195 non-null int64
taxi_id                  9195 non-null int64
tip_amount               9195 non-null float64
tpep_dropoff_datetime    9195 non-null object
tpep_pickup_datetime     9195 non-null object
trip_distance            9195 non-null float64
prop_tip_amount          9195 non-null float64
dtypes: float64(4), int64(2), object(3)
memory usage: 718.4+ KB


El siguiente paso es generar una variable categórica a partir de la variable 'car_type'

In [189]:
car_dummies = pd.get_dummies(data['car_type'], prefix='car_type')
car_dummies.head()

Unnamed: 0,car_type_A,car_type_B
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


Añadimos estas nuevas columnas a base de datos y nos deshacemos de 'car_type'

In [190]:
data = pd.concat([data,car_dummies],axis=1)

In [191]:
data.drop('car_type', axis=1, inplace=True)

In [192]:
data.head()

Unnamed: 0,fare_amount,passenger_count,taxi_id,tip_amount,tpep_dropoff_datetime,tpep_pickup_datetime,trip_distance,prop_tip_amount,car_type_A,car_type_B
0,22.0,1,1,4.6,2015-01-03 01:37:02,2015-01-03 01:17:32,6.9,0.209091,1,0
1,9.0,1,1,0.0,2015-01-05 23:35:02,2015-01-05 23:25:15,1.81,0.0,1,0
2,7.5,1,1,1.0,2015-01-06 15:22:12,2015-01-06 15:11:45,0.96,0.133333,1,0
3,8.5,1,1,1.0,2015-01-08 08:31:23,2015-01-08 08:22:12,1.9,0.117647,1,0
4,7.5,1,1,1.66,2015-01-08 12:35:54,2015-01-08 12:26:26,1.0,0.221333,1,0


Dividimos la base de datos en nuestras variables explicativas y nuestra variable explicada

In [193]:
X = data[['fare_amount','passenger_count','trip_distance','car_type_A','car_type_B']]
y = data['prop_tip_amount']

Ahora, generamos el grid y la cross validation para seleccionar los hyperparámetros de nuestra Ridge Regression

In [194]:
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Ridge

Hyperparámetros a probar:

In [195]:
param_grid = {'alpha':list(np.arange(0.1,2,0.1))}
print(param_grid)
len(param_grid['alpha'])

{'alpha': [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001, 0.8, 0.9, 1.0, 1.1, 1.2000000000000002, 1.3000000000000003, 1.4000000000000001, 1.5000000000000002, 1.6, 1.7000000000000002, 1.8000000000000003, 1.9000000000000001]}


19

Utilizamos el parámetro cv=10, para indicar que haga un cross validation por 10-folds para escoger el mejor modelo

In [196]:
seq_grid = GridSearchCV(estimator = Ridge(),param_grid=param_grid,scoring='neg_mean_squared_error',verbose=2, cv=10)
parallel_grid = GridSearchCV(estimator = Ridge(),param_grid=param_grid,scoring='neg_mean_squared_error',verbose=1, cv=10)

In [197]:
from dask import delayed

def run_grid(X,y):
    return parallel_grid.fit(X,y)

delayed_run_grid = delayed(run_grid)

Aquí debajo vemos que la implementación en paralelo usando DASK es más rápida que la implementación secuencial

In [198]:
print('Sequencial:')
%time seq_grid.fit(X,y)
print('En paralelo:')
%time run_grid(X,y)

Sequencial:
Fitting 10 folds for each of 19 candidates, totalling 190 fits
[CV] alpha=0.1 .......................................................
[CV] .............................................. alpha=0.1 -   0.0s
[CV] alpha=0.1 .......................................................
[CV] .............................................. alpha=0.1 -   0.0s
[CV] alpha=0.1 .......................................................
[CV] .............................................. alpha=0.1 -   0.0s
[CV] alpha=0.1 .......................................................
[CV] .............................................. alpha=0.1 -   0.0s
[CV] alpha=0.1 .......................................................
[CV] .............................................. alpha=0.1 -   0.0s
[CV] alpha=0.1 .......................................................
[CV] .............................................. alpha=0.1 -   0.0s
[CV] alpha=0.1 .......................................................
[C

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s



[CV] alpha=0.4 .......................................................
[CV] .............................................. alpha=0.4 -   0.0s
[CV] alpha=0.4 .......................................................
[CV] .............................................. alpha=0.4 -   0.0s
[CV] alpha=0.4 .......................................................
[CV] .............................................. alpha=0.4 -   0.0s
[CV] alpha=0.4 .......................................................
[CV] .............................................. alpha=0.4 -   0.0s
[CV] alpha=0.4 .......................................................
[CV] .............................................. alpha=0.4 -   0.0s
[CV] alpha=0.4 .......................................................
[CV] .............................................. alpha=0.4 -   0.0s
[CV] alpha=0.4 .......................................................
[CV] .............................................. alpha=0.4 -   0.0s
[CV] 

[CV] .............................................. alpha=1.0 -   0.0s
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=1.0 -   0.0s
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=1.0 -   0.0s
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=1.0 -   0.0s
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=1.0 -   0.0s
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=1.0 -   0.0s
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=1.0 -   0.0s
[CV] alpha=1.0 .......................................................
[CV] .

[CV] alpha=1.6 .......................................................
[CV] .............................................. alpha=1.6 -   0.0s
[CV] alpha=1.6 .......................................................
[CV] .............................................. alpha=1.6 -   0.0s
[CV] alpha=1.6 .......................................................
[CV] .............................................. alpha=1.6 -   0.0s
[CV] alpha=1.6 .......................................................
[CV] .............................................. alpha=1.6 -   0.0s
[CV] alpha=1.6 .......................................................
[CV] .............................................. alpha=1.6 -   0.0s
[CV] alpha=1.6 .......................................................
[CV] .............................................. alpha=1.6 -   0.0s
[CV] alpha=1.7000000000000002 ........................................
[CV] ............................... alpha=1.7000000000000002 -   0.0s
[CV] a

[Parallel(n_jobs=1)]: Done 190 out of 190 | elapsed:    1.2s finished


CPU times: user 770 ms, sys: 490 ms, total: 1.26 s
Wall time: 674 ms


[Parallel(n_jobs=1)]: Done 190 out of 190 | elapsed:    0.7s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001, 0.8, 0.9, 1.0, 1.1, 1.2000000000000002, 1.3000000000000003, 1.4000000000000001, 1.5000000000000002, 1.6, 1.7000000000000002, 1.8000000000000003, 1.9000000000000001]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring='neg_mean_squared_error', verbose=1)

Vemos que efectivamente coinciden las soluciones con las dos implementaciones

In [199]:
print('Sequencial:')
print(seq_grid.best_estimator_)
print('En paralelo:')
print(parallel_grid.best_estimator_)

Sequencial:
Ridge(alpha=1.9000000000000001, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=False, random_state=None, solver='auto',
   tol=0.001)
En paralelo:
Ridge(alpha=1.9000000000000001, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=False, random_state=None, solver='auto',
   tol=0.001)


### Implementación con DASK-ML

Generamos los DASK arrays necesarios para la implementación utilizando esta librería

In [200]:
import dask.array as da

In [203]:
X = da.from_array(np.asarray(X), chunks=(10, 10))
y = da.from_array(np.asarray(y), chunks=(10))

Ahora importamos las funciones para GridSearch y Ridge regression en las librerías de DASK_ML

In [207]:
from dask_ml.model_selection import GridSearchCV
from dask_ml.linear_model import LinearRegression

generamos el grid de parámetros

In [222]:
param_grid = {'C':list(np.arange(0.5,10,0.5))}
print(param_grid)
len(param_grid['C'])

{'C': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5]}


19

Ahora podemos generar el grid de DASK utilizando 10-folds como método de validación para escoger el mejor modelo

In [223]:
dask_grid = GridSearchCV(estimator=LinearRegression(),param_grid=param_grid, scoring= 'neg_mean_squared_error',cv=10 )

Ahora corremos y medimos el tiempo de ejecución utilizando DASK-ML

In [224]:
%time dask_grid.fit(X,y)

CPU times: user 270 ms, sys: 60 ms, total: 330 ms
Wall time: 50.2 s


GridSearchCV(cache_cv=True, cv=10, error_score='raise',
       estimator=LinearRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
         intercept_scaling=1.0, max_iter=100, multiclass='ovr', n_jobs=1,
         penalty='l2', random_state=None, solver='admm',
         solver_kwargs=None, tol=0.0001, verbose=0, warm_start=False),
       iid=True, n_jobs=-1,
       param_grid={'C': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5]},
       refit=True, return_train_score='warn', scheduler=None,
       scoring='neg_mean_squared_error')

Observamos que el tiempo de ejecución de dask_ml es mayor que el de las implementaciones del ejercicio anterior

El mejor parámetro de regularización correspondiente es el siguientes

In [225]:
print('Dask:')
print(dask_grid.best_estimator_)

Dask:
LinearRegression(C=3.0, class_weight=None, dual=False, fit_intercept=True,
         intercept_scaling=1.0, max_iter=100, multiclass='ovr', n_jobs=1,
         penalty='l2', random_state=None, solver='admm',
         solver_kwargs=None, tol=0.0001, verbose=0, warm_start=False)
