# RocketML tutorial on Logistic Regression on NYC Taxi Data to predict tips

## Import Libraries

In [6]:
# Scikit learn
from sklearn.metrics import f1_score, confusion_matrix,roc_auc_score
from sklearn.datasets import load_svmlight_file

# Numpy
import numpy as np

# Subprocess
import subprocess
# Dask
import dask.dataframe as dd
from dask import persist, compute
from dask_glm.estimators import LogisticRegression

## Start dask for data processing

In [7]:
hostname = subprocess.check_output('hostname').strip()
from dask.distributed import Client
client = Client(hostname+":8786")
client

0,1
Client  Scheduler: tcp://ip-172-31-24-238:8786  Dashboard: http://ip-172-31-24-238:8787/status,Cluster  Workers: 4  Cores: 64  Memory: 130.65 GB


## Read Data File and Inspect its contents

In [8]:
filename = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-01.csv"
df = dd.read_csv(filename)

## Check data statistics

In [9]:
stats = df.describe()
print(stats.compute())

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  9.710124e+06     9.710124e+06   9.710124e+06  9.710124e+06   
mean   1.547079e+00     1.628982e+00   2.813899e+00  1.039581e+00   
std    4.977787e-01     1.271994e+00   3.611680e+00  5.059084e-01   
min    1.000000e+00     0.000000e+00   0.000000e+00  1.000000e+00   
25%    1.000000e+00     1.000000e+00   1.020000e+00  1.000000e+00   
50%    2.000000e+00     1.000000e+00   1.780000e+00  1.000000e+00   
75%    2.000000e+00     2.000000e+00   3.380000e+00  1.000000e+00   
max    2.000000e+00     9.000000e+00   2.647100e+02  9.900000e+01   

       PULocationID  DOLocationID  payment_type   fare_amount         extra  \
count  9.710124e+06  9.710124e+06  9.710124e+06  9.710124e+06  9.710124e+06   
mean   1.641065e+02  1.617627e+02  1.337541e+00  1.237423e+01  3.234861e-01   
std    6.664998e+01  7.067207e+01  4.913703e-01  2.652315e+02  4.425577e-01   
min    1.000000e+00  1.000000e+00  1.000000e+00 -3.500000e+02 

## Create test and train data

In [10]:
df_train,df_test = df.random_split([0.8,0.2],random_state=2)

In [11]:
columns = ['VendorID','passenger_count','trip_distance','payment_type','fare_amount']
X_train,y_train = df_train[columns], df_train["tip_amount"] > 0
X_test,y_test = df_test[columns], df_test["tip_amount"] > 0
X_train,y_train,X_test,y_test = persist(X_train,y_train,X_test,y_test)

In [12]:
X_train.head()

Unnamed: 0,VendorID,passenger_count,trip_distance,payment_type,fare_amount
0,1,1,3.3,1,12.5
1,1,1,0.9,1,5.0
2,1,1,1.1,1,5.5
3,1,1,1.1,1,6.0
4,2,1,0.02,2,52.0


## Logistic Regression

In [21]:
%%time
lm = LogisticRegression(fit_intercept=False,max_iter=10)
lm.fit(X_train.values,y_train.values)

CPU times: user 396 ms, sys: 36 ms, total: 432 ms
Wall time: 31.9 s


## Predict and compute F1 score and AUC

In [22]:
y_predict = lm.predict(X_test.values)
y_predict = y_predict.compute()
y_true = y_test.values.compute()

In [23]:
f1 = f1_score(y_true,y_predict)
roc = roc_auc_score(y_true,y_predict)

In [24]:
print("F1 score = %f, AUC = %f"%(f1,roc))

F1 score = 0.875427, AUC = 0.855151
