# RocketML tutorial: Logistic Regression on NYC Taxi Data to predict tips

## Import Libraries

In [None]:
# Scikit learn
from sklearn.metrics import f1_score, confusion_matrix,roc_auc_score
from sklearn.datasets import load_svmlight_file

# Numpy
import numpy as np

# Dask
import dask.dataframe as dd
from dask import persist, compute
from dask_glm.estimators import LogisticRegression

## Connect to the Dask scheduler for data processing

In [None]:
# Connect a Dask client to the scheduler address "<this host>:8786".
import socket
from dask.distributed import Client

# socket.gethostname() returns a str, so it concatenates cleanly with the
# port suffix (the previous subprocess call was never imported and would
# have produced bytes on Python 3).
hostname = socket.gethostname()
client = Client(hostname + ":8786")
# Leave the client as the last expression so the notebook displays it.
client

## Read Data File and Inspect its contents

In [None]:
# Source CSV: the yellow_tripdata_2017-01 file in the nyc-tlc S3 bucket.
filename = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-01.csv"
# Load it as a Dask dataframe; later cells refer to it as `df`.
df = dd.read_csv(urlpath=filename)

## Check data statistics

In [None]:
# Build the summary-statistics table for the dataframe, then materialise
# it with compute() so the actual numbers can be printed.
stats = df.describe()
summary = stats.compute()
print(summary)

## Create test and train data

In [None]:
# 80% / 20% train/test split with a fixed random_state.
split_fractions = [0.8, 0.2]
df_train, df_test = df.random_split(split_fractions, random_state=2)

In [None]:
# Feature columns handed to the model.
columns = [
    'VendorID',
    'passenger_count',
    'trip_distance',
    'payment_type',
    'fare_amount',
]

# Binary target: True where the recorded tip_amount is greater than zero.
X_train = df_train[columns]
y_train = df_train["tip_amount"] > 0
X_test = df_test[columns]
y_test = df_test["tip_amount"] > 0

# Persist all four collections in a single call.
X_train, y_train, X_test, y_test = persist(X_train, y_train, X_test, y_test)

In [None]:
# Preview the first rows of the training features.
X_train.head(n=5)

## Logistic Regression

In [None]:
%%time
lm = LogisticRegression(fit_intercept=False)
lm.fit(X_train.values,y_train.values)

## Predict and compute F1 score and AUC

In [None]:
y_predict = lm.predict(X_test.values)
y_predict = y_predict.compute()
y_true = y_test.values.compute()

In [None]:
f1 = f1_score(y_true,y_predict)
roc = roc_auc_score(y_true,y_predict)

In [None]:
# Report both metrics on a single line.
message = "F1 score = %f, AUC = %f" % (f1, roc)
print(message)