In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, roc_auc_score

df_train = pd.read_csv('data/churn_train.csv', parse_dates=['last_trip_date', 'signup_date'])
df_test = pd.read_csv('data/churn_test.csv', parse_dates=['last_trip_date', 'signup_date'])
df_tot = pd.read_csv('data/churn.csv', parse_dates=['last_trip_date', 'signup_date'])

cutoff = cutoff = df_tot.last_trip_date.max() - pd.to_timedelta(30,'D')
avg_driver_rating_impute = df_train.avg_rating_by_driver.mean()
avg_rating_of_driver_impute = df_train.avg_rating_of_driver.mean()

def prepare_data(df):
    missing = df.isna().sum()
    clean_labels = missing[missing == 0].index
    dirtyl_labels = missing[missing > 0].index
    new_df = df[clean_labels]
    
    to_keep = ['avg_dist', 'avg_surge', 'surge_pct', 'weekday_pct', 'avg_rating_by_driver', 'avg_rating_of_driver']
    filtered_df = new_df.loc[:, to_keep]
    
    y = df.last_trip_date <= cutoff
    return filtered_df, y


train_X, train_y = prepare_data(df_train)
test_X, test_y = prepare_data(df_test)

rf = RandomForestClassifier()
model = rf.fit(train_X, train_y)
yhat = model.predict(test_X)

def specificity_score(y_true, y_pred):
    TN = len(y_true[(y_true == y_pred) & (y_true == False)])
    FP = len(y_true[(y_true != y_pred) & (y_true == False)])
    return TN / (TN + FP)

def ezprint_cmatrix(ytrue, yhat):
    print("Precision: ", precision_score(ytrue, yhat))
    print("Recall: ", recall_score(ytrue, yhat))
    print("Accuracy score", accuracy_score(ytrue, yhat))
    print("Specificity: ", specificity_score(ytrue, yhat))



ezprint_cmatrix(test_y, yhat)
ezprint_cmatrix(test_y, np.ones(len(yhat)))


Precision:  0.7541959108941104
Recall:  0.781501976284585
Accuracy score 0.7007
Specificity:  0.5616326530612245
Precision:  0.6325
Recall:  1.0
Accuracy score 0.6325
Specificity:  0.0


In [43]:
df_tot.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49990,49991,49992,49993,49994,49995,49996,49997,49998,49999
avg_dist,3.67,8.26,0.77,2.36,3.13,10.56,3.95,2.04,4.36,2.37,...,3.38,1.06,7.58,2.53,2.25,5.63,0,3.86,4.58,3.49
avg_rating_by_driver,5,5,5,4.9,4.9,5,4,5,5,5,...,5,5,5,4.7,4.5,4.2,4,5,3.5,5
avg_rating_of_driver,4.7,5,4.3,4.6,4.4,3.5,,5,4.5,,...,4.7,5,1,4.8,4.6,5,,5,3,
avg_surge,1.1,1,1,1.14,1.19,1,1,1,1,1,...,1.08,1.25,1,1.11,1.44,1,1,1,1,1
city,King's Landing,Astapor,Astapor,King's Landing,Winterfell,Winterfell,Astapor,Winterfell,Winterfell,Winterfell,...,Astapor,Winterfell,King's Landing,Astapor,Astapor,King's Landing,Astapor,Winterfell,Astapor,Astapor
last_trip_date,2014-06-17 00:00:00,2014-05-05 00:00:00,2014-01-07 00:00:00,2014-06-29 00:00:00,2014-03-15 00:00:00,2014-06-06 00:00:00,2014-01-25 00:00:00,2014-01-29 00:00:00,2014-02-01 00:00:00,2014-01-05 00:00:00,...,2014-05-18 00:00:00,2014-06-29 00:00:00,2014-01-19 00:00:00,2014-07-01 00:00:00,2014-05-31 00:00:00,2014-06-05 00:00:00,2014-01-25 00:00:00,2014-05-22 00:00:00,2014-01-15 00:00:00,2014-04-20 00:00:00
phone,iPhone,Android,iPhone,iPhone,Android,iPhone,Android,iPhone,Android,Android,...,iPhone,iPhone,iPhone,iPhone,iPhone,iPhone,iPhone,Android,iPhone,Android
signup_date,2014-01-25 00:00:00,2014-01-29 00:00:00,2014-01-06 00:00:00,2014-01-10 00:00:00,2014-01-27 00:00:00,2014-01-09 00:00:00,2014-01-24 00:00:00,2014-01-28 00:00:00,2014-01-21 00:00:00,2014-01-03 00:00:00,...,2014-01-13 00:00:00,2014-01-08 00:00:00,2014-01-18 00:00:00,2014-01-03 00:00:00,2014-01-03 00:00:00,2014-01-25 00:00:00,2014-01-24 00:00:00,2014-01-31 00:00:00,2014-01-14 00:00:00,2014-01-18 00:00:00
surge_pct,15.4,0,0,20,11.8,0,0,0,0,0,...,33.3,100,0,11.1,37.5,0,0,0,0,0
trips_in_first_30_days,4,0,3,9,14,2,1,2,2,1,...,1,0,1,3,1,0,1,0,2,0


In [44]:
categorical_data = ['city', 'phone', 'luxury_car_user']

In [None]:
def dummies(X, cols):
    pd.get_dummies(X, columns = cols, drop_first=True,)