In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_json('ultimate_data_challenge.json')

In [3]:
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9


In [4]:
df.tail()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
49995,King's Landing,0,2014-01-25,5.0,1.0,2014-06-05,iPhone,0.0,False,100.0,5.63,4.2
49996,Astapor,1,2014-01-24,,1.0,2014-01-25,iPhone,0.0,False,0.0,0.0,4.0
49997,Winterfell,0,2014-01-31,5.0,1.0,2014-05-22,Android,0.0,True,100.0,3.86,5.0
49998,Astapor,2,2014-01-14,3.0,1.0,2014-01-15,iPhone,0.0,False,100.0,4.58,3.5
49999,Astapor,0,2014-01-18,,1.0,2014-04-20,Android,0.0,False,0.0,3.49,5.0


In [5]:
df.describe()

Unnamed: 0,trips_in_first_30_days,avg_rating_of_driver,avg_surge,surge_pct,weekday_pct,avg_dist,avg_rating_by_driver
count,50000.0,41878.0,50000.0,50000.0,50000.0,50000.0,49799.0
mean,2.2782,4.601559,1.074764,8.849536,60.926084,5.796827,4.778158
std,3.792684,0.617338,0.222336,19.958811,37.081503,5.707357,0.446652
min,0.0,1.0,1.0,0.0,0.0,0.0,1.0
25%,0.0,4.3,1.0,0.0,33.3,2.42,4.7
50%,1.0,4.9,1.0,0.0,66.7,3.88,5.0
75%,3.0,5.0,1.05,8.6,100.0,6.94,5.0
max,125.0,5.0,8.0,100.0,100.0,160.96,5.0


In [6]:
df.isnull().sum()

city                         0
trips_in_first_30_days       0
signup_date                  0
avg_rating_of_driver      8122
avg_surge                    0
last_trip_date               0
phone                      396
surge_pct                    0
ultimate_black_user          0
weekday_pct                  0
avg_dist                     0
avg_rating_by_driver       201
dtype: int64

We can see from the above that three fields are missing values: avg_rating_of_driver, avg_rating_by_driver, and phone. We feel pretty comfortable filling in the avg_rating fields with means, and for the phone column we'll replace nulls with the most common value (iPhone).

In [7]:
df['avg_rating_by_driver'].fillna(df['avg_rating_by_driver'].mean(),inplace=True)
df['avg_rating_of_driver'].fillna(df['avg_rating_of_driver'].mean(),inplace=True)

In [8]:
df.isnull().sum()

city                        0
trips_in_first_30_days      0
signup_date                 0
avg_rating_of_driver        0
avg_surge                   0
last_trip_date              0
phone                     396
surge_pct                   0
ultimate_black_user         0
weekday_pct                 0
avg_dist                    0
avg_rating_by_driver        0
dtype: int64

In [9]:
df['phone'].value_counts()

iPhone     34582
Android    15022
Name: phone, dtype: int64

In [10]:
df['phone'].fillna('iPhone',inplace=True)

In [11]:
df.isnull().sum()

city                      0
trips_in_first_30_days    0
signup_date               0
avg_rating_of_driver      0
avg_surge                 0
last_trip_date            0
phone                     0
surge_pct                 0
ultimate_black_user       0
weekday_pct               0
avg_dist                  0
avg_rating_by_driver      0
dtype: int64

In [12]:
df.dtypes

city                       object
trips_in_first_30_days      int64
signup_date                object
avg_rating_of_driver      float64
avg_surge                 float64
last_trip_date             object
phone                      object
surge_pct                 float64
ultimate_black_user          bool
weekday_pct               float64
avg_dist                  float64
avg_rating_by_driver      float64
dtype: object

To make our feature engineering easier, we're going to convert the two date columns - "signup_date" and "last_trip_date" - to datetimes. This will make it easier for us to determine if a customer is retained - that is, took a trip in the preceding 30 days. We'll use a lambda function to convert the values in these columns to datetimes.

In [13]:
df['signup_date'] = df['signup_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
df['last_trip_date'] = df['last_trip_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

In [14]:
df.dtypes

city                              object
trips_in_first_30_days             int64
signup_date               datetime64[ns]
avg_rating_of_driver             float64
avg_surge                        float64
last_trip_date            datetime64[ns]
phone                             object
surge_pct                        float64
ultimate_black_user                 bool
weekday_pct                      float64
avg_dist                         float64
avg_rating_by_driver             float64
dtype: object

In [15]:
today = datetime.strptime('2014-07-01','%Y-%m-%d')

In [16]:
df.loc[df['last_trip_date'] > (today - timedelta(days=30)), 'retained'] = 1

In [17]:
df['retained'].value_counts()

1.0    18310
Name: retained, dtype: int64

We can see that 18310/50000 customers are retained, or roughly 36%.

In [18]:
df['retained'].fillna(0,inplace=True)

In [19]:
df

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,retained
0,King's Landing,4,2014-01-25,4.700000,1.10,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0,1.0
1,Astapor,0,2014-01-29,5.000000,1.00,2014-05-05,Android,0.0,False,50.0,8.26,5.0,0.0
2,Astapor,3,2014-01-06,4.300000,1.00,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0,0.0
3,King's Landing,9,2014-01-10,4.600000,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9,1.0
4,Winterfell,14,2014-01-27,4.400000,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,King's Landing,0,2014-01-25,5.000000,1.00,2014-06-05,iPhone,0.0,False,100.0,5.63,4.2,1.0
49996,Astapor,1,2014-01-24,4.601559,1.00,2014-01-25,iPhone,0.0,False,0.0,0.00,4.0,0.0
49997,Winterfell,0,2014-01-31,5.000000,1.00,2014-05-22,Android,0.0,True,100.0,3.86,5.0,0.0
49998,Astapor,2,2014-01-14,3.000000,1.00,2014-01-15,iPhone,0.0,False,100.0,4.58,3.5,0.0


For the next step in our feature engineering, we'll encode our categorical data using the get_dummies method.

In [20]:
dummies = pd.get_dummies(df,columns=['city','phone'])

In [21]:
dummies

Unnamed: 0,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,retained,city_Astapor,city_King's Landing,city_Winterfell,phone_Android,phone_iPhone
0,4,2014-01-25,4.700000,1.10,2014-06-17,15.4,True,46.2,3.67,5.0,1.0,0,1,0,0,1
1,0,2014-01-29,5.000000,1.00,2014-05-05,0.0,False,50.0,8.26,5.0,0.0,1,0,0,1,0
2,3,2014-01-06,4.300000,1.00,2014-01-07,0.0,False,100.0,0.77,5.0,0.0,1,0,0,0,1
3,9,2014-01-10,4.600000,1.14,2014-06-29,20.0,True,80.0,2.36,4.9,1.0,0,1,0,0,1
4,14,2014-01-27,4.400000,1.19,2014-03-15,11.8,False,82.4,3.13,4.9,0.0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,2014-01-25,5.000000,1.00,2014-06-05,0.0,False,100.0,5.63,4.2,1.0,0,1,0,0,1
49996,1,2014-01-24,4.601559,1.00,2014-01-25,0.0,False,0.0,0.00,4.0,0.0,1,0,0,0,1
49997,0,2014-01-31,5.000000,1.00,2014-05-22,0.0,True,100.0,3.86,5.0,0.0,0,0,1,1,0
49998,2,2014-01-14,3.000000,1.00,2014-01-15,0.0,False,100.0,4.58,3.5,0.0,1,0,0,0,1


We'll continue our feature engineering by adjusting the datetime fields to "days since signup," instead of "date of signup," and "days since last trip," instead of "last trip." This will help with the dimensionality of these fields and allow us to use these fields in our modeling.

In [22]:
dummies['days_since_signup'] = dummies['signup_date'].apply(lambda x: (today - x).days)

In [23]:
dummies['days_since_last_trip'] = dummies['last_trip_date'].apply(lambda x: (today - x).days)

In [24]:
dummies = dummies.drop(['signup_date','last_trip_date'],axis=1)

In [25]:
scaler = StandardScaler()

In [26]:
dummies['days_since_signup'] = scaler.fit_transform(dummies['days_since_signup'].values.reshape(-1,1))

In [27]:
dummies['days_since_last_trip'] = scaler.fit_transform(dummies['days_since_last_trip'].values.reshape(-1,1))

In [29]:
X = dummies.drop('retained',axis=1)
y = dummies['retained']

In [30]:
y

0        1.0
1        0.0
2        0.0
3        1.0
4        0.0
        ... 
49995    1.0
49996    0.0
49997    0.0
49998    0.0
49999    0.0
Name: retained, Length: 50000, dtype: float64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

Because we're looking at a classificaiton problem with labeled data with no significant class imbalance, we feel good trying a RandomForest right out of the box as our estimator. Let's see how it fares.

In [32]:
forest = RandomForestClassifier()

In [33]:
forest.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
prediction = forest.predict(X_test)

In [35]:
y_train

48202    0.0
46401    1.0
25087    0.0
33192    1.0
9227     0.0
        ... 
34442    1.0
19359    1.0
35409    1.0
12377    0.0
44626    0.0
Name: retained, Length: 37500, dtype: float64

In [36]:
acc=accuracy_score(prediction,y_test)

In [37]:
acc

1.0

In [38]:
confusion_matrix(y_test,prediction,labels=[0,1])

array([[7895,    0],
       [   0, 4605]], dtype=int64)

Our confusion matrix is looking pretty good! No false positives or false negatives. Let's use a Randomized Search CV just to see whether any hyperparameter tuning can improve our model performance over time.

In [39]:
from sklearn.model_selection import RandomizedSearchCV

We'll use just a small param_grid, given that our model hit 100% accuracy out of the box.

In [40]:
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [2, 5, 10]
}

In [41]:
rs = RandomizedSearchCV(forest,param_grid,n_iter=5)

In [42]:
rs.fit(X,y)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
            

In [43]:
rs.cv_results_

{'mean_fit_time': array([1.48860612, 1.14898958, 0.09675174, 0.13449116, 0.37435174]),
 'std_fit_time': array([0.0824593 , 0.0269652 , 0.00334645, 0.00977446, 0.003977  ]),
 'mean_score_time': array([0.09813485, 0.0693193 , 0.01156664, 0.01779194, 0.0287425 ]),
 'std_score_time': array([0.00583955, 0.00123606, 0.00079674, 0.00965839, 0.00040452]),
 'param_n_estimators': masked_array(data=[200, 100, 10, 10, 50],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[2, 10, 2, 10, 2],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 200, 'max_depth': 2},
  {'n_estimators': 100, 'max_depth': 10},
  {'n_estimators': 10, 'max_depth': 2},
  {'n_estimators': 10, 'max_depth': 10},
  {'n_estimators': 50, 'max_depth': 2}],
 'split0_test_score': array([0.965 , 1.    , 0.9413, 1.    , 0.9886]),
 'split1_test_score': arra

In [44]:
rs.best_params_

{'n_estimators': 100, 'max_depth': 10}

While our model performance was generally good, we can see that hyperparameter values of 100 estimators with 10 nodes returned the best score, although all of our models showed incredibly strong performance as evidenced above.

Let's see if we can simplify the model by using the model's feature importances to identify the strongest predictor variables.

In [45]:
forest.feature_importances_

array([0.00903347, 0.00427847, 0.02301507, 0.02634036, 0.01096512,
       0.0176115 , 0.00728734, 0.03770434, 0.00482719, 0.0279636 ,
       0.00309395, 0.01077877, 0.00968814, 0.00477995, 0.80263275])

In [46]:
listicle = list(zip(dummies.columns,forest.feature_importances_))

In [47]:
listicle

[('trips_in_first_30_days', 0.009033468006895147),
 ('avg_rating_of_driver', 0.0042784687044919376),
 ('avg_surge', 0.023015069390921492),
 ('surge_pct', 0.026340355141910524),
 ('ultimate_black_user', 0.01096512413587918),
 ('weekday_pct', 0.01761150061711563),
 ('avg_dist', 0.007287337357095114),
 ('avg_rating_by_driver', 0.03770433877346061),
 ('retained', 0.004827185409368727),
 ('city_Astapor', 0.027963597239519723),
 ("city_King's Landing", 0.003093949962482528),
 ('city_Winterfell', 0.010778770166099889),
 ('phone_Android', 0.009688135059466455),
 ('phone_iPhone', 0.004779954751193965),
 ('days_since_signup', 0.8026327452840991)]

We can see that by far the highest predictor of a customer's retention is the days since signup, which makes sense intuitively - customers who signed up earlier have many more chances to churn. Importantly, we can also see that avg rating by driver is the next variable with the most predictive power, indicating that our drivers ratings can be critical information on our riders. 