In [1]:
%matplotlib inline
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Summary

#### Data Cleaning

Average rating by driver has 201 null values. Average rating of driver has 8122 null values. Phone has 396 null values.

Average rating of driver null values can be replaced with the mean average rating of driver value. Reason behind this is because since we are looking for reasons behind user retention, a user can expect to get a driver of the average quality on any given ride.

Average rating by driver null value rows can be dropped. These rows only account for 0.4% of the dataset.

Phone null values are inconsequential. Phone is turned into a binary variable - is_iPhone - to denote whether the phone is an iPhone or not for logistic regression.

In [2]:
with open('ultimate_data_challenge.json') as f:
    data = json.load(f)

In [3]:
data[0]

{'avg_dist': 3.67,
 'avg_rating_by_driver': 5.0,
 'avg_rating_of_driver': 4.7,
 'avg_surge': 1.1,
 'city': "King's Landing",
 'last_trip_date': '2014-06-17',
 'phone': 'iPhone',
 'signup_date': '2014-01-25',
 'surge_pct': 15.4,
 'trips_in_first_30_days': 4,
 'ultimate_black_user': True,
 'weekday_pct': 46.2}

In [4]:
df = pd.DataFrame.from_dict(data)

In [5]:
df.last_trip_date = pd.to_datetime(df.last_trip_date, format='%Y-%m-%d')
df.signup_date = pd.to_datetime(df.signup_date, format='%Y-%m-%d')

In [6]:
# create working dataframe
df_clean = df.copy()

In [7]:
# calculating active users
df_clean['active_six'] = np.where(df_clean.last_trip_date.dt.month >= 6, 1, 0)

In [8]:
# converting categorical data to binary
df_clean['is_Astapor'] = np.where(df_clean.city == 'Astapor', 1, 0)
df_clean['is_Kings'] = np.where(df_clean.city == 'King\'s Landing', 1, 0)
df_clean['is_Winterfell'] = np.where(df_clean.city == 'Winterfell', 1, 0)
df_clean['is_iPhone'] = np.where(df_clean.phone == 'iPhone', 1, 0)
df_clean['is_Android'] = np.where(df_clean.phone == 'Android', 1, 0)
df_clean['ultimate_black_user_binary'] = np.where(df_clean.ultimate_black_user == True, 1, 0)

In [9]:
# need to process NaNs
df_clean.isnull().sum()

avg_dist                         0
avg_rating_by_driver           201
avg_rating_of_driver          8122
avg_surge                        0
city                             0
last_trip_date                   0
phone                          396
signup_date                      0
surge_pct                        0
trips_in_first_30_days           0
ultimate_black_user              0
weekday_pct                      0
active_six                       0
is_Astapor                       0
is_Kings                         0
is_Winterfell                    0
is_iPhone                        0
is_Android                       0
ultimate_black_user_binary       0
dtype: int64

In [10]:
# fill in avg driver rating with mean driver rating
df_clean.avg_rating_of_driver.fillna(df.avg_rating_of_driver.mean(skipna=True), inplace=True)

In [11]:
df_clean.isnull().sum()

avg_dist                        0
avg_rating_by_driver          201
avg_rating_of_driver            0
avg_surge                       0
city                            0
last_trip_date                  0
phone                         396
signup_date                     0
surge_pct                       0
trips_in_first_30_days          0
ultimate_black_user             0
weekday_pct                     0
active_six                      0
is_Astapor                      0
is_Kings                        0
is_Winterfell                   0
is_iPhone                       0
is_Android                      0
ultimate_black_user_binary      0
dtype: int64

In [12]:
# put mask on df_clean to remove null rating_by_driver and phone rows
df_clean = df_clean[df.avg_rating_by_driver.notnull() & df.phone.notnull()]

In [13]:
# drop features we binarized or irrelevant
df_clean.drop(labels=['city', 'phone', 'signup_date', 'last_trip_date', 'ultimate_black_user'], axis=1, inplace=True)

In [14]:
df_clean.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,weekday_pct,active_six,is_Astapor,is_Kings,is_Winterfell,is_iPhone,is_Android,ultimate_black_user_binary
0,3.67,5.0,4.7,1.1,15.4,4,46.2,1,0,1,0,1,0,1
1,8.26,5.0,5.0,1.0,0.0,0,50.0,0,1,0,0,0,1,0
2,0.77,5.0,4.3,1.0,0.0,3,100.0,0,1,0,0,1,0,0
3,2.36,4.9,4.6,1.14,20.0,9,80.0,1,0,1,0,1,0,1
4,3.13,4.9,4.4,1.19,11.8,14,82.4,0,0,0,1,0,1,0


#### Data Summary

There are 18,804 active users and 31,196 inactive users.

In [15]:
df.active_six.value_counts().plot(kind='bar')
plt.xticks(range(2), ('Inactive', 'Active'))
plt.ylabel("User Counts")
plt.title("Active vs. Inactive Users")
plt.show()

AttributeError: 'DataFrame' object has no attribute 'active_six'

#### Predictive Model

In [None]:
# split into train/test
from sklearn.model_selection import train_test_split

X = df_clean.drop('active_six', axis=1)
Y = df_clean.active_six

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [None]:
# Standardize values
from sklearn.preprocessing import StandardScaler

std_mask = ['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge', 'surge_pct', 'trips_in_first_30_days', 'weekday_pct']
std_scale = StandardScaler().fit(x_train[std_mask].values)

In [None]:
x_train[std_mask] = std_scale.transform(x_train[std_mask].values)
x_test[std_mask] = std_scale.transform(x_test[std_mask].values)

In [None]:
from statsmodels.api import Logit
from statsmodels.tools import add_constant

#model = Logit(y_train, x_train.drop(labels='is_Astapor', axis=1)).fit()
model = Logit(y_train, add_constant(x_train.drop(labels=['is_Kings', 'is_Android'], axis=1))).fit()

In [None]:
print(model.summary())

In [None]:
predictions = model.predict(add_constant(x_test.drop(labels=['is_Kings', 'is_Android'], axis=1)))
class_predictions = (predictions > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, class_predictions))

In [None]:
for x in np.arange(.3, .61, 0.01):
    print(x, accuracy_score(y_test, (predictions > x).astype(int)))

In [None]:
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(y_test, predictions)
roc_auc = auc(fpr, tpr) 

In [None]:
df_roc = pd.DataFrame(dict(fpr=fpr, tpr=tpr))

In [None]:
roc_auc

In [None]:
df_roc.plot(x='fpr', y='tpr', kind='line', legend=None)

In [None]:
model2 = Logit(y_train, add_constant(x_train.drop(labels=['is_Kings', 'is_Android', 'weekday_pct'], axis=1))).fit()
print(model2.summary())

In [None]:
predictions2 = model2.predict(add_constant(x_test.drop(labels=['is_Kings', 'is_Android', 'weekday_pct', 'avg_surge', 'avg_rating_of_driver'], axis=1)))
class_predictions2 = (predictions > 0.5).astype(int)
print(accuracy_score(y_test, class_predictions2))

In [None]:
np.exp(model2.params)