In [508]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,KFold,train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, NMF
import datetime

In [487]:
df = pd.read_csv('data/churn.csv')

In [488]:
print df.isnull().sum()  # missing values for each column
print "Total number of rows in this Dataset: ", df.count().max()

avg_dist                     0
avg_rating_by_driver       201
avg_rating_of_driver      8122
avg_surge                    0
city                         0
last_trip_date               0
phone                      396
signup_date                  0
surge_pct                    0
trips_in_first_30_days       0
luxury_car_user              0
weekday_pct                  0
dtype: int64
Total number of rows in this Dataset:  50000


In [489]:
def set_dtypes_df(df,datetime='%Y-%m-%d',use_datetime=False):
    # Either the values are numbers, datetime, or strings (if string can remain as objects)
    for column in df.columns:
        try:
            df[column] = df[column].astype(float)
        except:
            try:
                if use_datetime==True:
                    df[column] = pd.to_datetime(df[column],format=datetime)
                else: 
                    print "Couldn't convert columns: ", column, " so leaving it as object dtype"
            except:
                print "Couldn't convert columns: ", column, " so leaving it as object dtype"
    return df

In [490]:
def make_target(df):
    y = (df['last_trip_date'] < datetime.datetime(2014,6,1)).astype(int).rename('response')
    return y

In [496]:
def clean_split_df(X_df,test_size=0.25):
    
    X_df = set_dtypes_df(X_df,use_datetime=True)
    
    # Create response variable
    y_df = make_target(X_df)
    
    # replace missing of_driver ratings with the by_driver ratings if available
    mask = X_df['avg_rating_of_driver'].isnull()
    X_df.loc[:,'avg_rating_of_driver'][mask] = X_df.loc[:,'avg_rating_by_driver'][mask]
    
    # set remaining missing ratings to their column averages
    mask1_of = X_df['avg_rating_of_driver'].isnull()
    mask1_by = X_df['avg_rating_by_driver'].isnull()
    X_df['avg_rating_of_driver'][mask1_of] = X_df['avg_rating_of_driver'].mean()
    X_df['avg_rating_by_driver'][mask1_by] = X_df['avg_rating_by_driver'].mean()
    
    # drop these columns as they're not useful, see assumptions below
    X_df = X_df.drop(['phone','last_trip_date','signup_date'],axis=1)
    
    # dummify city column
    X_df = pd.get_dummies(X_df,columns=['city'])
    
    #train_test_split, user can pass argument to increase test_size
    X_train, X_test, y_train, y_test = train_test_split(X_df,y_df,test_size=test_size)
    
    return X_train, X_test, y_train, y_test
    
'''
Assumptions:
1. ratings_by_driver is a good representation of the ride
2. where missing, an average rating of all ratings is a good representation of the missing rating
3. not using last_trip column because response variable is directly measured from this - multicollinearity
4. not using signup_date because all dates are from January by design in this dataset.
5. not using phone because too much variation. Might lemmatize later or something.
'''

'\nAssumptions:\n1. ratings_by_driver is a good representation of the ride\n2. where missing, an average rating of all ratings is a good representation of the missing rating\n3. not using last_trip column because response variable is directly measured from this - multicollinearity\n4. not using signup_date because all dates are from January by design in this dataset.\n5. not using phone because too much variation. Might lemmatize later or something.\n'

In [492]:
X_train, X_test, y_train, y_test = clean_split_df(df)

Couldn't convert columns:  city  so leaving it as object dtype
Couldn't convert columns:  phone  so leaving it as object dtype


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [493]:
X_train.head()  #clean and full matrix, ready for analysis

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,city_Astapor,city_King's Landing,city_Winterfell
44701,3.5,4.5,4.4,1.0,0.0,2.0,1.0,41.7,0,0,1
39091,3.33,3.5,4.5,1.0,0.0,0.0,1.0,0.0,0,0,1
41782,2.72,5.0,4.7,1.0,0.0,3.0,1.0,100.0,0,0,1
3345,9.93,4.9,4.7,1.11,6.9,2.0,1.0,86.2,1,0,0
1928,2.62,5.0,4.0,1.0,0.0,0.0,0.0,100.0,1,0,0


In [494]:
y_train.value_counts()  # this class imbalance isn't too bad

1    23420
0    14080
Name: response, dtype: int64

In [505]:
# a little PCA to figure out where the highest variance is.
pca = PCA()
pca.fit(X_train)
sorted(zip(X_train.columns,pca.components_[0]), key=lambda x:x[1], reverse=True)

[('surge_pct', 0.11199606123768718),
 ('avg_surge', 0.00097298345567489513),
 ('city_Astapor', 0.00021828475721517339),
 ('city_Winterfell', 0.00010785860318650092),
 ('avg_rating_by_driver', -0.00024837859137594176),
 ('avg_rating_of_driver', -0.00029681391147539857),
 ("city_King's Landing", -0.00032614336040167387),
 ('luxury_car_user', -0.00056008323933226609),
 ('trips_in_first_30_days', -0.0051884288787581445),
 ('avg_dist', -0.017053716808244627),
 ('weekday_pct', -0.99354796430991266)]

```` This looks interesting because it shows that surge_pct contributes the most to the direction of the first eigenvector. However, if you look at this column, you can quickly realize it's because surge_pct is 0 when there is no surge and only becomes non-zero when there is a surge. So this makes sense (Even the second eigenvector shows surge_pct at the top). It may still be a good predictor intuitively though. Customers that have used surge_pct before may very well be more likely to stay customers as they are showing dependency on the app, are less likely to take the train to save money etc. ```` 

In [511]:
X_train.shape

(37500, 11)

In [530]:
# a little NNMF to get more of an interpretable sense of the inputs
nmf = NMF(n_components=5, max_iter=10)
nmf.fit(X_train)

NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=10,
  n_components=5, nls_max_iter=2000, random_state=None, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [531]:
def mse(W, H, V):
    a = (V - (W.dot(H)))
    a = np.array(a)**2
    return np.sum(a) / reduce(lambda x, y: x*y, V.shape)

In [532]:
W, H = nmf.transform(X), nmf.components_
print mse(W,H,X)  # so 5 dimensions describe our input matrix quite well

0.107151911806
