# Customer Retention Classifier

In [289]:
import math
import re
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Imputer
from scipy.stats import mode
from sklearn.model_selection import cross_val_score

In [290]:
pd.set_option("display.max_columns", 100)
pd.options.display.max_columns

100

In [291]:
TRAIN_DATA = pd.read_csv("../../../kaggle_data/customer_retention/midterm_train.csv")
TEST_DATA = pd.read_csv("../../../kaggle_data/customer_retention/midterm_test.csv")
X = TRAIN_DATA
X_combined = pd.concat([X, TEST_DATA])

In [292]:
X_combined.shape

(200000, 51)

### Clean Month Data

In [293]:
def cleanMonth(x):
    if isinstance(x, str):
        if x == 'Jun': return 'june'
        elif x == 'July': return 'july'
        elif x == 'Aug': return 'august'
        elif x == 'May': return 'may'
        elif x == 'Mar': return 'march'
        elif x == 'Apr': return 'april'
        elif x == 'sept.': return 'september'
        elif x == 'Feb': return 'february'
        elif x == 'Oct': return 'october'
        elif x == 'Nov': return 'november'
        elif x == 'January': return 'january'
        elif x == 'Dev': return 'december'

month = pd.Series([cleanMonth(x) for x in X_combined.x19], name='month')

### Clean Weekday Data

In [294]:
def cleanWeekday(x):
    if isinstance(x, str):
        if x == 'monday': return 'monday'
        elif x == 'tuesday': return 'tuesday'
        elif x == 'wednesday': return 'wednesday'
        elif x == 'thurday': return 'thursday'
        elif x == 'friday': return 'friday'

weekday = pd.Series([cleanWeekday(x) for x in X_combined.x43], name='weekday')

### Clean Region Data

In [295]:
def cleanRegion(x):
    if isinstance(x, str):
        if x == 'asia': return 'asia'
        elif x == 'euorpe': return 'europe'
        elif x == 'america': return 'america'

region = pd.Series([cleanRegion(x) for x in X_combined.x16], name='region')

### Parse Money to Float

In [296]:
def parseMoneyToFloat(x):
    if isinstance(x, str): return float(re.sub('[$,()]', '', x))

X_combined.x44 = [parseMoneyToFloat(x) for x in X_combined.x44]

### Parse Percent to Float

In [297]:
def parsePercentToFloat(x):
    if isinstance(x, str): return float(re.sub('%', '', x)) / 100.00

X_combined.x09 = [parsePercentToFloat(x) for x in X_combined.x09]

### Imputation  Strategies

In [298]:
X_combined.shape

(200000, 51)

1. **df0**: Numerical: Drop, Categorical: Drop
2. **df1**: Numerical: Mean, Categorical: Frequency
3. **df2**: Numerical: Median, Categorical: Frequency
4. **df3**: k-Nearest Neighbors
5. **df4**: Random Forest

In [299]:
X_numeric = X_combined.copy()
X_numeric.drop(['x16', 'x19', 'x43'], axis=1, inplace=True)

In [300]:
df0 = X_numeric.copy()
df1 = X_numeric.copy()
df2 = X_numeric.copy()

### Impute Missing Categorical Data with Most Frequent Value & Get Dummies

In [301]:
month.value_counts()[[0]]

june    55795
Name: month, dtype: int64

In [302]:
weekday.value_counts()[[0]]

wednesday    126413
Name: weekday, dtype: int64

In [303]:
region.value_counts()[[0]]

asia    173129
Name: region, dtype: int64

In [304]:
month.fillna('june', inplace=True)
weekday.fillna('wednesday', inplace=True)
region.fillna('asia', inplace=True)

### df0: Numerical: Drop, Categorical: Drop

In [305]:
#df0['month'] = month.copy()
#df0['weekday'] = weekday.copy()
#df0['region'] = region.copy()
#df0 = pd.get_dummies(df0)

In [306]:
#df0.dropna(inplace=True, subset=[col for col in df0.columns if col not in ['y']])

### df1: Numerical: Mean, Categorical: Most Frequent

In [307]:
y_df1 = df1.pop('y')

In [308]:
meanImputer = Imputer(strategy='mean', copy=False)
meanImputer.fit_transform(df1)

array([[ -5.90070664e+00,   4.69388657e-01,   4.73306785e+00, ...,
          4.59280348e+00,  -3.44776760e-02,   3.68040134e+00],
       [ -3.53039065e+00,   4.22981797e+00,  -4.61943288e+00, ...,
         -5.46246671e+00,  -2.29517509e-01,  -2.33294711e+00],
       [  1.00338855e+00,  -1.25474570e-02,   6.25050323e+00, ...,
         -1.14070719e+01,   6.39555117e+00,   3.54533178e+00],
       ..., 
       [  1.71043472e+01,   1.08815589e+01,   9.25710482e+00, ...,
          6.07498027e+00,  -2.24617902e-01,   5.16245808e+00],
       [  1.03181728e+01,  -8.79384077e+00,  -2.67099784e+00, ...,
         -7.78997798e+00,   1.60228442e-01,  -2.71319396e+00],
       [ -3.75580866e+00,   6.22956464e+00,  -1.28682043e+00, ...,
          6.10818001e+01,   1.34704559e+00,  -7.67326474e-01]])

In [309]:
df1['y'] = y_df1
df1['month'] = month.copy()
df1['weekday'] = weekday.copy()
df1['region'] = region.copy()
df1 = pd.get_dummies(df1)

### df2: Numerical: Median, Categorical: Most Frequent

In [310]:
y_df2 = df2.pop('y')

In [311]:
medianImputer = Imputer(strategy='median', copy=False)
medianImputer.fit_transform(df2)

array([[ -5.90070664e+00,   4.69388657e-01,   4.73306785e+00, ...,
          4.59280348e+00,  -3.44776760e-02,   3.68040134e+00],
       [ -3.53039065e+00,   4.22981797e+00,  -4.61943288e+00, ...,
         -5.46246671e+00,  -2.29517509e-01,  -2.33294711e+00],
       [  1.00338855e+00,  -1.25474570e-02,   6.25050323e+00, ...,
         -1.14070719e+01,   6.39555117e+00,   3.54533178e+00],
       ..., 
       [  1.71043472e+01,   1.08815589e+01,   9.25710482e+00, ...,
          6.07498027e+00,  -2.24617902e-01,   5.16245808e+00],
       [  1.03181728e+01,  -8.79384077e+00,  -2.67099784e+00, ...,
         -7.78997798e+00,   1.60228442e-01,  -2.71319396e+00],
       [ -3.75580866e+00,   6.22956464e+00,  -1.28682043e+00, ...,
          6.10818001e+01,   1.34704559e+00,  -7.67326474e-01]])

In [312]:
df2['y'] = y_df2
df2['month'] = month.copy()
df2['weekday'] = weekday.copy()
df2['region'] = region.copy()
df2 = pd.get_dummies(df2)

### Create Test/Train Splits

In [313]:
# df1: Numerical: Mean, Categorical: Most Frequent

# X TEST
X_test_df1 = df1[np.isnan(df1.y)].copy()
X_test_df1.drop('y', axis=1, inplace=True)

# Xy TRAIN
train_df1 = df1.dropna(subset=['y'])

# y TRAIN
y_train_df1 = train_df1['y'].copy()

# X TRAIN
X_train_df1 = train_df1.drop('y', axis=1)

In [314]:
# df2: Numerical: Median, Categorical: Most Frequent

# X TEST
X_test_df2 = df2[np.isnan(df2.y)].copy()
X_test_df2.drop('y', axis=1, inplace=True)

# Xy TRAIN
train_df2 = df2.dropna(subset=['y'])

# y TRAIN
y_train_df2 = train_df2['y'].copy()

# X TRAIN
X_train_df2 = train_df2.drop('y', axis=1)

### Random Forest Classifer with K-Fold Cross Validation

In [315]:
def getMeanAndConfidence(scores, name):
    mean_score = scores.mean()
    std_dev = scores.std()
    std_error = scores.std() / math.sqrt(scores.shape[0])
    ci =  2.262 * std_error
    lower_bound = mean_score - ci
    upper_bound = mean_score + ci
    print ("%s is %f +/-  %f" % (name, mean_score, ci))
    print ('95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound))

In [316]:
rfc_df1 = RandomForestClassifier(n_jobs=-1)
df1_scores = cross_val_score(estimator=rfc_df1, X=X_train_df1, y=y_train_df1, cv=10, n_jobs=-1)
getMeanAndConfidence(df1_scores, 'df1')

df1 is 0.869244 +/-  0.003206
95 percent probability that if this experiment were repeated over and over the average score would be between 0.866037 and 0.872450


In [317]:
rfc_df2 = RandomForestClassifier(n_jobs=-1)
df2_scores = cross_val_score(estimator=rfc_df2, X=X_train_df2, y=y_train_df2, cv=10, n_jobs=-1)
getMeanAndConfidence(df2_scores, 'df2')

df2 is 0.866537 +/-  0.002472
95 percent probability that if this experiment were repeated over and over the average score would be between 0.864065 and 0.869010
