# Customer Retention Classifier

In [348]:
import math
import re
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Imputer
from scipy.stats import mode

In [349]:
# Raise the column display cap so wide frames render without elision,
# then echo the setting back to confirm it took effect.
pd.options.display.max_columns = 100
pd.get_option("display.max_columns")

100

In [350]:
# Load the train and test CSVs and stack them so that cleaning and imputation
# are applied to both in one pass.
TRAIN_DATA = pd.read_csv("../../../kaggle_data/customer_retention/midterm_train.csv")
TEST_DATA = pd.read_csv("../../../kaggle_data/customer_retention/midterm_test.csv")
X = TRAIN_DATA
# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it
# both inputs keep their own 0..N-1 labels, leaving duplicate row labels that
# break label-aligned operations against frames built later with a fresh
# RangeIndex (e.g. the categorical/dummy frames).
X_combined = pd.concat([X, TEST_DATA], ignore_index=True)

In [351]:
# Sanity check: (rows, columns) of the stacked train + test frame.
X_combined.shape

(200000, 51)

### Clean Month Data

In [352]:
# Raw month spellings found in the data, mapped to a canonical lowercase name.
_MONTH_ALIASES = {
    'Jun': 'june',
    'July': 'july',
    'Aug': 'august',
    'May': 'may',
    'Mar': 'march',
    'Apr': 'april',
    'sept.': 'september',
    'Feb': 'february',
    'Oct': 'october',
    'Nov': 'november',
    'January': 'january',
    'Dev': 'december',
}


def cleanMonth(x):
    """Normalise a raw month label to its full lowercase month name.

    Returns None for non-string input (e.g. NaN) and for any string that is
    not one of the known raw spellings.
    """
    if not isinstance(x, str):
        return None
    return _MONTH_ALIASES.get(x)

# Canonical month label for every row of column x19 (None where missing/unknown).
month = list(map(cleanMonth, X_combined.x19))

### Clean Weekday Data

In [353]:
# Raw weekday spellings found in the data, mapped to the corrected name.
_WEEKDAY_ALIASES = {
    'monday': 'monday',
    'tuesday': 'tuesday',
    'wednesday': 'wednesday',
    'thurday': 'thursday',
    'friday': 'friday',
}


def cleanWeekday(x):
    """Normalise a raw weekday label, correcting the 'thurday' misspelling.

    Returns None for non-string input (e.g. NaN) and for any string that is
    not one of the known raw spellings.
    """
    if not isinstance(x, str):
        return None
    return _WEEKDAY_ALIASES.get(x)

# Canonical weekday label for every row of column x43 (None where missing/unknown).
weekday = list(map(cleanWeekday, X_combined.x43))

### Clean Region Data

In [354]:
# Raw region spellings found in the data, mapped to the corrected name.
_REGION_ALIASES = {
    'asia': 'asia',
    'euorpe': 'europe',
    'america': 'america',
}


def cleanRegion(x):
    """Normalise a raw region label, correcting the 'euorpe' misspelling.

    Returns None for non-string input (e.g. NaN) and for any string that is
    not one of the known raw spellings.
    """
    if not isinstance(x, str):
        return None
    return _REGION_ALIASES.get(x)

# Canonical region label for every row of column x16 (None where missing/unknown).
region = list(map(cleanRegion, X_combined.x16))

### Parse Money to Float

In [355]:
def parseMoneyToFloat(x):
    """Convert a currency string such as '$1,234.56' to a float.

    Dollar signs, thousands separators and parentheses are removed before
    conversion. An amount wrapped in parentheses, e.g. '($1,234.56)', is
    accounting notation for a negative value and is returned as a negative
    number rather than having its sign silently dropped.

    Returns None for non-string input (e.g. NaN).
    Raises ValueError if the remaining text is not a valid number.
    """
    if not isinstance(x, str):
        return None
    text = x.strip()
    amount = float(re.sub('[$,()]', '', text))
    # Parenthesised amounts are negatives; -abs() avoids flipping the sign
    # twice if the text also carries an explicit minus.
    if '(' in text and ')' in text:
        return -abs(amount)
    return amount

# Replace the currency strings in x44 with floats (None where the value is not a string).
X_combined.x44 = list(map(parseMoneyToFloat, X_combined.x44))

### Parse Percent to Float

In [356]:
def parsePercentToFloat(x):
    """Convert a percentage string such as '50%' to its fractional value (0.5).

    Returns None for non-string input (e.g. NaN).
    Raises ValueError if the text without '%' is not a valid number.
    """
    if not isinstance(x, str):
        return None
    digits = x.replace('%', '')
    return float(digits) / 100.00

# Replace the percentage strings in x09 with fractions (None where the value is not a string).
X_combined.x09 = list(map(parsePercentToFloat, X_combined.x09))

### Imputation Strategies

In [357]:
# Re-check the shape after the in-place x44 / x09 conversions above.
X_combined.shape

(200000, 51)

1. **df0**: Numerical: Drop, Categorical: Drop
2. **df1**: Numerical: Mean, Categorical: Most Frequent
3. **df2**: Numerical: Median, Categorical: Most Frequent
4. **df3**: k-Nearest Neighbors
5. **df4**: Random Forest

In [358]:
# Numeric-only view of the data: a new frame without the three categorical
# columns (region x16, month x19, weekday x43). X_combined is left untouched.
X_numeric = X_combined.drop(['x16', 'x19', 'x43'], axis=1)

In [359]:
# One independent copy of the numeric frame per imputation strategy.
df0, df1, df2 = (X_numeric.copy() for _ in range(3))

### Impute Missing Categorical Data with Most Frequent Value

In [360]:
# Gather the three cleaned categorical lists into a single frame.
X_categorical = pd.DataFrame({'month': month, 'weekday': weekday, 'region': region})

In [361]:
# Most frequent month and its count. value_counts() sorts descending, so the
# first row is the mode. .iloc selects by position explicitly; plain [[0]] on
# a string-labelled Series relies on a deprecated positional fallback.
X_categorical.month.value_counts().iloc[[0]]

june    55795
Name: month, dtype: int64

In [362]:
# Most frequent weekday and its count. value_counts() sorts descending, so the
# first row is the mode. .iloc selects by position explicitly; plain [[0]] on
# a string-labelled Series relies on a deprecated positional fallback.
X_categorical.weekday.value_counts().iloc[[0]]

wednesday    126413
Name: weekday, dtype: int64

In [363]:
# Most frequent region and its count. value_counts() sorts descending, so the
# first row is the mode. .iloc selects by position explicitly; plain [[0]] on
# a string-labelled Series relies on a deprecated positional fallback.
X_categorical.region.value_counts().iloc[[0]]

asia    173129
Name: region, dtype: int64

In [364]:
# Fill missing categorical values with each column's most frequent value
# (taken from the value_counts cells above). Filling at the frame level with a
# per-column dict avoids in-place fillna on a column Series pulled out of the
# frame, which is chained assignment and is not guaranteed to write back to
# X_categorical.
X_categorical.fillna(
    value={'month': 'june', 'weekday': 'wednesday', 'region': 'asia'},
    inplace=True,
)

### df0: Numerical: Drop, Categorical: Drop

In [365]:
# Drop rows with a missing value in any column except the last one,
# then report the remaining shape.
feature_columns = df0.columns[:-1]
df0.dropna(subset=feature_columns, inplace=True)
df0.shape

(198116, 48)

### df1: Numerical: Mean, Categorical: Most Frequent

In [366]:
# Detach the y column so it is not passed through the imputer; it is re-attached after imputation.
y_df1 = df1.pop('y')

In [367]:
# Replace missing numeric values in df1 with the column mean.
meanImputer = Imputer(strategy='mean', copy=False)
df1_imputed = meanImputer.fit_transform(df1)
# Write the result back explicitly. copy=False only imputes in place
# "whenever possible"; if scikit-learn has to copy the DataFrame's data, df1
# would otherwise silently keep its NaNs. A shape mismatch here raises rather
# than passing unnoticed.
df1.iloc[:, :] = df1_imputed
df1_imputed

array([[ -5.90070664e+00,   4.69388657e-01,   4.73306785e+00, ...,
          4.59280348e+00,  -3.44776760e-02,   3.68040134e+00],
       [ -3.53039065e+00,   4.22981797e+00,  -4.61943288e+00, ...,
         -5.46246671e+00,  -2.29517509e-01,  -2.33294711e+00],
       [  1.00338855e+00,  -1.25474570e-02,   6.25050323e+00, ...,
         -1.14070719e+01,   6.39555117e+00,   3.54533178e+00],
       ..., 
       [  1.71043472e+01,   1.08815589e+01,   9.25710482e+00, ...,
          6.07498027e+00,  -2.24617902e-01,   5.16245808e+00],
       [  1.03181728e+01,  -8.79384077e+00,  -2.67099784e+00, ...,
         -7.78997798e+00,   1.60228442e-01,  -2.71319396e+00],
       [ -3.75580866e+00,   6.22956464e+00,  -1.28682043e+00, ...,
          6.10818001e+01,   1.34704559e+00,  -7.67326474e-01]])

In [368]:
# Re-attach the y column that was popped before imputation.
df1['y'] = y_df1

### df2: Numerical: Median, Categorical: Most Frequent

In [369]:
# Detach the y column so it is not passed through the imputer; it is re-attached after imputation.
y_df2 = df2.pop('y')

In [370]:
# Replace missing numeric values in df2 with the column median.
medianImputer = Imputer(strategy='median', copy=False)
df2_imputed = medianImputer.fit_transform(df2)
# Write the result back explicitly. copy=False only imputes in place
# "whenever possible"; if scikit-learn has to copy the DataFrame's data, df2
# would otherwise silently keep its NaNs. A shape mismatch here raises rather
# than passing unnoticed.
df2.iloc[:, :] = df2_imputed
df2_imputed

array([[ -5.90070664e+00,   4.69388657e-01,   4.73306785e+00, ...,
          4.59280348e+00,  -3.44776760e-02,   3.68040134e+00],
       [ -3.53039065e+00,   4.22981797e+00,  -4.61943288e+00, ...,
         -5.46246671e+00,  -2.29517509e-01,  -2.33294711e+00],
       [  1.00338855e+00,  -1.25474570e-02,   6.25050323e+00, ...,
         -1.14070719e+01,   6.39555117e+00,   3.54533178e+00],
       ..., 
       [  1.71043472e+01,   1.08815589e+01,   9.25710482e+00, ...,
          6.07498027e+00,  -2.24617902e-01,   5.16245808e+00],
       [  1.03181728e+01,  -8.79384077e+00,  -2.67099784e+00, ...,
         -7.78997798e+00,   1.60228442e-01,  -2.71319396e+00],
       [ -3.75580866e+00,   6.22956464e+00,  -1.28682043e+00, ...,
          6.10818001e+01,   1.34704559e+00,  -7.67326474e-01]])

In [371]:
# Re-attach the y column that was popped before imputation.
df2['y'] = y_df2

### Convert Categorical Variables to Dummies & Append to Numeric Dataframes

In [372]:
# One-hot encode the categorical frame (month, weekday, region) into indicator columns.
X_dummies = pd.get_dummies(X_categorical)

In [373]:
# Preview the first rows only; rendering the whole frame (~198k rows) bloats
# the notebook without adding information.
df0.head(10)

Unnamed: 0,x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15,x17,x18,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42,x44,x45,x46,x47,x48,x49,y
0,-5.900707,0.469389,4.733068,9.223027,3.170787,0.024669,-2.055880,-2.045466,3.004170,0.0002,-4.685655,5.895884,12.235718,-17.658996,-19.322599,1.978951,0.831200,1.980735,-1.347138,0.320392,-1.177156,-2.704137,-14.103989,-5.655715,5.481989,-1.161496,-3.931582,7.643024,3.421198,3.762529,8.698367,-6.727415,-10.732802,-0.728671,0.018602,-32.680226,-0.106695,3.913680,-5.335641,-3.694592,-1.008670,180.16,1.236134,4.462951,4.592803,-0.034478,3.680401,0.0
1,-3.530391,4.229818,-4.619433,2.154913,5.619903,-1.955667,7.713001,-1.279395,14.941269,0.0000,-3.103170,0.982095,3.753298,7.537105,5.942761,0.088104,0.022300,-5.310811,0.898115,-1.935222,-0.398599,-0.979549,-10.746822,2.507350,1.280838,3.452034,0.624896,0.426659,2.874107,18.712973,7.312360,-15.853969,-1.455398,0.236479,-3.252878,-1.289889,14.377643,6.201034,-8.490805,6.414948,5.831287,432.31,-6.570015,-5.308026,-5.462467,-0.229518,-2.332947,1.0
2,1.003389,-0.012547,6.250503,3.368675,1.711321,6.192718,4.166159,-0.363055,15.175681,-0.0001,-5.424805,7.969230,-16.521783,-10.213318,9.272947,1.128275,0.725844,-10.495085,4.546223,2.019324,4.339569,-0.205385,-15.868465,-0.501901,2.002275,0.225154,-0.882617,-14.174509,-14.707616,19.006559,2.732869,-35.200541,-21.825129,0.365506,-5.381501,16.668153,-7.932493,-0.397368,-4.172583,0.371862,-7.025987,119.17,-66.851394,-11.084558,-11.407072,6.395551,3.545332,0.0
3,-16.041769,-3.243507,-0.546649,-11.594427,5.111965,0.609785,-1.540136,-1.227567,-16.460535,0.0000,5.624369,9.138195,2.538420,14.740934,10.200809,-2.862860,-0.358076,2.548242,-1.207699,-1.183008,-11.578898,-2.626346,15.628244,-16.420842,-6.891503,-4.772674,-0.172465,2.928158,-11.267913,-20.615756,5.138457,20.096343,11.691170,0.442023,-0.380077,29.446823,10.975024,1.148454,1.666316,-2.920478,-2.135088,1309.54,0.442939,11.808383,12.151957,-2.923916,-1.471336,0.0
4,20.498575,10.766191,1.478032,-5.723594,-0.768839,-3.482515,-11.169389,-0.361478,2.194909,-0.0002,1.331710,13.616031,2.503163,-21.545901,-31.082999,0.964522,-0.340416,-4.412438,-0.504193,0.198800,-1.763387,3.767706,-13.312645,-6.421953,-3.401994,-2.892450,-5.740850,7.334497,0.836239,2.748982,7.637753,-15.819318,-7.478481,0.470682,-2.954060,-10.051513,4.656588,-5.186225,-0.757455,-1.769904,-0.532896,1103.32,61.136654,48.077895,49.476758,1.043061,9.927952,1.0
5,-0.830855,-4.026283,-2.961508,-14.230334,-4.052836,0.486097,5.562573,-1.111238,0.673142,-0.0001,2.264092,12.986227,-0.607595,16.967915,36.167907,-5.008706,0.181912,-16.087763,2.555256,-1.266805,-22.834668,-0.826063,1.419909,-3.410898,-8.458235,5.619322,-8.455152,-2.409589,-1.721869,0.843066,-4.070249,-14.508538,0.017890,1.709389,-3.304359,63.047442,-7.192511,-5.705965,-0.095350,-0.264656,-9.053090,572.55,-22.080144,19.977108,20.558357,3.800816,2.070705,0.0
6,-25.664561,9.442661,4.179044,-6.127033,9.789203,2.792734,6.647490,-1.075430,5.730019,0.0000,-4.530808,-3.014907,-9.968481,10.439753,3.167594,2.655570,-0.218175,-15.568244,-7.576256,0.645758,3.769370,-4.350432,-9.658014,1.513447,-3.641790,-4.438467,-0.583134,-12.712065,-3.613269,7.176478,-0.015860,-19.912903,4.551199,-0.085954,-6.577846,36.173496,-14.135372,6.551475,0.817773,6.152542,-2.385970,2.07,-24.201930,26.247495,27.011185,-0.786127,4.876578,0.0
7,14.727019,-4.769810,0.545849,-14.474038,3.551106,0.135631,-17.896485,-1.078326,0.863689,0.0001,-1.144070,9.929508,-0.801430,20.895282,23.943976,-2.283730,-0.328941,-1.730728,-1.422557,2.677431,-0.177821,-0.848249,11.706081,-7.193009,-8.603089,0.470595,-5.671523,-5.477891,-4.603916,1.081714,-10.812646,-15.578844,-2.731962,0.644438,-4.396941,57.260435,-15.601965,-1.460882,-1.108780,10.641338,8.232490,455.82,-3.934082,21.175780,21.791905,-8.193265,5.359723,0.0
8,11.340836,1.756355,3.809453,9.021020,10.804382,-1.448350,8.786048,2.908056,11.322027,0.0000,-3.085007,2.036985,9.705944,6.028328,4.319063,-13.067015,0.522621,0.013919,4.354501,-1.543508,3.236829,2.562643,-15.890755,1.391232,5.361920,8.752352,1.952422,4.898276,-8.145477,14.180107,3.099720,-18.099858,3.709307,-0.183672,-7.177636,-4.056117,-11.271334,8.816055,6.459969,4.518976,4.910201,1151.57,17.153284,6.926340,7.127867,-5.727830,-2.863172,0.0
9,15.281702,4.187856,-1.403868,-1.072465,0.804286,-2.001035,2.026249,2.220199,8.607572,-0.0001,0.248389,-0.270010,9.859761,-5.076034,5.719406,6.252716,-0.235326,-3.144990,-5.060363,1.283294,1.222724,10.391263,-10.721060,1.468960,-0.637453,-5.938025,-1.510768,-14.961053,-0.919696,10.780427,1.509052,-19.187487,-3.194786,-2.164356,3.429701,12.089005,3.370316,8.389640,-3.415456,2.595894,10.662026,381.92,16.374766,19.926241,20.506010,2.298746,-0.288104,0.0
