In [None]:
# standard libraries
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from sklearn.preprocessing import Imputer
import os
import re

# plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# get the datetime library for date & time calcs
from datetime import datetime, timedelta

In [None]:
# Make the project folder the working directory so that relative paths
# (the CSV export further down uses a bare file name) resolve against it.
# NOTE(review): this is a machine-specific absolute path -- confirm it exists
# before running the notebook anywhere else.
os.chdir(os.path.normpath('C:/Users/n846490/Documents/Python Scripts/SurvivalAnalysis/ChurnModelFiles/'))

In [None]:
# Load the cleaned attrition extract; the first CSV column becomes the row index.
url = os.path.normpath('C:/Users/n846490/Documents/Python Scripts/SurvivalAnalysis/ChurnModelFiles/AllAttritionClean.csv')
data = pd.read_csv(url, index_col=0)
# Preview the first rows to confirm the load.
data.head()

In [None]:
# Pull out the five columns that still need cleaning and summarise them,
# one row per column.
colsToTransform = data[['EHI', 'TotAssets', 'EqiScore', 'Age', 'Custyears']]

colsToTransform.describe().T

In [None]:
########  Impute missing values (NaN), column by column
#
# Median imputation (zeros in these two columns are handled in later cells):
#   EHI, TotAssets
# Mean imputation:
#   EqiScore  - mean is lower than the median
#   Age       - mean and median nearly identical
#   Custyears - older accounts likely to have more error
#
# Columns are addressed by name rather than by position so this cell keeps
# working if the column order of the CSV changes.  pandas' own mean() and
# median() (both skip NaN) are used instead of sklearn's Imputer class, which
# was deprecated and is no longer shipped in current scikit-learn releases.

colsToMean = ['EqiScore', 'Age', 'Custyears']
colsToMedian = ['EHI', 'TotAssets']

# fillna with a Series fills each column with its own statistic
data[colsToMean] = data[colsToMean].fillna(data[colsToMean].mean())
data[colsToMedian] = data[colsToMedian].fillna(data[colsToMedian].median())

# now check the transformations by viewing the distributions
newCols = data[['EHI', 'TotAssets', 'EqiScore', 'Age', 'Custyears']]

newCols.describe().transpose()

In [None]:
# Distribution of EHI, with the x-axis limited to 0 - 500,000

plt.figure(figsize=(10, 5))
sns.set(font_scale=1.25)

# distplot hands back the Axes it drew on, so `fig` ends up bound to that Axes
fig = sns.distplot(data['EHI'], bins=1000)
fig.set(xlim=(0, 500000))

plt.show()

In [None]:
# Distribution of TotAssets, with the x-axis limited to 0 - 500,000

plt.figure(figsize=(10, 6))
sns.set(font_scale=1.25)

# distplot hands back the Axes it drew on, so `fig` ends up bound to that Axes
fig = sns.distplot(data['TotAssets'], bins=1000)
fig.set(xlim=(0, 500000))

plt.show()

In [None]:
def clean_TotAssets_Zero(row):
    """Swap an exact zero for 0.01 so a later log transform is defined.

    ``row`` is a single value (one element of the column).  Any value other
    than zero -- negative numbers included -- is returned unchanged.
    """
    return float(0.01) if row == 0 else row
    
# Apply the zero fix element-wise to TotAssets, then show summary statistics
# for the cleaned column.
data['TotAssets'] = data['TotAssets'].apply(clean_TotAssets_Zero)

data.TotAssets.describe().transpose()

In [None]:
# clean up the EHI

# Median (50th percentile) of EHI; clean_EHI_zero substitutes it for zeros.
# NOTE(review): the median is taken over every row, including the zero
# entries that are about to be replaced -- confirm that is intended.
replace_value = data['EHI'].quantile(0.50)

def clean_EHI_zero(row, fill_value=None):
    """Replace an exact zero EHI value with a substitute.

    Parameters
    ----------
    row : scalar
        A single EHI value (one element of the column, despite the name).
    fill_value : scalar, optional
        Substitute used for zeros.  When omitted, the module-level
        ``replace_value`` is looked up at call time, so existing callers
        such as ``Series.apply(clean_EHI_zero)`` behave as before.

    Returns
    -------
    ``float(fill_value)`` when ``row == 0``; otherwise ``row`` unchanged.
    """
    if row == 0:
        if fill_value is None:
            # fall back to the notebook-level median computed before this def
            fill_value = replace_value
        return float(fill_value)
    return row
    
# Apply the zero replacement element-wise to EHI, then show summary
# statistics for the cleaned column.
data['EHI'] = data['EHI'].apply(clean_EHI_zero)

data.EHI.describe().transpose()

In [None]:
# Remove the VinScore column in place, then summarise the remaining numeric columns.
# NOTE: because the drop is in place, re-running this cell raises KeyError
# (the column is already gone).
data.drop('VinScore', axis = 1, inplace = True)

data.describe().transpose()

In [None]:
# Check dtypes and non-null counts after the cleaning steps above.
data.info()

In [None]:
# Swap EHI and TotAssets for their natural-log versions

data['LogEHI'] = np.log(data['EHI'])
data['LogAssets'] = np.log(data['TotAssets'])

# the raw columns are not kept once the log columns exist
data.drop(['EHI', 'TotAssets'], axis=1, inplace=True)

data.describe().T

In [None]:
# Move the LeftBank outcome column to the far right of the frame

# keep the outcome aside as a one-column DataFrame ...
Outcome = data[['LeftBank']].copy()

# ... remove it from the feature columns ...
data.drop('LeftBank', axis=1, inplace=True)

# ... and stitch features and outcome back together, outcome last
frames = [data, Outcome]
data = pd.concat(frames, axis=1)

data.head()

In [None]:
# Show the working directory that relative file names (e.g. the CSV export) resolve against.
os.getcwd()

In [None]:
# Re-check columns and dtypes now that LeftBank has been moved to the end.
data.info()

In [None]:
# create an indicator for footprint

# create a condition to check

def footprint_col(colCheck):
    """Flag whether a state code belongs to the footprint.

    Returns 1 when ``colCheck`` is exactly one of ME, VT, NH, MA, RI, CT,
    NY, NJ or PA (case-sensitive), otherwise 0.
    """
    footprint = ['ME', 'VT', 'NH', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA']
    return 1 if colCheck in footprint else 0

In [None]:
# Apply footprint_col to every State value to build the 0/1 Footprint column,
# then preview the result.

data['Footprint'] = data['State'].apply(footprint_col)

data.head()

In [None]:
# get the numeric columns only
#
# select_dtypes is the public pandas API for this.  bool is included and
# timedelta excluded to mirror what DataFrame._get_numeric_data() selects.
# The explicit copy gives mlData its own data, so in-place edits to mlData
# cannot warn about, or leak into, `data`.
mlData = data.select_dtypes(include=['number', 'bool'], exclude=['timedelta']).copy()

In [None]:
# List the columns and dtypes that made it into the numeric-only frame.
mlData.info()

In [None]:
# Drop the OpenAccts column from the modelling frame.
# (Original note: in most cases the closed count equals the number of accounts.)

# rename template:
# data.rename(columns={'gdp':'log(gdp)'}, inplace=True)

# Disabled alternatives left in place:
# mlData.rename(columns = {'NumProducts': 'MaxNumProd'}, inplace = True)
# mlData.drop('Closed', axis = 1, inplace = True)
# mlData.drop('Open', axis = 1, inplace = True)

# NOTE: after this line mlData has no OpenAccts column.
mlData.drop('OpenAccts', axis = 1, inplace = True)



In [None]:
# Export the numeric modelling frame. The file name is relative, so the file
# lands in the current working directory.
mlData.to_csv('CleanAttritionDataForML.csv')

In [None]:
# check the footprint distribution: number of rows with Footprint == 0 vs 1

mlData.groupby(['Footprint']).agg({'Footprint' : pd.Series.count})

In [None]:
# get a view of the closed accounts (rows where LeftBank == 1)
# template: mask = (checkingData['DateOpened'] > '2013-12-31')
#           newChecking = checkingData.loc[mask]

mask = (mlData['LeftBank'] == 1)

closed = mlData.loc[mask].copy()

# OpenAccts is dropped from mlData earlier in the notebook, but the cells
# below plot and count closed['OpenAccts'].  Bring it back from `data`, which
# still carries the column.  The same mask selects the same rows in the same
# order from both frames, so a positional copy via .values is sufficient.
if 'OpenAccts' not in closed.columns:
    closed['OpenAccts'] = data.loc[mask, 'OpenAccts'].values

closed.info()

In [None]:
# Preview the first 25 closed-account rows.
closed.head(25)

In [None]:
# Preview the first 10 rows of the full modelling frame for comparison.
mlData.head(10)

In [None]:
# Distribution of OpenAccts among the closed accounts

plt.figure(figsize=(8, 4))
sns.set(font_scale=1.25)

# distplot hands back the Axes it drew on, so `fig` ends up bound to that Axes
fig = sns.distplot(closed['OpenAccts'], bins=11)

plt.show()

In [None]:
# Count closed-account rows for each distinct OpenAccts value.
closed.groupby(['OpenAccts']).agg({'OpenAccts' : pd.Series.count})