In [None]:
# standard libraries
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import os
import re

# plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# get the datetime library for date & time calcs
from datetime import datetime, timedelta

# for handling excel
from openpyxl import Workbook
from openpyxl import load_workbook

# lifelines for survival analysis
import lifelines as lf

In [None]:
# Work from the churn-model folder so that relative file names used later in the
# notebook (for example the CSV export in the last cell) resolve against it.
working_dir = os.path.normpath('C:/Users/n846490/Documents/Python Scripts/SurvivalAnalysis/ChurnModelFiles/')
os.chdir(working_dir)

In [None]:
# Load the csat table; the first CSV column is used as the row index.
url = os.path.normpath('C:/Users/n846490/Documents/Python Scripts/SurvivalAnalysis/ChurnModelFiles/csatAttritionForSurvival.csv')
csat = pd.read_csv(filepath_or_buffer=url, index_col=0)

In [None]:
# Summarise the freshly loaded table (column dtypes and non-null counts).
csat.info()

In [None]:
# Parse the three date columns (End_Date, SurveyDate, DateLeft) into datetimes.
# All of them are expected in YYYY-MM-DD form.
for date_column in ('End_Date', 'SurveyDate', 'DateLeft'):
    csat[date_column] = pd.to_datetime(csat[date_column], format='%Y-%m-%d')

In [None]:
# Time from the survey to attrition: first as a timedelta, then as a day count.

# NOTE(review): columns 9 and 1 are picked by position - presumably DateLeft and
# SurveyDate; confirm against the CSV layout before relying on this.
csat['SurveyTenure'] = csat.iloc[:,9] - csat.iloc[:,1]

# Convert the timedelta to days by dividing by a one-day Timedelta.
# The earlier `.astype('timedelta64[h]') / 24` route is rejected by pandas >= 2.0
# (only s/ms/us/ns resolutions are accepted) and looked SurveyTenure up by
# position (column 12). The dates above are parsed as whole days, so the result
# is unchanged; rows with a missing date still come out as NaN.
csat['DaysToLeave'] = csat['SurveyTenure'] / pd.Timedelta(days=1)

csat.head()

In [None]:
# Customers that left the bank (LeftBank == 1), copied so that later edits to
# `left` cannot touch `csat`.
left_mask = csat['LeftBank'] == 1
left = csat[left_mask].copy()

left.head()

In [None]:
# Column dtypes and non-null counts for the customers that left.
left.info()

In [None]:
# Keep only rows with a strictly positive DaysToLeave; zero, negative and
# missing values all fail the comparison and are dropped.
positive_days = left['DaysToLeave'].gt(0)
cleanLeft = left[positive_days]

cleanLeft.head()

In [None]:
# Reshape to long form for plotting: the identifier columns below are kept as-is,
# every other column becomes one row whose former column name goes into
# 'Satisfaction' and whose cell content goes into 'value'.
id_columns = ['CustID', 'SurveyDate', 'MonthYear', 'DateLeft', 'LeftBank', 'End_Date', 'SurveyTenure', 'DaysToLeave']

meltLeft = pd.melt(cleanLeft, id_vars=id_columns, var_name='Satisfaction')
meltLeft.head()

In [None]:
# Order the melted rows by customer id to eyeball the melt, then renumber the index.
meltLeft = meltLeft.sort_values(by='CustID').reset_index(drop=True)

meltLeft.head(20)

In [None]:
# Multiply the melted value by the days to leave, row by row.
# NOTE(review): `value` is presumably a 0/1 flag per satisfaction level, which
# would leave DaysToLeave only on the row of the selected level - confirm.
meltLeft['SatisfactionDays'] = meltLeft['value'].mul(meltLeft['DaysToLeave'])

In [None]:
# Preview the first 20 rows to check the new SatisfactionDays column.
meltLeft.head(20)

In [None]:
# The raw melted value is no longer needed once SatisfactionDays exists.
meltLeft = meltLeft.drop('value', axis=1)

meltLeft.head()

In [None]:
# Drop the columns that are not needed downstream (LeftBank and SurveyTenure).
# The columns are selected by name instead of by position ([0,1,2,3,5,7,8,9]),
# so the cell stays correct if it is re-run or the upstream column order changes.
keep_columns = ['CustID', 'SurveyDate', 'MonthYear', 'DateLeft', 'End_Date',
                'DaysToLeave', 'Satisfaction', 'SatisfactionDays']

meltRed = meltLeft[keep_columns].copy()

meltRed.head()

In [None]:
# Keep rows whose SatisfactionDays is non-zero and whose satisfaction level is
# not 'Unknown'; copy so that later column assignments work on an independent frame.
has_days = meltRed['SatisfactionDays'] != 0
known_level = meltRed['Satisfaction'] != 'Unknown'

cleanMelt = meltRed[has_days & known_level].copy()
cleanMelt.head(20)

In [None]:
# Sanity check: print how many rows have a negative SatisfactionDays.
# (original note: there are none)
negative_rows = cleanMelt[cleanMelt['SatisfactionDays'] < 0]

print(len(negative_rows))

In [None]:
# Convert Satisfaction to an ordered categorical. The category order is the list
# order below; any value not in the list becomes missing.
labels = [
    'Highly Satisfied',
    'Satisfied',
    'Neutral',
    'Dissatisfied',
    'Highly Dissatisfied',
]

ordered_levels = pd.Categorical(cleanMelt['Satisfaction'], categories=labels, ordered=True)
cleanMelt['Satisfaction'] = ordered_levels

# leave out the unknown
# cleanMelt.drop(['Satisfaction'] == 'Unknown', axis = 0, inplace = True)

# dtest.numdept.cat.remove_unused_categories()
# this removes the unused category

# cleanMelt.Satisfaction.cat.remove_unused_categories()


In [None]:

sns.set(font_scale=1.25)

# Violin plot of SatisfactionDays for each satisfaction level, y axis capped at 1200.
# seaborn 0.9 renamed `factorplot` to `catplot` and its `size` argument to
# `height`; the old spellings were removed in later releases, so the current
# names are used here (requires seaborn >= 0.9).
p = sns.catplot(data = cleanMelt, x = 'Satisfaction', y = 'SatisfactionDays', hue = 'Satisfaction', kind = 'violin', height = 6, aspect = 2.5)
p.set(ylim=(0, 1200))

In [None]:
# Look at the same data as overlaid density curves, one per satisfaction level.

sns.set(font_scale=1.5)

# `height` is the current name of FacetGrid's former `size` argument
# (renamed in seaborn 0.9, old name removed later).
# NOTE: `fig` is a seaborn FacetGrid here, not a matplotlib Figure.
fig = sns.FacetGrid(data=cleanMelt, hue='Satisfaction', height = 5, aspect = 2.5)

# NOTE(review): newer seaborn releases deprecate kdeplot's `shade` in favour of
# `fill` - switch once the minimum supported seaborn version allows it.
fig.map(sns.kdeplot,'SatisfactionDays', shade = True, alpha = .6)

fig.set(xlim=(0,1400))

fig.add_legend(title='Satisfaction')

In [None]:
# make a table from days to leave

# get the stats

def get_stats(group):
    """Summarise one group of values.

    Parameters
    ----------
    group : object exposing ``median()``, ``max()``, ``count()`` and ``mean()``
        (for example a pandas Series handed over by ``groupby(...).apply``).

    Returns
    -------
    dict
        The four statistics keyed 'median', 'max', 'count' and 'mean',
        in that order.
    """
    summary = {}
    for stat_name in ('median', 'max', 'count', 'mean'):
        summary[stat_name] = getattr(group, stat_name)()
    return summary


# Apply get_stats to DaysToLeave within each satisfaction level, then unstack
# the result so the statistics sit side by side.
days_by_level = cleanMelt['DaysToLeave'].groupby(cleanMelt['Satisfaction'])

days_by_level.apply(get_stats).unstack()

In [None]:
# Descriptive statistics of DaysToLeave per satisfaction level.
# NOTE(review): from pandas 0.20 on, groupby(...).describe() already returns one
# row per group, so the trailing unstack() reshapes it into a long Series -
# confirm that this is the intended display.
level_description = cleanMelt.groupby('Satisfaction')['DaysToLeave'].describe()

level_description.unstack()

In [None]:
# One histogram of SatisfactionDays per satisfaction level, each on its own panel
# of a 3 x 2 grid (the last level spans the whole bottom row).

fig = plt.figure(figsize = (15,8))

# get the separate series, one per satisfaction level
levels = ['Highly Satisfied', 'Satisfied', 'Neutral', 'Dissatisfied', 'Highly Dissatisfied']
x1, x2, x3, x4, x5 = [cleanMelt[cleanMelt['Satisfaction'] == level]['SatisfactionDays']
                      for level in levels]

n_bins = 100

# cut the grid into 3 rows and 2 columns with a large one at the bottom
ax0 = plt.subplot2grid((3,2), (0,0), rowspan = 1, colspan = 1)  # Upper left
ax1 = plt.subplot2grid((3,2), (0,1), rowspan = 1, colspan = 1)  # Upper Right
ax2 = plt.subplot2grid((3,2), (1,0), rowspan = 1, colspan = 1)  # Middle Left
ax3 = plt.subplot2grid((3,2), (1,1), rowspan = 1, colspan = 1)  # Middle Right
ax4 = plt.subplot2grid((3,2), (2,0), rowspan = 1, colspan = 2)  # Entire Bottom

# (axes, data, title, colour) for every panel, drawn by one loop instead of five
# copy-pasted blocks.
# `normed=0` was dropped from the hist calls: un-normalised counts are the
# default, and the `normed` keyword no longer exists in current matplotlib.
panels = [
    (ax0, x1, 'Highly Satisfied', 'blue'),
    (ax1, x2, 'Satisfied', 'forestgreen'),
    (ax2, x3, 'Neutral', 'tan'),
    (ax3, x4, 'Dissatisfied', 'orange'),
    (ax4, x5, 'Highly Dissatisfied', 'red'),
]
for ax, days, title, color in panels:
    ax.hist(days, n_bins, histtype='bar', color=color)
    ax.set_title(title)

# only the bottom panel gets an explicit x range
ax4.set_xlim(0,600)

fig.tight_layout()
plt.show()


In [None]:
# Stacked histogram of all satisfaction levels on top, followed by one histogram
# per level, on a 4 x 2 grid.

sns.set(font_scale=1.25)

fig = plt.figure(figsize = (15,10))

labels = ['Highly Satisfied', 'Satisfied', 'Neutral', 'Dissatisfied', 'Highly Dissatisfied']
colors = ['blue', 'forestgreen', 'tan', 'orange', 'red']

# get the separate series, one per satisfaction level
x1, x2, x3, x4, x5 = [cleanMelt[cleanMelt['Satisfaction'] == level]['SatisfactionDays']
                      for level in labels]

n_bins = 100

# combine them for multiplotting
# ignore_index=True labels the columns with the integers 0..4, so the rename map
# needs integer keys; the former string keys ('0', '1', ...) matched nothing and
# the columns silently kept their numeric names.
xnames = dict(enumerate(labels))

x = pd.concat([x1, x2, x3, x4, x5], axis = 1, ignore_index = True)
x.rename(columns=xnames, inplace=True)


# cut the grid into 4 rows and 2 columns with full-width panels at the top and bottom
ax0 = plt.subplot2grid((4,2), (0,0), rowspan = 1, colspan = 2)  # Entire Top
ax1 = plt.subplot2grid((4,2), (1,0), rowspan = 1, colspan = 1)  # Upper left
ax2 = plt.subplot2grid((4,2), (1,1), rowspan = 1, colspan = 1)  # Upper Right
ax3 = plt.subplot2grid((4,2), (2,0), rowspan = 1, colspan = 1)  # Middle Left
# colspan was 2 here, which overruns the 2-column grid when starting in column 1
ax4 = plt.subplot2grid((4,2), (2,1), rowspan = 1, colspan = 1)  # Middle Right
ax5 = plt.subplot2grid((4,2), (3,0), rowspan = 1, colspan = 2)  # Entire Bottom


# the first is a multiplot of all series and then the rest are individual
# the NaN gives problems with the combined plot

x.plot.hist(ax = ax0, bins = n_bins, color = colors, alpha=0.6, histtype='bar', stacked = True)
ax0.legend(prop={'size': 12}, labels = labels)
ax0.set_xlim(0,1200)
ax0.set_title('All Satisfaction')

# `normed=0` was dropped from the hist calls: un-normalised counts are the
# default, and the `normed` keyword no longer exists in current matplotlib.
for ax, days, title, color in zip((ax1, ax2, ax3, ax4, ax5), (x1, x2, x3, x4, x5), labels, colors):
    ax.hist(days, n_bins, histtype='bar', color=color)
    ax.set_title(title)

# only the bottom panel gets an explicit x range
ax5.set_xlim(0,600)

fig.tight_layout()
# fig.suptitle('Distribution of Days to Attrition', fontsize=18)

# plt.subplots_adjust(top=0.92)

plt.show()


# subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None)

# left  = 0.125  # the left side of the subplots of the figure
# right = 0.9    # the right side of the subplots of the figure
# bottom = 0.1   # the bottom of the subplots of the figure
# top = 0.9      # the top of the subplots of the figure
# wspace = 0.2   # the amount of width reserved for blank space between subplots
# hspace = 0.2   # the amount of height reserved for white space between subplots

In [None]:
# Histogram plus density curve (sns.distplot) of SatisfactionDays for each
# satisfaction level, each on its own panel of a 3 x 2 grid.

fig = plt.figure(figsize = (15,8))

# get the separate series, one per satisfaction level
x1, x2, x3, x4, x5 = [
    cleanMelt[cleanMelt['Satisfaction'] == level]['SatisfactionDays']
    for level in ('Highly Satisfied', 'Satisfied', 'Neutral', 'Dissatisfied', 'Highly Dissatisfied')
]

n_bins = 100

# two small panels on each of the first two rows, one wide panel at the bottom
grid_shape = (3,2)
ax0 = plt.subplot2grid(grid_shape, (0,0), rowspan = 1, colspan = 1)  # Upper left
ax1 = plt.subplot2grid(grid_shape, (0,1), rowspan = 1, colspan = 1)  # Upper Right
ax2 = plt.subplot2grid(grid_shape, (1,0), rowspan = 1, colspan = 1)  # Middle Left
ax3 = plt.subplot2grid(grid_shape, (1,1), rowspan = 1, colspan = 1)  # Middle Right
ax4 = plt.subplot2grid(grid_shape, (2,0), rowspan = 1, colspan = 2)  # Entire Bottom

# (axes, data, title, colour, upper x limit) for every panel
panels = [
    (ax0, x1, 'Highly Satisfied', 'blue', 1200),
    (ax1, x2, 'Satisfied', 'forestgreen', 1200),
    (ax2, x3, 'Neutral', 'tan', 1200),
    (ax3, x4, 'Dissatisfied', 'orange', 1200),
    (ax4, x5, 'Highly Dissatisfied', 'red', 800),
]
for ax, days, title, color, x_max in panels:
    sns.distplot(days, color=color, bins=n_bins, ax=ax)
    ax.set_xlim(0, x_max)
    ax.set_title(title)

fig.tight_layout()
plt.show()


In [None]:
# Back to the full table (all customers, not only those that left).
csat.head()

In [None]:
# Summary statistics of the SurveyTenure column on the full table.
csat.SurveyTenure.describe()

In [None]:
# need satisfaction as a category for survival
# first drop the unknown satisfaction column (position 8 is the one left out)
#
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0; `.iloc` is the
# purely positional equivalent for this integer list.
# NOTE(review): the selection is positional - confirm that column 8 really is the
# 'Unknown' satisfaction column in the CSV layout.
csatRed = csat.iloc[:, [0,1,2,3,4,5,6,7,9,10,11,12,13]]

csatRed.head()


In [None]:
# Reshape the full table to long form: the identifier columns stay, every other
# column becomes one row with its former name in 'Satisfaction' and its cell
# content in 'value'.
id_columns = ['CustID', 'SurveyDate', 'MonthYear', 'DateLeft', 'LeftBank', 'End_Date', 'SurveyTenure', 'DaysToLeave']

meltCsat = pd.melt(csatRed, id_vars=id_columns, var_name='Satisfaction')
meltCsat.head()

In [None]:
# Order the melted rows by customer id to eyeball the melt, then renumber the index.
meltCsat = meltCsat.sort_values(by='CustID').reset_index(drop=True)

meltCsat.head(20)

In [None]:
# helper that replaces a missing day count by 0 and casts everything else to int
def clean_days(row):
    """Return a day count as a plain ``int``, mapping missing values to 0.

    Parameters
    ----------
    row : scalar
        A single value (despite the parameter name); the notebook applies this
        function element-wise to the DaysToLeave column.

    Returns
    -------
    int
        0 when ``pd.isnull(row)`` is true, otherwise ``int(row)``
        (which truncates towards zero).
    """
    return int(0) if pd.isnull(row) else int(row)


# Run clean_days over every DaysToLeave value: missing entries become 0 and the
# remaining ones are cast to int.
days_without_gaps = meltCsat['DaysToLeave'].apply(clean_days)
meltCsat['DaysToLeave'] = days_without_gaps

meltCsat.head()


In [None]:
# Multiply the melted value by the cleaned day count, row by row.
meltCsat['SatisfactionDays'] = meltCsat['value'].mul(meltCsat['DaysToLeave'])

# Keep only the rows whose melted value is non-zero; the `value` column itself
# stays in the frame (it is not dropped here).
nonzero_value = meltCsat['value'] != 0
cleanCsat = meltCsat[nonzero_value].copy()

cleanCsat.head(20)

In [None]:
# Show the current working directory (relative file names resolve against it).
os.getcwd()

In [None]:
# cleanCsat.to_csv('CsatSurvivalRaw.csv')

In [None]:
# Turn Satisfaction into an ordered categorical. The category order is the list
# order below; any value not in the list becomes missing.
labels = [
    'Highly Satisfied',
    'Satisfied',
    'Neutral',
    'Dissatisfied',
    'Highly Dissatisfied',
]

ordered_levels = pd.Categorical(cleanCsat['Satisfaction'], categories=labels, ordered=True)
cleanCsat['Satisfaction'] = ordered_levels

In [None]:
# Summary statistics of SurveyTenure after the melt and filtering.
cleanCsat.SurveyTenure.describe()

In [None]:
# Time from the survey date to End_Date (End_Date minus SurveyDate), as a timedelta.
# The columns are addressed by name: the former positional lookup (columns 5 and 1)
# only worked because the melt places its id_vars first in the listed order.
cleanCsat['NotAttrite'] = cleanCsat['End_Date'] - cleanCsat['SurveyDate']


In [None]:
# Column dtypes and non-null counts, including the new NotAttrite column.
cleanCsat.info()

In [None]:
# Check the dtype of NotAttrite before converting it to a number of days.
cleanCsat.NotAttrite.dtype

In [None]:
# Express NotAttrite as a number of days (not years) by dividing by a one-day
# Timedelta. The earlier `.astype('timedelta64[h]') / 24` route is rejected by
# pandas >= 2.0, which only accepts s/ms/us/ns resolutions. NotAttrite is a
# difference of whole dates, so the values are unchanged.
cleanCsat['NotAttriteClean'] = cleanCsat['NotAttrite'] / pd.Timedelta(days=1)

cleanCsat.head()

In [None]:
def get_days(row):
    """Pick the tenure, in days, to use for one row.

    A DaysToLeave of 0 is this notebook's marker for an account that is still
    open (clean_days maps missing day counts to 0). For such rows the value of
    NotAttriteClean is used; otherwise DaysToLeave is returned unchanged.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['DaysToLeave']`` and ``row['NotAttriteClean']``
        (for example one DataFrame row passed by ``apply(..., axis=1)``).

    Returns
    -------
    The value of ``row['NotAttriteClean']`` when ``row['DaysToLeave'] == 0``,
    else ``row['DaysToLeave']``.
    """
    still_open = row['DaysToLeave'] == 0
    return row['NotAttriteClean'] if still_open else row['DaysToLeave']
        
        

In [None]:
# Apply get_days row by row (axis=1 hands each row to the function) to build
# the tenure column.
cleanCsat['CleanTenure'] = cleanCsat.apply(get_days, axis=1)
cleanCsat.head()

In [None]:
# Rows where LeftBank == 1, to inspect the tenure of customers that left.
left_bank_rows = cleanCsat['LeftBank'].eq(1)
test = cleanCsat[left_bank_rows]
test.head()

In [None]:
# A spelling error ('Obsered') made this extra clean-up step necessary:
# rename the misspelled column to 'Observed'.
# Alternative mapping that was tried: observed = {'LeftBank' : 'Observed'}
# NOTE(review): none of the visible cells creates an 'Obsered' column, and
# rename() ignores mapping keys that are absent, so on a fresh run this looks
# like a no-op - confirm whether LeftBank was meant to be renamed instead.

observed = {'Obsered' : 'Observed'}

cleanCsat.rename(columns=observed, inplace=True)

cleanCsat.head()

In [None]:
# Show the current working directory; the export below is written relative to it.
os.getcwd()

In [None]:
# Write the cleaned dataset to the current working directory; the row index is
# written as the first CSV column.
export_name = 'cleanCsatData.csv'
cleanCsat.to_csv(export_name)