In [None]:
# standard libraries
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import os
import re
import gc

# plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# get the datetime library for date & time calcs
from datetime import datetime, timedelta

In [None]:
# NOTE(review): hardcoded, user-specific absolute Windows path. The relative file
# names used for reading and writing CSVs below resolve against this directory.
os.chdir(os.path.normpath('C:/Users/n846490/Documents/DigitalAnalytics/CheckingAnalysis/'))

In [None]:
# load the raw extract; the file name is relative to the current working directory
file = 'Openers_Jan13_to_Present.csv'
checking = pd.read_csv(file)

# preview the first rows
checking.head()

In [None]:
# the raw headers are source-system codes; map each one to a readable name
newNames = {
    'H1424_IDCENT': 'CostCenter',
    'H1424_IDPROD': 'ProductType',
    'H1424_IDCONTRN': 'ContractNumber',
    'H1424_FECCONTR': 'DateOpened',
    'H1424_FECBAJA': 'DateClosed',
    'H1424_TIPOPERS': 'PersonType',
    'H1424_CODPERS': 'PersonCode',
    'H1424_CDCANAL': 'Channel',
    'Category': 'Category',
    'H9776_NOMPROV1': 'State',
    'H1451_EDADPER': 'CustomerAge',
    'Cust_Start': 'CustomerStart',
}

# apply the mapping and rebind the name to the renamed frame
checking = checking.rename(columns=newNames)

# get a view of the data and headers
checking.head()

In [None]:
# column dtypes and non-null counts
checking.info()

In [None]:
# distinct values of the Category column
checking.Category.unique()

In [None]:
# check the nulls for state

checking['State'].isnull().sum()

In [None]:
# missing states get the placeholder 'NonUS'
checking['State'] = checking['State'].fillna('NonUS')

In [None]:
# remove the rows whose state is masked with stars (*******)

# regex matching values that begin with a literal '*'
patternDel = re.compile(r'^\*')

# boolean mask of the starred rows.
# na=False makes a missing state count as "not starred", so the mask is always
# boolean and the inversion below cannot fail on NaN.
# The mask is called star_mask rather than `filter` so the builtin is not shadowed.
star_mask = checking['State'].str.contains(patternDel, na=False)

# keep every row that is not starred
checking = checking[~star_mask]

In [None]:
# clean up the dates
# start by counting the missing DateOpened values

checking['DateOpened'].isnull().sum()


In [None]:
# keep only the rows that have a DateOpened value
has_open_date = checking['DateOpened'].notnull()
checking = checking[has_open_date]

In [None]:
# create a function to clean dates
def try_convert(bad_date):
    """Parse a day-month-year value such as ``'05JAN2015'``.

    Parameters
    ----------
    bad_date
        Value handed to ``pd.to_datetime`` with ``format='%d%b%Y'``.

    Returns
    -------
    Whatever ``pd.to_datetime`` returns for the value, or ``np.nan`` when
    the conversion raises a parsing or range error.
    """
    try:
        return pd.to_datetime(bad_date, format='%d%b%Y')
    # Only conversion failures are swallowed (pandas' out-of-bounds error is a
    # ValueError subclass). A bare except would also hide KeyboardInterrupt
    # and unrelated bugs.
    except (ValueError, TypeError, OverflowError):
        return np.nan

In [None]:
def try_convert_other(bad_date):
    """Parse a month/day/year value such as ``'01/05/2015'``.

    Parameters
    ----------
    bad_date
        Value handed to ``pd.to_datetime`` with ``format='%m/%d/%Y'``.

    Returns
    -------
    Whatever ``pd.to_datetime`` returns for the value, or ``np.nan`` when
    the conversion raises a parsing or range error.
    """
    try:
        return pd.to_datetime(bad_date, format='%m/%d/%Y')
    # Only conversion failures are swallowed (pandas' out-of-bounds error is a
    # ValueError subclass). A bare except would also hide KeyboardInterrupt
    # and unrelated bugs.
    except (ValueError, TypeError, OverflowError):
        return np.nan

In [None]:
# clean up the date fields to become dates

# DateOpened is parsed strictly: a malformed value raises here
checking['DateOpened'] = pd.to_datetime(checking['DateOpened'], format='%d%b%Y')

# DateClosed: any value that cannot be converted (the original note names the
# 31DEC9999 placeholder) becomes missing (NaT). errors='coerce' does this in a
# single vectorised call instead of a per-row apply followed by a second
# to_datetime pass.
checking['DateClosed'] = pd.to_datetime(checking['DateClosed'], format='%d%b%Y', errors='coerce')

# CustomerStart is month/day/year; unconvertible values also become NaT
checking['CustomerStart'] = pd.to_datetime(checking['CustomerStart'], format='%m/%d/%Y', errors='coerce')

checking.head()


In [None]:
# number of rows in the frame at this point
checking.shape[0]

In [None]:
def clean_channel(row):
    """Collapse a raw channel value into one of two buckets.

    Returns ``'Branch'`` when ``row`` equals ``'RED'`` and ``'OnlinePhone'``
    for every other value.
    """
    return 'Branch' if row == 'RED' else 'OnlinePhone'
    
# recode every channel value element by element
checking['Channel'] = checking['Channel'].map(clean_channel)

In [None]:
# cut into customer groups

# bin edges for the generation labels; pd.cut intervals are right-closed,
# i.e. (0, 18], (18, 25], (25, 35], (35, 50], (50, inf)
# The last edge is open-ended so ages above 100 land in 'Boomers+' instead of
# falling outside every bin and becoming NaN (CustomerAge is only capped at 85
# in a later cell, after this binning has already happened).
bins = [0, 18, 25, 35, 50, np.inf]

age_groups = ['Gen Z', 'Students', 'Millennials', 'Gen X', 'Boomers+']

checking['AgeGroups'] = pd.cut(checking['CustomerAge'], bins, labels=age_groups)

checking.head()

In [None]:
# add a column for account status
def acct_status(date):
    """Derive an account status from a closing date.

    Returns ``'Open'`` when ``date`` is missing according to ``pd.isnull``
    and ``'Closed'`` otherwise.
    """
    return 'Open' if pd.isnull(date) else 'Closed'
    
# label each account from its closing date
checking['Status'] = checking['DateClosed'].map(acct_status)

checking.head()

In [None]:
# clean up the system
# (runs a garbage-collection pass)

gc.collect()

In [None]:
# display floats with thousands separators and no decimals
pd.options.display.float_format = '{:,.0f}'.format

# summary statistics of the customer ages
checking['CustomerAge'].describe()

In [None]:
# put a cap on the age to prevent distortion
# cap at 85
# stop it out at 85

def set_age_limit(age, limit=85):
    """Cap an age at ``limit``.

    Parameters
    ----------
    age
        Anything ``float()`` accepts.
    limit
        Upper bound; defaults to 85, the cap this notebook uses.

    Returns
    -------
    float
        ``float(limit)`` when the age exceeds the limit, otherwise
        ``float(age)``. NaN compares false against the limit and is
        therefore returned unchanged.
    """
    value = float(age)
    ceiling = float(limit)
    return ceiling if value > ceiling else value

# cap every age element by element
checking['CustomerAge'] = checking['CustomerAge'].map(set_age_limit)

In [None]:
# clean up the custid:
# cast each code to int, then left-pad its text form with zeros to 9 characters
checking['PersonCode'] = checking['PersonCode'].map(lambda code: str(int(code)).zfill(9))


In [None]:
# get the info to see data types

checking.info()

In [None]:
# then join it with PersonType to make a CustID

# ensure that PersonCode is a string
# (the zero-padding step above already yields strings, so this cast is a safeguard)

checking['PersonCode'] = checking['PersonCode'].astype(str)

# CustID is PersonType followed by the padded PersonCode
checking['CustID'] = checking['PersonType'] + checking['PersonCode']

checking.head()

In [None]:
# make a Month-Year column (format '%b-%y', e.g. 'Jan-15') for plotting later
# .dt.strftime formats the whole datetime column at once and leaves a missing
# date as missing instead of raising inside a per-row lambda

checking['MonthYear'] = checking['DateOpened'].dt.strftime('%b-%y')

In [None]:
# make a year column (four-digit year as text) for plotting later
# .dt.strftime formats the whole datetime column at once and leaves a missing
# date as missing instead of raising inside a per-row lambda

checking['Year'] = checking['DateOpened'].dt.strftime('%Y')

checking.head()

In [None]:
# subset only the checking accounts now

# reset_index returns a new frame, so it must be part of the assignment.
# Calling it on its own line and discarding the result left the old,
# gappy index in place.
chkOnly = checking[checking.Category == 'CHECKING'].copy().reset_index(drop = True)

chkOnly.info()

In [None]:
# show the directory the relative file names below resolve against
os.getcwd()

In [None]:
# export the full cleaned frame
checking.to_csv('2013to2017CheckingOpensClean.csv')

In [None]:
# export the checking-only subset
chkOnly.to_csv('CleanCheckingOnly.csv')

In [None]:
# keep the checking accounts opened after 2012-12-31 (independent copy)
recentChk = chkOnly[(chkOnly['DateOpened'] > '2012-12-31')].copy()

In [None]:
recentChk.info()

In [None]:
# order chronologically by opening date
recentChk = recentChk.sort_values(by='DateOpened')

# renumber the rows 0..n-1 after sorting
recentChk = recentChk.reset_index(drop = True)

recentChk.head()

In [None]:
# monthly opening counts, one facet row per channel, coloured by account status
sns.set(font_scale=1.25)

# countplot needs an explicit category order when drawn through FacetGrid.map;
# otherwise each facet/hue subset orders the months on its own and bars can
# end up under the wrong label. recentChk is sorted by DateOpened, so the
# order of first appearance is chronological.
month_order = recentChk['MonthYear'].unique().tolist()

# NOTE(review): seaborn 0.9 renamed `size` to `height` - switch the keyword if
# this runs on a newer seaborn
fig = sns.FacetGrid(data=recentChk, hue='Status', size = 4, row = 'Channel', aspect = 3, sharey=False)

fig.map(sns.countplot,'MonthYear', order=month_order, alpha=0.6)

plt.xticks(rotation=90)

# the hue variable is Status, so the legend is titled accordingly
# (it was previously titled 'Channel')
fig.add_legend(title='Status')

In [None]:

# display floats with thousands separators and no decimals
pd.options.display.float_format = '{:,.0f}'.format

# non-null PersonCode count per year (rows), with channels spread into columns
yearly_counts = recentChk.groupby(['Year', 'Channel']).agg({'PersonCode' : 'count'})
yearly_counts.unstack()


In [None]:
# monthly opening counts, one facet row (and colour) per channel
sns.set(font_scale=1.25)

# countplot needs an explicit category order when drawn through FacetGrid.map;
# otherwise each facet subset orders the months on its own and bars can end up
# under the wrong label. recentChk is sorted by DateOpened, so the order of
# first appearance is chronological.
month_order = recentChk['MonthYear'].unique().tolist()

# NOTE(review): seaborn 0.9 renamed `size` to `height` - switch the keyword if
# this runs on a newer seaborn
fig = sns.FacetGrid(data=recentChk, hue='Channel', row = 'Channel', size = 3, aspect = 4, sharey = False)

fig.map(sns.countplot,'MonthYear', order=month_order)

plt.xticks(rotation=90)


In [None]:
# display as a pivot table:
# one row per channel, one column per year, each cell the number of
# PersonCode entries (aggregated with len)

pd.pivot_table(recentChk,index=['Channel'], values = ['PersonCode'], columns = ['Year'], aggfunc = {'PersonCode':len})

In [None]:
# monthly opening counts, one facet row per channel, coloured by age group
# (original author's note: not very good)

sns.set(font_scale=1.25)

# countplot needs an explicit category order when drawn through FacetGrid.map;
# otherwise each facet/hue subset orders the months on its own and bars can
# end up under the wrong label. recentChk is sorted by DateOpened, so the
# order of first appearance is chronological.
month_order = recentChk['MonthYear'].unique().tolist()

# NOTE(review): seaborn 0.9 renamed `size` to `height` - switch the keyword if
# this runs on a newer seaborn
fig = sns.FacetGrid(data=recentChk, hue='AgeGroups', row = 'Channel', size = 3, aspect = 4, sharey = False)

fig.map(sns.countplot,'MonthYear', order=month_order, alpha = 0.6)

plt.xticks(rotation=90)

fig.add_legend(title='Age Groups')

In [None]:
# need to do a subset so we can see the relationship 
# between online goals and accounts
# (keeps only the six columns listed below, as an independent copy)

chkRed = recentChk[['DateOpened', 'Channel', 'Status', 'MonthYear', 'Year', 'CustID']].copy()

In [None]:
chkRed.info()

In [None]:
# daily series: number of non-null CustID values per opening date and channel
# the result is indexed by (DateOpened, Channel) with a single CustID count column

chkSeries = chkRed.groupby(['DateOpened', 'Channel']).agg({'CustID' : pd.Series.count})

chkSeries.head()

In [None]:
# number of (date, channel) combinations
chkSeries.shape[0]

In [None]:
# now get the conversion series

# NOTE(review): hardcoded, user-specific absolute path.
# The workbook is opened in a with-block so its file handle is closed as soon
# as the sheet has been read; previously it was never closed.
with pd.ExcelFile(os.path.normpath('C:/Users/n846490/Documents/DigitalAnalytics/CheckingAnalysis/CheckingGoals20152016.xlsx')) as checkingGoals:
    # read the 'Dataset2' sheet into a DataFrame
    goals = checkingGoals.parse('Dataset2')

goals.head()

In [None]:
# number of rows in the goals sheet
goals.shape[0]

In [None]:
# flatten the data: move the (DateOpened, Channel) index levels back into
# ordinary columns, giving one row per date and channel

# the code below would pivot the data but it is not needed for this
# chkWide = chkSeries.pivot(index='DateOpened', columns='Channel', values='CustID')

chkWide = chkSeries.reset_index()

chkWide.head()


In [None]:
# last rows of the goals sheet
goals.tail()

In [None]:
chkWide.info()

In [None]:
# make a merged plot to see the spread

# set the figure size
fig = plt.figure(figsize = (15,10))

# (211) splits the figure into two rows and one column and selects the first
# (top) panel. Both series are drawn on this one axes. Requesting
# add_subplot(211) a second time can, on newer matplotlib releases, create a
# fresh axes stacked on top of the first, which hides the goals line and drops
# it from the legend.
ax1 = fig.add_subplot(211)

# plot the goals
ax1.plot(goals['Day Index'], goals.TotalCheckingGoals, color = 'skyblue', label = 'Checking Goals')

# get the separate series from the flattened data: dates and daily counts
# for the OnlinePhone channel
X1 = np.array(chkWide[chkWide['Channel'] == 'OnlinePhone']['DateOpened'])
y2 = np.array(chkWide[chkWide['Channel'] == 'OnlinePhone']['CustID'])

# plot the openings
ax1.plot(X1, y2, color = 'salmon', label = 'Online')

# plot the legend for the first plot
ax1.legend(loc = 'upper right', fontsize = 14)

ax1.set_ylabel('Checking Goals and Openings for Online', fontsize=16)
plt.setp(ax1.get_yticklabels(), fontsize=14)

# restrict the view to the chosen date window and count range
dstart = datetime(2015, 9, 21)
dend = datetime(2017, 1, 1)

ax1.set_xlim(dstart,dend)
ax1.set_ylim(0,300)

plt.tight_layout()
plt.show()

In [None]:
# column dtypes and non-null counts of the goals sheet
goals.info()

In [None]:
# create a dataframe of online openings joined to the daily goals, for export

# The plotting cell reads the goals dates from 'Day Index', so that column has
# to be renamed to 'Date' BEFORE 'Date' is selected; selecting first raises a
# KeyError on a fresh kernel. The rename is skipped when 'Date' already exists
# so the cell can be re-run.
if 'Date' not in goals.columns:
    goals = goals.rename(columns = {'Day Index':'Date'})

# .copy() makes this an independent frame rather than a slice of the sheet
goals = goals[['Date', 'TotalCheckingGoals']].copy()

# online/phone rows only, with the date column named like the goals key
online = chkWide[(chkWide['Channel'] == 'OnlinePhone')].rename(columns = {'DateOpened':'Date'})

# left join: every online date is kept, with NaN goals where no goal row matches
goalsOnline = pd.merge(online, goals, on = 'Date', how = 'left')

goalsOnline.head()

In [None]:
# name the count column 'Accts', then fill the nan with zeros

goalsOnline = goalsOnline.rename(columns = {'CustID' : 'Accts'})

# fillna returns a new frame; without the assignment the filled result was
# thrown away and the NaNs stayed in place
goalsOnline = goalsOnline.fillna(0)

goalsOnline.head()

In [None]:
# create a function to clean TotalCheckingGoals
def clean_nan(goal):
    """Turn a possibly-missing goal value into an int.

    Returns ``0`` when ``goal`` is missing according to ``pd.isnull``,
    otherwise ``int(goal)``.
    """
    return 0 if pd.isnull(goal) else int(goal)
    
# convert every goal value element by element (missing -> 0, otherwise int)
goalsOnline['TotalCheckingGoals'] = goalsOnline['TotalCheckingGoals'].map(clean_nan)

goalsOnline.head()

In [None]:
# shorten the goals column name for the export
goalsOnline.rename(columns = {'TotalCheckingGoals':'Goals'}, inplace = True)

In [None]:
# show the directory the relative file name below resolves against
os.getcwd()

In [None]:
# export the accounts-vs-goals table
goalsOnline.to_csv('checkingGoalsAccts.csv')