In [1]:
# standard libraries
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import os

# plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# get the datetime library for date & time calcs
from datetime import datetime, timedelta

In [2]:
# Load the main data set.
# The CSV location can be overridden through the SURVIVAL_DATA_CSV environment
# variable so the notebook is not tied to one user's folder layout; when the
# variable is unset the original path is used unchanged.
url = os.path.normpath(
    os.environ.get(
        "SURVIVAL_DATA_CSV",
        "C:/Users/n846490/Documents/Python Scripts/SurvivalAnalysis/wideDataforRollup.csv",
    )
)

# Use the first CSV column as the row index.
data = pd.read_csv(url, index_col=0)

data.head()

Unnamed: 0,Dateopened,Persontype,Personcode,Channel,State,End_Date,Observed,Cleancuststart,Custyears,Cleandateclosed,...,Loansavings_Secured,Loc,Locpersonal,Money_Market,Mortgage,Odloc,Other,Other_Loan,Safe_Box,Savings
0,1989-11-29,F,354668,Branch,PA,2016-12-15,1,,,2016-09-20,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1996-09-30,F,132166,Branch,PA,2016-12-15,1,1996-09-19,20.252055,2012-09-17,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1999-03-26,F,5272647,Branch,PA,2016-12-15,1,1999-03-19,17.756164,2010-12-14,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2001-11-02,F,131869,Branch,PA,2016-12-15,1,,,2007-12-27,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2001-11-02,F,1019639,Branch,PA,2016-12-15,1,2013-04-27,3.638356,2007-12-27,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Print a summary of the frame: index range, column names and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4141416 entries, 0 to 4142168
Data columns (total 42 columns):
Dateopened                   object
Persontype                   object
Personcode                   int64
Channel                      object
State                        object
End_Date                     object
Observed                     int64
Cleancuststart               object
Custyears                    float64
Cleandateclosed              object
Productyears                 float64
Age                          float64
Customergroups               object
Closed                       int64
Auto                         float64
Business_Cd                  float64
Business_Checking            float64
Business_Money_Market        float64
Business_Odloc               float64
Business_Savings             float64
Cdira                        float64
Checking                     float64
Commercial_Loan              float64
Credit_Card                  float64
Commercial_Io

In [4]:
# Convert the date columns from strings to pandas datetimes.
# Every column is parsed with the same "YYYY-MM-DD" format string; missing
# values come out as NaT.
for date_col in ['End_Date', 'Dateopened', 'Cleancuststart', 'Cleandateclosed']:
    data[date_col] = pd.to_datetime(data[date_col], format='%Y-%m-%d')

data.head()

Unnamed: 0,Dateopened,Persontype,Personcode,Channel,State,End_Date,Observed,Cleancuststart,Custyears,Cleandateclosed,...,Loansavings_Secured,Loc,Locpersonal,Money_Market,Mortgage,Odloc,Other,Other_Loan,Safe_Box,Savings
0,1989-11-29,F,354668,Branch,PA,2016-12-15,1,NaT,,2016-09-20,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1996-09-30,F,132166,Branch,PA,2016-12-15,1,1996-09-19,20.252055,2012-09-17,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1999-03-26,F,5272647,Branch,PA,2016-12-15,1,1999-03-19,17.756164,2010-12-14,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2001-11-02,F,131869,Branch,PA,2016-12-15,1,NaT,,2007-12-27,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2001-11-02,F,1019639,Branch,PA,2016-12-15,1,2013-04-27,3.638356,2007-12-27,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Count the rows whose Dateopened is missing.
# NOTE: this cell only counts them (15,377 in the recorded output); it does
# not remove them from `data`.

data['Dateopened'].isnull().sum()

15377

In [6]:
# Count the rows whose State is missing.
data['State'].isnull().sum()

116662

In [9]:
# Build an in-footprint test for a single state value.

def footprint_col(colCheck):
    """Classify a state code as inside or outside the footprint.

    Parameters
    ----------
    colCheck : object
        A single state value to test (e.g. 'PA').

    Returns
    -------
    str
        'In' when ``colCheck`` equals one of the nine footprint state codes,
        otherwise 'Out'. Any value that is not one of those codes, including
        a missing value, is reported as 'Out'. The comparison is exact, so
        differently-cased codes do not match.
    """
    return 'In' if colCheck in ('ME', 'VT', 'NH', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA') else 'Out'
    
# Label every row by passing its State value through footprint_col.
data['Footprint'] = data['State'].map(footprint_col)

In [12]:
# List the distinct State values to spot invalid codes.
data['State'].unique()

array(['PA', 'MD', 'NJ', 'MA', 'WA', 'AZ', 'NC', 'DE', 'WI', 'WV', 'OR',
       'CA', 'TX', 'IL', 'VA', 'LA', 'IN', 'FL', 'NH', 'MI', 'MO', nan,
       'OH', 'NY', 'CO', 'GA', 'SC', 'MN', 'CT', 'RI', 'ME', 'PR', 'DC',
       'KY', 'AL', 'OK', 'NV', 'VT', 'ID', 'MT', 'IA', 'AK', 'TN',
       '****************************************************************',
       'MS', 'UT', 'NM', 'AR', 'ND', 'WY', 'NE', 'HI', 'KS', 'AP', 'AE',
       'SD', 'VI', 'GU', 'AA', 'MP'], dtype=object)

In [13]:
# Need to remove the placeholder state made up of '*' characters.
import re

# Regex matching values that start with a literal asterisk.
patternDel = re.compile(r'^\*')

# Boolean mask over the State column: True where the value starts with '*'.
# State contains missing values; na=False reports those rows as False instead
# of NaN, so the mask is purely boolean and can be used directly to index
# `data` (a mask containing NaN cannot be used for boolean indexing).
# NOTE: the name `filter` shadows Python's builtin; it is kept unchanged in
# case later cells refer to it.
filter = data['State'].str.contains(patternDel, na=False)





0          False
1          False
2          False
3          False
4          False
5          False
6          False
7          False
8          False
9          False
10         False
11         False
12         False
13         False
14         False
15         False
16         False
17         False
18         False
19         False
20         False
21         False
22         False
23         False
24         False
25         False
26         False
27         False
28         False
29         False
           ...  
4142139    False
4142140    False
4142141    False
4142142    False
4142143    False
4142144    False
4142145    False
4142146    False
4142147    False
4142148    False
4142149    False
4142150    False
4142151    False
4142152    False
4142153    False
4142154    False
4142155    False
4142156    False
4142157    False
4142158    False
4142159    False
4142160    False
4142161    False
4142162    False
4142163    False
4142164    False
4142165    False
4142166    Fal