 # Capstone Project 2 - Data Wrangling
## Name: Brock Nosbisch

#### Summary:
This dataset contains summary level data for random cities.
The data is read in through a csv (data.csv).  
I then set the index for the dataset to the unique identifier.  
Several fields needed to have their data types updated.
I removed Pending Start, Reactivated, and Cancelled SAs (service agreements).
I also removed a few bad records that had a null Rate Class, Company, or Bill Cycle.


In [177]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)

In [178]:
def print_counts(dataframe, field_name, sorting = 'asc', num = '5'):
    """Print the least or most frequent values of one column.

    Nothing is returned; the frequency table is written to stdout.

    Parameters
    ----------
    dataframe : DataFrame
        Frame that holds the column to summarise.
    field_name : str
        Name of the column in ``dataframe``.
    sorting : str
        'asc' lists the least frequent values first; any other value
        lists the most frequent values first.
    num : str or int
        How many rows to print (converted with ``int``).
    """
    least_frequent_first = (sorting == 'asc')
    frequencies = dataframe[field_name].value_counts(ascending=least_frequent_first)
    row_limit = int(num)
    print(frequencies.head(row_limit))

In [179]:
def print_min_max(dataframe, field_name, min_max = 'max', num = '5'):
    """Print the largest or smallest distinct values of a column with their counts.

    Nothing is returned; a two-column frame is written to stdout. Its
    'index' column holds the distinct values and the column named after
    ``field_name`` holds how many times each value occurs.

    Parameters
    ----------
    dataframe : DataFrame
        Frame that holds the column to summarise.
    field_name : str
        Name of the column in ``dataframe``.
    min_max : str
        'max' lists the largest values first; any other value lists the
        smallest values first.
    num : str or int
        How many rows to print (converted with ``int``).
    """
    smallest_first = (min_max != 'max')

    # Name the value axis and the count column explicitly. Relying on the
    # default names of value_counts().reset_index() breaks on pandas >= 2.0,
    # where the value column is no longer called 'index'.
    counts = dataframe[field_name].value_counts()
    table = counts.rename_axis('index').reset_index(name=field_name)

    print(table.sort_values('index', ascending=smallest_first).head(int(num)))

    return None

In [180]:

# Creation Functions Used.
def add_value_labels(ax, spacing=5):
    """Annotate every patch on ``ax`` with its height.

    Each label is centred horizontally on its patch and placed ``spacing``
    points away from the patch's top edge: above it for non-negative
    heights, below it for negative heights.

    Parameters
    ----------
    ax : axes-like object
        Must expose ``patches`` and ``annotate``.
    spacing : number
        Distance in points between the patch edge and its label.
    """
    for patch in ax.patches:
        height = patch.get_height()
        centre_x = patch.get_x() + patch.get_width() / 2

        # Negative bars get their label underneath, anchored by its top edge.
        if height < 0:
            offset, anchor = -spacing, 'top'
        else:
            offset, anchor = spacing, 'bottom'

        text = format(height)  # Use format(height, '.1f') if wanting decimals.

        ax.annotate(text, (centre_x, height), xytext=(0, offset),
                    textcoords="offset points", ha='center', va=anchor)
        

## Load Data

In [181]:
file_name = 'data.csv' # Service Agreement Level

# Read the file, taking the column names from its first row, and index the
# frame by SA_ID. No empty DataFrame is created first: read_csv builds the
# frame itself, so a placeholder would be discarded immediately.
df = pd.read_csv(file_name, header=0).set_index('SA_ID')


In [182]:
# Preview the first five rows of the loaded frame.
df.head(5)


Unnamed: 0_level_0,CIS_DIVISION,ACCOUNT_ID,CUSTOMER_CLASS_CODE,CUSTOMER_CLASS_DESCRIPTION,PERSON_ID,SA_START_DATE,SA_START_YEAR,SA_START_YEAR_MONTH,SA_END_DATE,SA_END_YEAR,...,PERSON_MIN_SA_END_DATE,PREMISE_MAX_SA_START_DATE,PREMISE_MIN_SA_START_DATE,PREMISE_MAX_SA_END_DATE,PREMISE_MIN_SA_END_DATE,PREMISE_PRIOR_STOP_DATE,PERSON_PRIOR_STOP_DATE,PREMISE_DAYS_INACTIVE_BEFORE,PERSON_DAYS_INACTIVE_BEFORE,USAGE_PAST_18_MONTHS
SA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34935241500,LGC,11645080000,RES,Residential,62534880000,5/19/1993,1993,199305,12/31/2099,2099,...,,5/19/1993,5/19/1993,,,,,0,0,1176.324
16293132432,MGE,5431044444,RES,Residential,33172328888,10/9/2014,2014,201410,12/31/2099,2099,...,,10/9/2014,10/9/2014,,,,,0,0,952.0
34937413572,MGE,11645804444,RES,Residential,19068248888,1/1/1969,1969,196901,12/31/2099,2099,...,,1/1/1969,1/1/1969,,,,,0,0,463.0
97756482240,LGC,32585493536,RES,Residential,28983386912,9/3/2013,2013,201309,10/16/2017,2017,...,10/16/2017,10/16/2017,9/3/2013,10/16/2017,10/16/2017,,,0,0,0.0
103835695332,LGC,34625440000,RES,Residential,47351600000,10/16/2017,2017,201710,12/31/2099,2099,...,10/22/2017,10/16/2017,9/3/2013,10/16/2017,10/16/2017,10/16/2017,10/22/2017,0,-6,1005.784


## Update Data Types and Remove Unneeded/Bad Data

In [183]:
# Inspect column dtypes and non-null counts to see which columns need their
# data types updated.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 709303 entries, 34935241500 to 68398773804
Data columns (total 91 columns):
CIS_DIVISION                     705120 non-null object
ACCOUNT_ID                       709303 non-null int64
CUSTOMER_CLASS_CODE              709303 non-null object
CUSTOMER_CLASS_DESCRIPTION       709303 non-null object
PERSON_ID                        709303 non-null int64
SA_START_DATE                    709303 non-null object
SA_START_YEAR                    709303 non-null int64
SA_START_YEAR_MONTH              709303 non-null int64
SA_END_DATE                      709303 non-null object
SA_END_YEAR                      709303 non-null int64
SA_END_YEAR_MONTH                709303 non-null int64
SA_STATUS_FLAG                   709303 non-null int64
SA_TYPE_CODE                     709303 non-null object
RATE_CLASS_CODE                  708860 non-null object
RATE_CLASS_DESCRIPTION           708860 non-null object
PREMISE_ID                       709303 n

In [184]:
# Identifier, status and period columns are treated as labels rather than
# quantities, so they are cast to str.
# NOTE(review): astype('str') typically turns missing values into the literal
# string 'nan' - confirm none of these columns (e.g. POSTAL) contain nulls.
string_columns = [
    'ACCOUNT_ID',
    'PERSON_ID',
    'SA_STATUS_FLAG',
    'PREMISE_ID',
    'POSTAL',
    'SA_START_YEAR',
    'SA_START_YEAR_MONTH',
    'SA_END_YEAR',
    'SA_END_YEAR_MONTH',
]

# Date columns arrive from the CSV as text and are converted to timestamps.
date_columns = [
    'SA_START_DATE',
    'SA_END_DATE',
    'MOST_RECENT_PAYMENT_DATE',
    'PERSON_MAX_SA_START_DATE',
    'PERSON_MIN_SA_START_DATE',
    'PERSON_MAX_SA_END_DATE',
    'PERSON_MIN_SA_END_DATE',
    'PREMISE_MAX_SA_START_DATE',
    'PREMISE_MIN_SA_START_DATE',
    'PREMISE_MAX_SA_END_DATE',
    'PREMISE_MIN_SA_END_DATE',
    'PREMISE_PRIOR_STOP_DATE',
    'PERSON_PRIOR_STOP_DATE',
    'PLEDGE_DATE_2013',
    'PLEDGE_DATE_2014',
    'PLEDGE_DATE_2015',
    'PLEDGE_DATE_2016',
    'PLEDGE_DATE_2017',
    'PLEDGE_DATE_2018',
    'PLEDGE_DATE_2019',
]

column_types = dict.fromkeys(string_columns, 'str')
column_types.update(dict.fromkeys(date_columns, 'datetime64[ns]'))

df = df.astype(column_types)

# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 709303 entries, 34935241500 to 68398773804
Data columns (total 91 columns):
CIS_DIVISION                     705120 non-null object
ACCOUNT_ID                       709303 non-null object
CUSTOMER_CLASS_CODE              709303 non-null object
CUSTOMER_CLASS_DESCRIPTION       709303 non-null object
PERSON_ID                        709303 non-null object
SA_START_DATE                    709303 non-null datetime64[ns]
SA_START_YEAR                    709303 non-null object
SA_START_YEAR_MONTH              709303 non-null object
SA_END_DATE                      709303 non-null datetime64[ns]
SA_END_YEAR                      709303 non-null object
SA_END_YEAR_MONTH                709303 non-null object
SA_STATUS_FLAG                   709303 non-null object
SA_TYPE_CODE                     709303 non-null object
RATE_CLASS_CODE                  708860 non-null object
RATE_CLASS_DESCRIPTION           708860 non-null object
PREMISE_ID        

#### SA Status Flag Descriptions:
* 10 - Pending Start
* 20 - Active
* 30 - Pending Stop
* 40 - Stopped
* 50 - Reactivated
* 60 - Closed
* 70 - Cancelled

#### I will be removing the Cancelled SAs since they could have been cancelled for a number of reasons (e.g., a CSR mistake).  I will also be removing Reactivated SAs, since we don't deal with them much, and all Pending Starts, since we do not know which Division they are in yet.
#### There are also a few bad records (null Rate Class, Division, or Bill Cycle) that I am removing.

In [185]:
print ('Before: ')
print_counts(df, 'SA_STATUS_FLAG', 'desc','10')

# Status flags to drop: 70 = Cancelled, 50 = Reactivated, 10 = Pending Start.
dropped_statuses = ['70', '50', '10']
# A row missing any of these fields is treated as a bad record.
required_fields = ['RATE_CLASS_CODE', 'CIS_DIVISION', 'BILL_CYCLE_CODE']

# Build one combined mask so the frame is filtered in a single pass.
status_kept = ~df['SA_STATUS_FLAG'].isin(dropped_statuses)
fields_present = df[required_fields].notnull().all(axis=1)
df = df[status_kept & fields_present]


print()
print()
print ('After: ')
print_counts(df, 'SA_STATUS_FLAG', 'desc','10')

Before: 
20    334921
60    312015
70     45933
40     14345
30      1047
10       848
50       194
Name: SA_STATUS_FLAG, dtype: int64


After: 
20    334917
60    311654
40     14339
30      1047
Name: SA_STATUS_FLAG, dtype: int64


In [186]:
# Re-check row count, dtypes and non-null counts after the filtering above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 661957 entries, 34935241500 to 68398773804
Data columns (total 91 columns):
CIS_DIVISION                     661957 non-null object
ACCOUNT_ID                       661957 non-null object
CUSTOMER_CLASS_CODE              661957 non-null object
CUSTOMER_CLASS_DESCRIPTION       661957 non-null object
PERSON_ID                        661957 non-null object
SA_START_DATE                    661957 non-null datetime64[ns]
SA_START_YEAR                    661957 non-null object
SA_START_YEAR_MONTH              661957 non-null object
SA_END_DATE                      661957 non-null datetime64[ns]
SA_END_YEAR                      661957 non-null object
SA_END_YEAR_MONTH                661957 non-null object
SA_STATUS_FLAG                   661957 non-null object
SA_TYPE_CODE                     661957 non-null object
RATE_CLASS_CODE                  661957 non-null object
RATE_CLASS_DESCRIPTION           661957 non-null object
PREMISE_ID        

In [187]:
# Preview the first five rows after the type conversion and filtering.
df.head(5)

Unnamed: 0_level_0,CIS_DIVISION,ACCOUNT_ID,CUSTOMER_CLASS_CODE,CUSTOMER_CLASS_DESCRIPTION,PERSON_ID,SA_START_DATE,SA_START_YEAR,SA_START_YEAR_MONTH,SA_END_DATE,SA_END_YEAR,...,PERSON_MIN_SA_END_DATE,PREMISE_MAX_SA_START_DATE,PREMISE_MIN_SA_START_DATE,PREMISE_MAX_SA_END_DATE,PREMISE_MIN_SA_END_DATE,PREMISE_PRIOR_STOP_DATE,PERSON_PRIOR_STOP_DATE,PREMISE_DAYS_INACTIVE_BEFORE,PERSON_DAYS_INACTIVE_BEFORE,USAGE_PAST_18_MONTHS
SA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34935241500,LGC,11645080000,RES,Residential,62534880000,1993-05-19,1993,199305,2099-12-31,2099,...,NaT,1993-05-19,1993-05-19,NaT,NaT,NaT,NaT,0,0,1176.324
16293132432,MGE,5431044444,RES,Residential,33172328888,2014-10-09,2014,201410,2099-12-31,2099,...,NaT,2014-10-09,2014-10-09,NaT,NaT,NaT,NaT,0,0,952.0
34937413572,MGE,11645804444,RES,Residential,19068248888,1969-01-01,1969,196901,2099-12-31,2099,...,NaT,1969-01-01,1969-01-01,NaT,NaT,NaT,NaT,0,0,463.0
97756482240,LGC,32585493536,RES,Residential,28983386912,2013-09-03,2013,201309,2017-10-16,2017,...,2017-10-16,2017-10-16,2013-09-03,2017-10-16,2017-10-16,NaT,NaT,0,0,0.0
103835695332,LGC,34625440000,RES,Residential,47351600000,2017-10-16,2017,201710,2099-12-31,2099,...,2017-10-22,2017-10-16,2013-09-03,2017-10-16,2017-10-16,2017-10-16,2017-10-22,0,-6,1005.784


In [188]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# columns describe() selects by default.
df.describe()

Unnamed: 0,INTERNAL_CREDIT_RATING,PAYMENTS_IN_LAST_18_MONTHS,BILLS_IN_LAST_18_MONTHS,PAY_SEGS_IN_LAST_18_MONTHS,BILL_SEGS_IN_LAST_18_MONTHS,ARREARS_CURRENT_AMOUNT,ARREARS_PAYOFF_AMOUNT,TOTAL_CURRENT_AMOUNT,TOTAL_PAYOFF_AMOUNT,LATE_PAYMENT_COUNT,...,PLEDGE_FLAG_2014,PLEDGE_FLAG_2015,PLEDGE_FLAG_2016,PLEDGE_FLAG_2017,PLEDGE_FLAG_2018,PLEDGE_FLAG_2019,USAGE_IN_LAST_18_MONTHS,PREMISE_DAYS_INACTIVE_BEFORE,PERSON_DAYS_INACTIVE_BEFORE,USAGE_PAST_18_MONTHS
count,661957.0,661957.0,661957.0,661957.0,661957.0,661957.0,661957.0,661957.0,661957.0,661957.0,...,661957.0,661957.0,661957.0,661957.0,661957.0,661957.0,432355.0,661957.0,661957.0,661957.0
mean,98.139562,8.14793,9.348538,8.14793,9.348538,21.041593,12.054118,21.403155,12.377496,0.657487,...,0.0,0.006682,0.008933,0.008644,0.004739,0.0,1504.453,-1271.078943,-2080.937652,982.6283
std,333.794981,7.958601,8.357426,7.958601,8.357426,116.381483,123.366641,115.895349,122.904606,2.341999,...,0.0,0.081468,0.094089,0.092571,0.068677,0.0,48315.3,6099.024596,7712.977248,39053.76
min,-4262.0,0.0,0.0,0.0,0.0,-23124.5,-23124.5,-23124.5,-23124.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-11527.8,-39660.0,-51013.0,-11527.8
25%,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,409.0,0.0,0.0,0.0
50%,-1.0,6.0,9.0,6.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,879.0,0.0,0.0,363.6
75%,-1.0,18.0,18.0,18.0,18.0,26.25,7.19,26.39,10.49,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1231.0,0.0,0.0,1032.947
max,2511.0,217.0,164.0,217.0,164.0,27252.75,27252.75,27252.75,27252.75,18.0,...,0.0,1.0,1.0,1.0,1.0,0.0,20181020.0,7465.0,7683.0,20181020.0


### Write data to file to use in Data Story

In [189]:
# Save the cleaned frame; to_csv writes the SA_ID index as the first column
# by default.
filename = 'data_wrangling_out.csv'
df.to_csv(filename)

### ANNEX

In [202]:
# Look up every remaining service agreement for a single person.
# NOTE(review): the factor of 8 presumably maps a source-system ID onto the
# IDs stored in this file - confirm.
tmp=1000994193*8
# PERSON_ID was cast to str earlier, so the comparison value must be a string.
tmp=str(tmp)
df.loc[(df['PERSON_ID']==tmp)]

Unnamed: 0_level_0,CIS_DIVISION,ACCOUNT_ID,CUSTOMER_CLASS_CODE,CUSTOMER_CLASS_DESCRIPTION,PERSON_ID,SA_START_DATE,SA_START_YEAR,SA_START_YEAR_MONTH,SA_END_DATE,SA_END_YEAR,...,PERSON_MIN_SA_END_DATE,PREMISE_MAX_SA_START_DATE,PREMISE_MIN_SA_START_DATE,PREMISE_MAX_SA_END_DATE,PREMISE_MIN_SA_END_DATE,PREMISE_PRIOR_STOP_DATE,PERSON_PRIOR_STOP_DATE,PREMISE_DAYS_INACTIVE_BEFORE,PERSON_DAYS_INACTIVE_BEFORE,USAGE_PAST_18_MONTHS
SA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
41359068480,MGE,13791730408,RES,Residential,8007953544,2018-03-05,2018,201803,2099-12-31,2099,...,2017-12-27,2018-03-05,2014-11-24,2018-03-05,2015-09-07,2018-03-05,2017-12-27,0,68,527.0
41325819840,MGE,13791730408,RES,Residential,8007953544,2017-03-08,2017,201703,2017-12-27,2017,...,2017-12-27,2019-01-14,2014-08-27,2019-01-14,2016-12-05,2017-03-08,NaT,0,0,0.0
