In [1]:
import pandas as pd

# Convert rejected loans csv to dataframe
rejected_loans_df = pd.read_csv('/Users/abubakaral-faki/Documents/Data Project/MPV1/data/raw/rejected_2007_to_2018Q4.csv')

## Explore dataset and variable data types

In [2]:
rejected_loans_df.head()

Unnamed: 0,Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,1000.0,2007-05-26,Wedding Covered but No Honeymoon,693.0,10%,481xx,NM,4 years,0.0
1,1000.0,2007-05-26,Consolidating Debt,703.0,10%,010xx,MA,< 1 year,0.0
2,11000.0,2007-05-27,Want to consolidate my debt,715.0,10%,212xx,MD,1 year,0.0
3,6000.0,2007-05-27,waksman,698.0,38.64%,017xx,MA,< 1 year,0.0
4,1500.0,2007-05-27,mdrigo,509.0,9.43%,209xx,MD,< 1 year,0.0


In [3]:
# Check datatypes of all columns

columns = rejected_loans_df.columns #store all columns in an iterable

print(rejected_loans_df[columns].dtypes)

Amount Requested        float64
Application Date         object
Loan Title               object
Risk_Score              float64
Debt-To-Income Ratio     object
Zip Code                 object
State                    object
Employment Length        object
Policy Code             float64
dtype: object


## Rename column to 'snake_case' format

In [4]:
# make column names lower case and replace spaces with '_' (underscores)

rejected_loans_df.columns = rejected_loans_df.columns.str.lower().str.replace(' ', '_')

print(rejected_loans_df.columns)

Index(['amount_requested', 'application_date', 'loan_title', 'risk_score',
       'debt-to-income_ratio', 'zip_code', 'state', 'employment_length',
       'policy_code'],
      dtype='object')


In [5]:
#rename 'debt-to-income_ratio' to dti_ratio

rejected_loans_df.rename(columns = {'debt-to-income_ratio': 'dti_ratio'}, inplace = True)

print(rejected_loans_df.columns)

Index(['amount_requested', 'application_date', 'loan_title', 'risk_score',
       'dti_ratio', 'zip_code', 'state', 'employment_length', 'policy_code'],
      dtype='object')


In [6]:
# Convert application_date into date data type

date_format = '%Y-%m-%d'

rejected_loans_df['application_date'] = pd.to_datetime(rejected_loans_df['application_date'], format = date_format, errors = 'coerce')

In [7]:
# confirm datatype of all 'application_date'

print('Datatype of \'application_date\' column:', rejected_loans_df['application_date'].dtypes)

Datatype of 'application_date' column: datetime64[ns]


In [8]:
# CHeck for missing values in 'application_date column'

rejected_loans_df['application_date'].isna().sum()

0

No null values in application_date column

## Check for missing values all columns

In [9]:
missing_values = rejected_loans_df.isna().sum()

In [10]:
print(missing_values)

amount_requested            0
application_date            0
loan_title               1305
risk_score           18497630
dti_ratio                   0
zip_code                  293
state                      22
employment_length      951355
policy_code               918
dtype: int64


In [22]:
# Convert missing_values to Dataframe
missing_values_df = missing_values.to_frame(name = 'missing_count')

# Compute percentage of missing values in of a column
missing_values_df['percentage_missing'] = round((missing_values_df['missing_count']/rejected_loans_df.shape[0]) * 100,3)

# Show only columns that have missing values
print(missing_values_df[missing_values_df['missing_count'] != 0])

                   missing_count  percentage_missing
loan_title                  1305               0.005
risk_score              18497630              66.902
zip_code                     293               0.001
state                         22               0.000
employment_length         951355               3.441
policy_code                  918               0.003


we can see that risk_score has a 67% of its values missing which is significant compared to the whole datatset

In [44]:
rejected_loans_df['loan_title'].isna().sum()

1305

In [28]:
#Identify rows with missing values

rejected_loans_df[rejected_loans_df['loan_title'].isna()].head()



Unnamed: 0,amount_requested,application_date,loan_title,risk_score,dti_ratio,zip_code,state,employment_length,policy_code
13863,25000.0,2008-04-06,,643.0,17.33%,681xx,NE,1 year,0.0
31073,15000.0,2009-01-03,,511.0,12.92%,631xx,MO,5 years,0.0
31076,10000.0,2009-01-03,,621.0,2.31%,631xx,MO,5 years,0.0
31976,5000.0,2009-01-12,,626.0,9.06%,631xx,MO,1 year,0.0
31979,2000.0,2009-01-12,,0.0,0%,631xx,MO,3 years,0.0


In [30]:
# create a zip_code_prefix column to contain the first numbers of a zip_code

rejected_loans_df['zip_code_prefix'] = rejected_loans_df['zip_code'].str[:3]

print(rejected_loans_df['zip_code_prefix'].head())

0    481
1    010
2    212
3    017
4    209
Name: zip_code_prefix, dtype: object


In [31]:
print(rejected_loans_df['zip_code_prefix'].dtype)

object


In [35]:
print(rejected_loans_df['zip_code_prefix'].isna().sum())

293


In [38]:
print(rejected_loans_df[rejected_loans_df['zip_code_prefix'].isna()].head())

       amount_requested application_date                  loan_title  \
10521            2000.0       2008-03-07  help credit cards problems   
10553            7500.0       2008-03-07   Looking for better rates!   
10572            7000.0       2008-03-07               PERSONAL LOAN   
10578            7000.0       2008-03-07              Paying of debt   
10588           15000.0       2008-03-07                Loan request   

       risk_score dti_ratio zip_code state employment_length  policy_code  \
10521       673.0       -1%      NaN   NaN          < 1 year          0.0   
10553       614.0       -1%      NaN   NaN          < 1 year          0.0   
10572       612.0       -1%      NaN   NaN          < 1 year          0.0   
10578       733.0       -1%      NaN   NaN          < 1 year          0.0   
10588       672.0       -1%      NaN   NaN          < 1 year          0.0   

      zip_code_prefix  
10521             NaN  
10553             NaN  
10572             NaN  
10578   

In [32]:
rejected_loans_df['zip_code_prefix'].astype('int')

ValueError: cannot convert float NaN to integer

In [None]:
print(rejected_loans_df['zip_code_prefix'].dtype)