# Credit Risk / Data Cleaning 

## Loading Packages

In [1]:
# Python Version
from platform import python_version
print('Python version: ', python_version())

Python version:  3.9.16


In [2]:
# Imports

## Data Manipulation
import numpy as np
import pandas as pd

# Ignore Warning
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [3]:
# Package versions
%reload_ext watermark
%watermark -a "Cézar Mendes" --iversions

Author: Cézar Mendes

numpy : 1.24.3
pandas: 1.5.3
sys   : 3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]



## Loading Data

In [4]:
# Load the raw credit-risk dataset (path is relative, one directory above the current one)
df = pd.read_csv('../data/credit_risk_dataset.csv')

In [5]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


## Duplicate Values

In [6]:
def drop_duplicates(df):
    """Remove fully duplicated rows from ``df`` in place and report the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate. It is modified in place.

    Returns
    -------
    None
        The number of removed rows is printed, not returned.
    """
    rows_before = len(df)
    df.drop_duplicates(inplace=True)
    count = rows_before - len(df)

    # Report the removal first; fall through to the "nothing found" message.
    if count != 0:
        print(f"{count} duplicate lines were found and removed.")
        return
    print("No duplicate rows were found.")

drop_duplicates(df)

165 duplicate lines were found and removed.


## Missing Values

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32416 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32416 non-null  int64  
 1   person_income               32416 non-null  int64  
 2   person_home_ownership       32416 non-null  object 
 3   person_emp_length           31529 non-null  float64
 4   loan_intent                 32416 non-null  object 
 5   loan_grade                  32416 non-null  object 
 6   loan_amnt                   32416 non-null  int64  
 7   loan_int_rate               29321 non-null  float64
 8   loan_status                 32416 non-null  int64  
 9   loan_percent_income         32416 non-null  float64
 10  cb_person_default_on_file   32416 non-null  object 
 11  cb_person_cred_hist_length  32416 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.2+ MB


In [8]:
# % Missing Values
def func_missing_values(df):
    """Print the percentage of cells in ``df`` that hold a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The percentage (rounded to 2 decimals) is printed, not returned.
    """
    # DataFrame.size is rows * columns. It replaces np.product, which is
    # deprecated since NumPy 1.25 and removed in NumPy 2.0.
    totalCells = df.size

    # Per-column null counts, summed into a single plain integer.
    totalMissing = int(df.isnull().sum().sum())

    # An empty frame has no cells; report 0 % instead of dividing by zero.
    if totalCells:
        percentMissing = (totalMissing / totalCells) * 100
    else:
        percentMissing = 0.0

    print("The dataset has", round(percentMissing, 2), "%", "of missing values.")

func_missing_values(df)

The dataset has 1.02 % of missing values.


In [9]:
# Percentage of rows with at least one missing value
def func_missing_values_row(df):
    """Print the percentage of rows in ``df`` with at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The percentage (rounded to 2 decimals) is printed, not returned.
    """
    total_rows = df.shape[0]

    # Vectorised row-wise check; avoids building a Series per row with iterrows().
    missing_rows = int(df.isna().any(axis=1).sum())

    # A frame without rows would otherwise raise ZeroDivisionError.
    if total_rows:
        percent_rows = (missing_rows / total_rows) * 100
    else:
        percent_rows = 0.0

    print(round(percent_rows, 2), "%", "of rows in the dataset contain at least one missing value.")

func_missing_values_row(df)

12.08 % of rows in the dataset contain at least one missing value.


In [10]:
# Total missing values by columns
def func_missing_values_column(df):
    """Summarise missing values per column of ``df``.

    Prints the number of columns in ``df`` and how many of them contain
    missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per column that has missing values, with the columns
        'Missing Values', '% of Missing Values' and 'Dtype', sorted by
        percentage in descending order and rounded to 2 decimals.
        ``None`` when no column has missing values.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # The three Series are unnamed, so concat labels the columns 0, 1, 2;
    # those positional labels are then given readable names.
    report = pd.concat([null_counts, null_share, df.dtypes], axis=1)
    report = report.rename(
        columns={0: 'Missing Values', 1: '% of Missing Values', 2: 'Dtype'}
    )

    # Keep only columns with at least one missing value, worst first.
    has_missing = report.iloc[:, 0] != 0
    report = report[has_missing]
    report = report.sort_values('% of Missing Values', ascending=False)
    report = report.round(2)

    affected = report.shape[0]
    print("The dataset has " + str(df.shape[1]) + " columns.\n"
          "Found: " + str(affected) + " columns that have missing values.")

    return report if affected != 0 else None

func_missing_values_column(df)

The dataset has 12 columns.
Found: 2 columns that have missing values.


Unnamed: 0,Missing Values,% of Missing Values,Dtype
loan_int_rate,3095,9.55,float64
person_emp_length,887,2.74,float64


In [11]:
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32416.0,32416.0,31529.0,32416.0,29321.0,32416.0,32416.0,32416.0
mean,27.747008,66091.64,4.79051,9593.845632,11.017265,0.218688,0.17025,5.811297
std,6.3541,62015.58,4.14549,6322.730241,3.24168,0.413363,0.106812,4.05903
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38542.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79218.0,7.0,12250.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


### loan_int_rate	

In [12]:
# Imputation
def fix_missing_median(df, col):
    """Fill the missing values of one column with that column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. ``df[col]`` is reassigned with the filled values.
    col : str
        Label of the column to impute.

    Returns
    -------
    pandas.Series
        The column ``df[col]`` after imputation.
    """
    original = df[col]

    # Both figures are taken before filling, so the message reports
    # how many values were actually missing.
    count = original.isna().sum()
    median = original.median()

    df[col] = original.fillna(median)
    print(f"{count} missing values in column {col} have been replaced with their median value {median}.")
    return df[col]

fix_missing_median(df, 'loan_int_rate')

3095 missing values in column loan_int_rate have been replaced with their median value 10.99.


0        16.02
1        11.14
2        12.87
3        15.23
4        14.27
         ...  
32576    13.16
32577     7.49
32578    10.99
32579    11.48
32580     9.99
Name: loan_int_rate, Length: 32416, dtype: float64

### person_emp_length

In [13]:
def drop_rows_with_missing_values(df, columns=None):
    """Drop rows of ``df`` that contain missing values, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : optional
        Passed to ``DataFrame.dropna`` as ``subset``; with the default
        ``None`` every column is considered.

    Returns
    -------
    None
        The number of dropped rows is printed, not returned.
    """
    rows_before = len(df)
    df.dropna(subset=columns, inplace=True)
    rows_after = len(df)

    count = rows_before - rows_after
    print(f"{count} lines containing missing values were dropped.")

In [14]:
drop_rows_with_missing_values(df, 'person_emp_length')

887 lines containing missing values were dropped.


In [15]:
func_missing_values_column(df)

The dataset has 12 columns.
Found: 0 columns that have missing values.


In [16]:
func_missing_values(df)

The dataset has 0.0 % of missing values.


## Save the Dataset

In [17]:
# Save the cleaned DataFrame to a CSV file; index=False keeps the row index out of the file
df.to_csv('../data/credit_risk_dataset-clean.csv', index=False)