In [15]:

import pandas as pd
import numpy as np
from pandas import DataFrame
from typing import Optional
import seaborn as sns
from matplotlib import pyplot as plt

In [17]:
def load_data_to_dta_frame(filepath:str)->Optional[DataFrame]:
  '''
  Read the csv file located at ``filepath`` into a pandas data frame.
  Args:
      filepath: str: location of the csv file to read
  Returns:
       Optional[DataFrame]: the parsed data frame, or None (after printing
       a message) when no file exists at ``filepath``
  '''
  frame = None
  try:
    frame = pd.read_csv(filepath)
  except FileNotFoundError:
    # Report the missing file and fall through so the caller receives None.
    print(f'file not found at {filepath}')
  return frame
# Load the raw credit-risk dataset; `data` is None if the file is missing,
# in which case every later cell that uses `data` will fail.
data = load_data_to_dta_frame('../data/raw/credit_risk_dataset.csv')

In [25]:
# Preview the first rows, transposed so each column is shown as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
person_age,22,21,25,23,24
person_income,59000,9600,9600,65500,54400
person_home_ownership,RENT,OWN,MORTGAGE,RENT,RENT
person_emp_length,123.0,5.0,1.0,4.0,8.0
loan_intent,PERSONAL,EDUCATION,MEDICAL,MEDICAL,MEDICAL
loan_grade,D,B,C,C,C
loan_amnt,35000,1000,5500,35000,35000
loan_int_rate,16.02,11.14,12.87,15.23,14.27
loan_status,1,0,1,1,1
loan_percent_income,0.59,0.1,0.57,0.53,0.55


In [21]:
# Summarise column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [22]:
# Count missing values per column.
data.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [23]:
# (rows, columns) of the loaded frame.
data.shape

(32581, 12)

In [24]:
# Summary statistics of the numeric columns, one row per column.
# NOTE(review): the maxima shown for person_age (144) and person_emp_length (123)
# look implausible — confirm whether outlier handling is needed.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
person_age,32581.0,27.7346,6.348078,20.0,23.0,26.0,30.0,144.0
person_income,32581.0,66074.84847,61983.119168,4000.0,38500.0,55000.0,79200.0,6000000.0
person_emp_length,31686.0,4.789686,4.14263,0.0,2.0,4.0,7.0,123.0
loan_amnt,32581.0,9589.371106,6322.086646,500.0,5000.0,8000.0,12200.0,35000.0
loan_int_rate,29465.0,11.011695,3.240459,5.42,7.9,10.99,13.47,23.22
loan_status,32581.0,0.218164,0.413006,0.0,0.0,0.0,0.0,1.0
loan_percent_income,32581.0,0.170203,0.106782,0.0,0.09,0.15,0.23,0.83
cb_person_cred_hist_length,32581.0,5.804211,4.055001,2.0,3.0,4.0,8.0,30.0


In [26]:
# Same transposed preview of the first rows as shown earlier in the notebook.
# NOTE(review): this cell duplicates the earlier preview — consider removing one.
data.head().T

Unnamed: 0,0,1,2,3,4
person_age,22,21,25,23,24
person_income,59000,9600,9600,65500,54400
person_home_ownership,RENT,OWN,MORTGAGE,RENT,RENT
person_emp_length,123.0,5.0,1.0,4.0,8.0
loan_intent,PERSONAL,EDUCATION,MEDICAL,MEDICAL,MEDICAL
loan_grade,D,B,C,C,C
loan_amnt,35000,1000,5500,35000,35000
loan_int_rate,16.02,11.14,12.87,15.23,14.27
loan_status,1,0,1,1,1
loan_percent_income,0.59,0.1,0.57,0.53,0.55


In [27]:
def rename_observation(data:DataFrame)->DataFrame:
  '''
  Normalise the string values of every object-dtype column.

  Each value is stripped of surrounding whitespace, lower-cased, has its
  spaces replaced by underscores and its parentheses removed. Columns of
  any other dtype are left untouched. Despite the function name, no column
  is renamed.
  Args:
      data: DataFrame: a pandas data frame (not modified)
  Returns:
       DataFrame: a new pandas data frame with the normalised values
  '''
  # Work on a copy so the caller's frame is not mutated and re-running the
  # calling cell stays idempotent.
  cleaned = data.copy()
  for col in cleaned.columns:
    if cleaned[col].dtype == 'object':
      # regex=False makes '(' and ')' literal characters regardless of the
      # default of the installed pandas version.
      cleaned[col] = (
        cleaned[col]
        .str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
      )
  return cleaned


In [28]:
# Normalise the text values of the object columns, then preview the result.
data = rename_observation(data)
data.head().T

Unnamed: 0,0,1,2,3,4
person_age,22,21,25,23,24
person_income,59000,9600,9600,65500,54400
person_home_ownership,rent,own,mortgage,rent,rent
person_emp_length,123.0,5.0,1.0,4.0,8.0
loan_intent,personal,education,medical,medical,medical
loan_grade,d,b,c,c,c
loan_amnt,35000,1000,5500,35000,35000
loan_int_rate,16.02,11.14,12.87,15.23,14.27
loan_status,1,0,1,1,1
loan_percent_income,0.59,0.1,0.57,0.53,0.55


In [29]:
# Show the first ten rows in the regular (untransposed) layout.
data.head(10)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,rent,123.0,personal,d,35000,16.02,1,0.59,y,3
1,21,9600,own,5.0,education,b,1000,11.14,0,0.1,n,2
2,25,9600,mortgage,1.0,medical,c,5500,12.87,1,0.57,n,3
3,23,65500,rent,4.0,medical,c,35000,15.23,1,0.53,n,2
4,24,54400,rent,8.0,medical,c,35000,14.27,1,0.55,y,4
5,21,9900,own,2.0,venture,a,2500,7.14,1,0.25,n,2
6,26,77100,rent,8.0,education,b,35000,12.42,1,0.45,n,3
7,24,78956,rent,5.0,medical,b,35000,11.11,1,0.44,n,4
8,24,83000,rent,8.0,personal,a,35000,8.9,1,0.42,n,2
9,21,10000,own,6.0,venture,d,1600,14.74,1,0.16,n,3


In [33]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

165

In [34]:
# Drop the exact duplicate rows; the index is not reset, so the surviving rows
# keep their original labels.
data = data.drop_duplicates()

In [35]:
# Confirm that no duplicate rows remain.
data.duplicated().sum()

0

In [36]:
# Re-check the row count and non-null counts after de-duplication.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32416 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32416 non-null  int64  
 1   person_income               32416 non-null  int64  
 2   person_home_ownership       32416 non-null  object 
 3   person_emp_length           31529 non-null  float64
 4   loan_intent                 32416 non-null  object 
 5   loan_grade                  32416 non-null  object 
 6   loan_amnt                   32416 non-null  int64  
 7   loan_int_rate               29321 non-null  float64
 8   loan_status                 32416 non-null  int64  
 9   loan_percent_income         32416 non-null  float64
 10  cb_person_default_on_file   32416 non-null  object 
 11  cb_person_cred_hist_length  32416 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.2+ MB


In [43]:
# Median interest rate — the value used further down to impute missing loan_int_rate.
data.loan_int_rate.median()

10.99

In [39]:

# Skewness of loan_int_rate, computed before its missing values are imputed.
print("Skewness of loan_int_rate:", data['loan_int_rate'].skew())

Skewness of loan_int_rate: 0.2070016541005385


In [42]:
# Skewness of person_emp_length; the printed label names the column actually measured.
print("Skewness of person_emp_length:", data['person_emp_length'].skew())

Skewness of loan_int_rate: 2.619915431174932


In [44]:
# Impute missing interest rates with the column median. The filled column is
# assigned back instead of calling fillna(inplace=True) on the selected column:
# that chained in-place form is deprecated and leaves `data` unchanged under
# pandas Copy-on-Write.
data = data.assign(loan_int_rate=data['loan_int_rate'].fillna(data['loan_int_rate'].median()))

In [46]:
# Fill missing employment lengths with the most frequent value (mode). The filled
# column is assigned back instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form is deprecated and leaves `data` unchanged
# under pandas Copy-on-Write.
data = data.assign(person_emp_length=data['person_emp_length'].fillna(data['person_emp_length'].mode()[0]))

In [47]:
# Verify that no missing values remain after imputation.
data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64