In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn import preprocessing

In [2]:
# Load the raw credit-risk table (comma is already read_csv's default delimiter).
dataset = pd.read_csv("data/credit_risk.csv")

In [3]:
# Summarise dtypes and non-null counts to spot columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              32581 non-null  int64  
 1   Age             32581 non-null  int64  
 2   Income          32581 non-null  int64  
 3   Home            32581 non-null  object 
 4   Emp_length      31686 non-null  float64
 5   Intent          32581 non-null  object 
 6   Amount          32581 non-null  int64  
 7   Rate            29465 non-null  float64
 8   Status          32581 non-null  int64  
 9   Percent_income  32581 non-null  float64
 10  Default         32581 non-null  object 
 11  Cred_length     32581 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 3.0+ MB


In [4]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,Id,Age,Income,Home,Emp_length,Intent,Amount,Rate,Status,Percent_income,Default,Cred_length
0,0,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,0.59,Y,3
1,1,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
2,2,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
3,3,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
4,4,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4


- Id: Unique identifier for each loan applicant.
- Age: Age of the loan applicant.
- Income: Income of the loan applicant.
- Home: Home ownership status (Own, Mortgage, Rent).
- Emp_length: Employment length in years.
- Intent: Purpose of the loan (e.g., education, home improvement).
- Amount: Loan amount applied for.
- Rate: Interest rate on the loan.
- Status: Loan approval status (Fully Paid, Charged Off, Current); stored in this file as an integer flag (0/1).
- Percent_income: Loan amount as a fraction of income (e.g., 0.59 means 59%).
- Default: Whether the applicant has defaulted on a loan previously (Y = yes, N = no).
- Cred_length: Length of the applicant's credit history.

In [6]:
# Class balance of the raw "Default" flag (values are "Y" / "N").
dataset['Default'].value_counts()

Default
N    26836
Y     5745
Name: count, dtype: int64

In [7]:
# List the distinct loan purposes in the "Intent" column
# (column names have not been lower-cased yet at this point).
unique_intents = dataset["Intent"].unique()
print("Unique intents:", unique_intents)

Unique intents: ['PERSONAL' 'EDUCATION' 'MEDICAL' 'VENTURE' 'HOMEIMPROVEMENT'
 'DEBTCONSOLIDATION']


In [8]:
# Normalise every column name to lowercase, then keep a handle on the new Index.
dataset.columns = [name.lower() for name in dataset.columns]
columns = dataset.columns
print(columns)

Index(['id', 'age', 'income', 'home', 'emp_length', 'intent', 'amount', 'rate',
       'status', 'percent_income', 'default', 'cred_length'],
      dtype='object')


In [9]:
def impute_missing_val(X, neighbors=5, columns=("emp_length", "rate")):
    """Fill missing values in the given columns with k-nearest-neighbour imputation.

    The columns are imputed *jointly*: a row missing one of them finds its
    neighbours using the other column(s). Fitting ``KNNImputer`` on a single
    column at a time leaves no feature to measure distance on, so it falls
    back to the column mean and ``neighbors`` has no effect.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer(n_neighbors=...)``.
    columns : sequence of str, default ("emp_length", "rate")
        Numeric columns that are fit together and written back.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with the selected columns imputed.
    """
    data = X.copy()
    cols = list(columns)
    imputer = KNNImputer(n_neighbors=neighbors)
    # One joint fit so neighbour distances are defined for rows that are
    # missing only some of the columns. Rows missing every selected column
    # still receive the column mean.
    data[cols] = imputer.fit_transform(data[cols])
    return data

In [10]:
# Fill the gaps in emp_length and rate.
# NOTE: this runs before drop_outliers, so extreme values such as
# emp_length = 123 (row 0 in the preview above) are still present when the imputer is fit.
dataset = impute_missing_val(dataset, neighbors=5)

In [11]:
def drop_outliers(X, max_age=100, max_emp_length=100):
    """Return a copy of ``X`` without rows whose age or employment length is implausible.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``age`` and ``emp_length``; it is not modified.
    max_age : number, default 100
        Rows with ``age`` strictly greater than this are removed.
    max_emp_length : number, default 100
        Rows with ``emp_length`` strictly greater than this are removed.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels.

    Notes
    -----
    Rows are selected with a boolean mask rather than dropped by index label,
    so a valid row is never removed just because it shares an index label
    with an outlier. Missing values compare as "not greater" and are kept.
    """
    is_outlier = (X['age'] > max_age) | (X['emp_length'] > max_emp_length)
    return X.loc[~is_outlier].copy()

In [12]:
# Remove rows with age > 100 or emp_length > 100
# (the info() outputs show 32581 rows before and 32574 after).
dataset = drop_outliers(dataset)

In [13]:
# Numeric codes for the previous-default flag: "Y" -> 1, "N" -> 0.
default_mapping = {"Y": 1, "N": 0}


In [14]:
# Encode the previous-default flag as 1 ("Y") / 0 ("N") via default_mapping.
# NOTE: Series.map yields NaN for values that are not keys of the mapping, so
# re-running this cell on the already-encoded column would blank it out.
dataset["default"] = dataset["default"].map(default_mapping)

In [15]:
# Module-level encoder kept for backward compatibility; encode_cat builds its
# own encoder per column and never touches this instance.
encoder = preprocessing.LabelEncoder()
# column name -> {original label: integer code}, filled in by encode_cat
levar = {}


def encode_cat(data):
    """Label-encode every object-dtype column and record the label-to-code mappings.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified (a copy is encoded and returned, in
        line with the other preprocessing helpers in this notebook).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which each object-dtype column holds integer codes.

    Side effects
    ------------
    Updates the module-level ``levar`` dict with one ``{label: code}`` entry
    per encoded column.
    """
    encoded = data.copy()
    # Iterate through all categorical (object-dtype) columns
    for col in encoded.select_dtypes(include=['object']).columns:
        # A fresh encoder per column so each mapping is independent.
        col_encoder = preprocessing.LabelEncoder()
        encoded[col] = col_encoder.fit_transform(encoded[col])  # Perform label encoding
        levar[col] = dict(zip(col_encoder.classes_, col_encoder.transform(col_encoder.classes_)))

    return encoded

In [16]:
# Label-encode the remaining object-dtype columns; the label -> code mappings
# are recorded in `levar`.
dataset = encode_cat(dataset)

In [17]:
# Preview the processed data (categorical columns now hold integer codes).
dataset.head()

Unnamed: 0,id,age,income,home,emp_length,intent,amount,rate,status,percent_income,default,cred_length
1,1,21,9600,2,5.0,1,1000,11.14,0,0.1,0,2
2,2,25,9600,0,1.0,3,5500,12.87,1,0.57,0,3
3,3,23,65500,3,4.0,3,35000,15.23,1,0.53,0,2
4,4,24,54400,3,8.0,3,35000,14.27,1,0.55,1,4
5,5,21,9900,2,2.0,5,2500,7.14,1,0.25,0,2


In [18]:
# Confirm that no nulls remain and that every column is numeric.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32574 entries, 1 to 32580
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              32574 non-null  int64  
 1   age             32574 non-null  int64  
 2   income          32574 non-null  int64  
 3   home            32574 non-null  int32  
 4   emp_length      32574 non-null  float64
 5   intent          32574 non-null  int32  
 6   amount          32574 non-null  int64  
 7   rate            32574 non-null  float64
 8   status          32574 non-null  int64  
 9   percent_income  32574 non-null  float64
 10  default         32574 non-null  int64  
 11  cred_length     32574 non-null  int64  
dtypes: float64(3), int32(2), int64(7)
memory usage: 3.0 MB


In [19]:
# Persist the cleaned, encoded dataset; index=False leaves the row index
# out of the file.
dataset.to_csv("data/credit_risk_data_processed.csv", index=False)

In [20]:
# Class balance of the encoded "default" flag after cleaning (1 = "Y", 0 = "N").
dataset['default'].value_counts()

default
0    26830
1     5744
Name: count, dtype: int64