In [None]:
import pandas as pd

In [14]:
dataset = pd.read_csv("data/credit_risk.csv", sep=",")

In [15]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              32581 non-null  int64  
 1   Age             32581 non-null  int64  
 2   Income          32581 non-null  int64  
 3   Home            32581 non-null  object 
 4   Emp_length      31686 non-null  float64
 5   Intent          32581 non-null  object 
 6   Amount          32581 non-null  int64  
 7   Rate            29465 non-null  float64
 8   Status          32581 non-null  int64  
 9   Percent_income  32581 non-null  float64
 10  Default         32581 non-null  object 
 11  Cred_length     32581 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 3.0+ MB


In [16]:
dataset.head()

Unnamed: 0,Id,Age,Income,Home,Emp_length,Intent,Amount,Rate,Status,Percent_income,Default,Cred_length
0,0,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,0.59,Y,3
1,1,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
2,2,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
3,3,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
4,4,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4


- ID: Unique identifier for each loan applicant.
- Age: Age of the loan applicant.
- Income: Income of the loan applicant.
- Home: Home ownership status (Own, Mortgage, Rent).
- Emp_Length: Employment length in years.
- Intent: Purpose of the loan (e.g., education, home improvement).
- Amount: Loan amount applied for.
- Rate: Interest rate on the loan.
- Status: Loan approval status (Fully Paid, Charged Off, Current).
- Percent_Income: Loan amount as a percentage of income.
- Default: Whether the applicant has defaulted on a loan previously (Yes, No).
- Cred_Length: Length of the applicant's credit history.

In [17]:
# Get unique values from "intent" column
unique_intents = dataset["Intent"].unique()
print("Unique intents:", unique_intents)

Unique intents: ['PERSONAL' 'EDUCATION' 'MEDICAL' 'VENTURE' 'HOMEIMPROVEMENT'
 'DEBTCONSOLIDATION']


In [18]:
# Transform the columns to lowercase
columns = [col.lower() for col in dataset.columns]
dataset.columns = columns
columns = dataset.columns
print(columns)

Index(['id', 'age', 'income', 'home', 'emp_length', 'intent', 'amount', 'rate',
       'status', 'percent_income', 'default', 'cred_length'],
      dtype='object')


In [19]:
home_mapping = {item: i for i, item in enumerate(dataset["home"].unique())}
intent_mapping = {item: i for i, item in enumerate(dataset["intent"].unique())}
cred_mapping = {item: i for i, item in enumerate(dataset["cred_length"].unique())}
default_mapping = {
    "Y": 1,
    "N": 0
}


In [20]:
dataset_mapping = {
    "home": home_mapping,
    "intent": intent_mapping,
    "cred_length": cred_mapping,
    "default": default_mapping,
}

In [21]:
def convert_categorical_features_dict(df, mapping):
    for column_name, mapping in mapping.items():
        df[column_name] = df[column_name].map(mapping)
    return df

convert_categorical_features_dict(dataset, dataset_mapping)
dataset.head()

Unnamed: 0,id,age,income,home,emp_length,intent,amount,rate,status,percent_income,default,cred_length
0,0,22,59000,0,123.0,0,35000,16.02,1,0.59,1,0
1,1,21,9600,1,5.0,1,1000,11.14,0,0.1,0,1
2,2,25,9600,2,1.0,2,5500,12.87,1,0.57,0,0
3,3,23,65500,0,4.0,2,35000,15.23,1,0.53,0,1
4,4,24,54400,0,8.0,2,35000,14.27,1,0.55,1,2


In [22]:
# Remove nan values
dataset = dataset.dropna()
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Index: 28638 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              28638 non-null  int64  
 1   age             28638 non-null  int64  
 2   income          28638 non-null  int64  
 3   home            28638 non-null  int64  
 4   emp_length      28638 non-null  float64
 5   intent          28638 non-null  int64  
 6   amount          28638 non-null  int64  
 7   rate            28638 non-null  float64
 8   status          28638 non-null  int64  
 9   percent_income  28638 non-null  float64
 10  default         28638 non-null  int64  
 11  cred_length     28638 non-null  int64  
dtypes: float64(3), int64(9)
memory usage: 2.8 MB


Unnamed: 0,id,age,income,home,emp_length,intent,amount,rate,status,percent_income,default,cred_length
0,0,22,59000,0,123.0,0,35000,16.02,1,0.59,1,0
1,1,21,9600,1,5.0,1,1000,11.14,0,0.1,0,1
2,2,25,9600,2,1.0,2,5500,12.87,1,0.57,0,0
3,3,23,65500,0,4.0,2,35000,15.23,1,0.53,0,1
4,4,24,54400,0,8.0,2,35000,14.27,1,0.55,1,2


In [23]:
# Save the dataset to a CSV file
dataset.to_csv("data/credit_risk_data_processed.csv", index=False)