# Preparing Data For the Modelling

Importing libraries


In [2]:
import pandas as pd
import numpy as np


In [3]:
## Loading the dataset
df = pd.read_csv("data/Loan_Data_New.csv")
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1000000,7423388,5583,5583,5530,60 months,9.02,177.59,C,F5,...,73260,38.5,3,8,7195,8.6,75743,2,7,14
1,1000001,7550634,12889,12889,12818,60 months,21.1,486.26,E,F5,...,61649,89.6,2,16,15986,76.7,20687,3,12,11
2,1000002,5304572,28413,28413,28365,60 months,22.0,1085.11,F,G2,...,67743,14.2,0,2,17843,69.1,33780,4,9,6
3,1000003,3234489,13275,13275,13177,36 months,10.78,433.22,C,A1,...,13596,35.8,6,13,7374,73.3,51219,3,7,2
4,1000004,8204212,3461,3461,3451,36 months,24.05,135.88,C,C3,...,32746,69.8,0,2,17830,66.8,90069,8,9,6


In [4]:
pd.set_option('display.max_columns', None)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 74 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           2000 non-null   int64  
 1   member_id                    2000 non-null   int64  
 2   loan_amnt                    2000 non-null   int64  
 3   funded_amnt                  2000 non-null   int64  
 4   funded_amnt_inv              2000 non-null   int64  
 5   term                         2000 non-null   object 
 6   int_rate                     2000 non-null   float64
 7   installment                  2000 non-null   float64
 8   grade                        2000 non-null   object 
 9   sub_grade                    2000 non-null   object 
 10  emp_title                    1800 non-null   object 
 11  emp_length                   2000 non-null   object 
 12  home_ownership               2000 non-null   object 
 13  annual_inc        

All Continuos Variable Must be Numeric

In [6]:
df['emp_length'].unique()

array(['7 years', '10+ years', '2 years', '6 years', '1 year', '8 years',
       '4 years', '5 years', '< 1 year', '9 years', '3 years'],
      dtype=object)

In [7]:
# Cleaning the emp_length column
df['emp_length'] = df['emp_length'].str.replace(' years', '')
df['emp_length'] = df['emp_length'].str.replace(' year', '')
df['emp_length'] = df['emp_length'].str.replace('< 1', '0')
df['emp_length'] = df['emp_length'].str.replace('10+', '10')
df['emp_length'] = df['emp_length'].str.replace('n/a', '0')
df['emp_length'] = df['emp_length'].astype(int)

Checking the conversion

In [8]:
df['emp_length'].unique()

array([ 7, 10,  2,  6,  1,  8,  4,  5,  0,  9,  3])

In [9]:
type(df['emp_length'][0])

numpy.int64

In [10]:
df['emp_length'] = pd.to_numeric(df['emp_length'])
# All Continuos Variable Must be Numeric
type(df['emp_length'][0])

numpy.int64

## Credit Line Column Handling
Like when the loan was dispersed

In [11]:
df['earliest_cr_line_date'] = df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], format='%b-%y', errors='coerce')

In [12]:
mask = df['earliest_cr_line_date'].dt.year > 2025
df.loc[mask, 'earliest_cr_line_date'] = df.loc[mask, 'earliest_cr_line_date'] + pd.DateOffset(years=100)
df['earliest_cr_line_date'].head()

# Assuming the date before 2025 should be 1900s
df['earliest_cr_line_date'].min()

Timestamp('1980-01-01 00:00:00')

In [13]:
type(df['earliest_cr_line_date'][0])

pandas._libs.tslibs.timestamps.Timestamp

Finding overall time passed after the earliest_Credit_Line

In [14]:
pd.to_datetime("2025-10-12") - df['earliest_cr_line_date'].head()

0   9234 days
1   7469 days
2   8626 days
3   7742 days
4   8381 days
Name: earliest_cr_line_date, dtype: timedelta64[ns]

Converting this in "Month" Format

In [15]:
## In months
df['earliest_cr_line_months'] = (pd.to_datetime("2025-10-12") - df['earliest_cr_line_date']) / pd.Timedelta(days=30.44)

In [16]:
df['earliest_cr_line_months'] = (pd.to_datetime("2025-10-12") - df['earliest_cr_line_date']).dt.days / 30.44
df['earliest_cr_line_months'].head()

0    303.350854
1    245.367937
2    283.377135
3    254.336399
4    275.328515
Name: earliest_cr_line_months, dtype: float64

In [17]:

df['earliest_cr_line_months'].describe()

count    2000.000000
mean      370.071797
std       105.306377
min       190.341656
25%       277.332457
50%       371.320631
75%       463.370565
max       549.310118
Name: earliest_cr_line_months, dtype: float64

## Default Flag (Dependent Variable)

 Charged Off - > Bank loses hope of getting money and marks Loss in their books. 

### Note: I have taken these Conditions as Default , The terms can change according to the Company or firms

In [28]:
file_path = "data/Loan_Data_New.csv"

try:
    df = pd.read_csv(file_path)

    # 'default' if 1, else 0
    df['default'] = df['loan_status'].apply(lambda x: 1 if x in ['Charged Off', 'Default', 'Late (31-120 days)', 'Late (16-30 days)'] else 0)

    # Count of default and non-default
    default_counts = df['default'].value_counts()
    print("Default Counts Distribution:")
    print(default_counts)
    # 1 shows defaullted Loans

    # Print how many loans fall into each original loan_status category
    print("\nLoan status distribution:")
    print(df['loan_status'].value_counts())

except FileNotFoundError:
    print(f"File not found: {file_path}")
    


Default Counts Distribution:
default
0    1190
1     810
Name: count, dtype: int64

Loan status distribution:
loan_status
Current               418
In Grace Period       415
Late (31-120 days)    415
Charged Off           395
Fully Paid            357
Name: count, dtype: int64


## Train-Test Split 

In [29]:
from sklearn.model_selection import train_test_split

In [34]:
def drop_rows_with_missing_loan_status(df):
    initial_shape = df.shape
    df = df.dropna(subset=['loan_status'])
    final_shape = df.shape
    print(f"Dropped {initial_shape[0] - final_shape[0]} rows with missing loan_status")
    return df

In [38]:
df = drop_rows_with_missing_loan_status(df)

Dropped 0 rows with missing loan_status


Feature Selection (Input Variables)

In [41]:
x = df.drop(columns='loan_status')
y = df['loan_status']

train and test data splitting
- test size = 20% 
- training size = 80%

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
print("Training features set size:", x_train.shape) # Number of rows and columns in x_train
print("Testing features set size:", x_test.shape) # Number of rows and columns in x_test
print("Training labels size:", y_train.shape) # Number of labels in y_train
print("Testing labels size:", y_test.shape) # Number of labels in y_test



Training features set size: (1600, 74)
Testing features set size: (400, 74)
Training labels size: (1600,)
Testing labels size: (400,)
