# Credit Risk Modeling – Feature Engineering & Modeling

This notebook is part of an end-to-end credit risk modeling project completed
during my Data Science Internship at HDFC Capital Advisors Ltd.

⚠️ Note: Due to data confidentiality, raw datasets are not included.
The notebook demonstrates methodology, feature engineering logic,
modeling approach, and evaluation techniques.

In [3]:
import json
import pandas as pd

In [35]:
LOAD_DATA = False  # Set to True only in secure local environment

if LOAD_DATA:
    with open('./data/senior_ds_test/data/test/accounts_data_test.json', 'r') as f:
        df = json.load(f)
else:
    df = []

flat_list = [item for sublist in df for item in sublist]
df_enq = pd.DataFrame(flat_list)


In [36]:
df_enq.head()

Unnamed: 0,enquiry_type,enquiry_amt,enquiry_date,uid
0,Car loan,143000,2020-12-13,AAA02107680
1,Real estate loan,174000,2020-12-01,AAA14437029
2,Loan for working capital replenishment,65000,2019-07-01,AAA14437029
3,Loan for working capital replenishment,118000,2020-08-05,AAA14437029
4,Car loan,12000,2020-02-28,AAA14437029


In [37]:
df_enq.columns

Index(['enquiry_type', 'enquiry_amt', 'enquiry_date', 'uid'], dtype='object')

In [38]:
df_enq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337662 entries, 0 to 337661
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   enquiry_type  337662 non-null  object
 1   enquiry_amt   337662 non-null  int64 
 2   enquiry_date  337662 non-null  object
 3   uid           337662 non-null  object
dtypes: int64(1), object(3)
memory usage: 10.3+ MB


In [39]:
df_enq.shape

(337662, 4)

In [40]:
df_enq["enquiry_date"] = pd.to_datetime(df_enq["enquiry_date"],format= "%Y-%m-%d")

In [41]:
df_enq["uid"].value_counts().to_frame().sort_values(by='uid')

Unnamed: 0_level_0,count
uid,Unnamed: 1_level_1
AAA02107680,1
AAA14437029,4
AAB12915377,1
AAB55088883,14
AAB68152393,17
...,...
ZZX89351447,2
ZZX95870699,2
ZZY22936172,15
ZZY83980941,5


In [42]:
df_enq["enquiry_type"].value_counts()

enquiry_type
Cash loans                                      53125
Revolving loans                                 21168
Car loan                                        17894
Credit card                                     17701
Another type of loan                            17619
Real estate loan                                17618
Mobile operator loan                            17599
Microloan                                       17589
Loan for purchase of shares (margin lending)    17575
Unknown type of loan                            17524
Loan for business development                   17523
Loan for working capital replenishment          17511
Mortgage                                        17504
Cash loan (non-earmarked)                       17467
Loan for the purchase of equipment              17467
Interbank credit                                17393
Consumer credit                                 17385
Name: count, dtype: int64

In [43]:
df_enq.loc[df_enq.duplicated(subset=["uid","enquiry_amt","enquiry_date"])].sort_values(by="enquiry_date")

Unnamed: 0,enquiry_type,enquiry_amt,enquiry_date,uid
176803,Interbank credit,194000,2019-09-19,NRS91676037
180716,Unknown type of loan,46000,2019-11-06,NZZ86165750
308649,Revolving loans,106000,2020-01-15,XTJ19220207
97924,Loan for working capital replenishment,195000,2020-02-14,HOD10243918
181005,Mobile operator loan,165000,2020-06-18,OAM40530554
87837,Interbank credit,51000,2020-07-02,GTQ90036679
169639,Mobile operator loan,101000,2020-07-10,NEI98297309
5931,Unknown type of loan,66000,2020-08-06,AML54336031
29856,Mortgage,184000,2020-09-18,CIH12552396
137324,Mobile operator loan,78000,2020-10-24,KQD34786939


In [44]:
#No. of enquiries made by each customer for different categories. 
enq_counts = df_enq.groupby(["uid","enquiry_type"])["enquiry_type"].size().to_frame()

enq_counts.head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,enquiry_type
uid,enquiry_type,Unnamed: 2_level_1
AAA02107680,Car loan,1
AAA14437029,Car loan,1
AAA14437029,Loan for working capital replenishment,2
AAA14437029,Real estate loan,1
AAB12915377,Real estate loan,1
AAB55088883,Another type of loan,1
AAB55088883,Cash loan (non-earmarked),1
AAB55088883,Cash loans,5
AAB55088883,Loan for business development,1
AAB55088883,Loan for the purchase of equipment,1


In [45]:
df_enq.shape

(337662, 4)

## Features

In [46]:
df_enq['enquiry_year']=df_enq["enquiry_date"].dt.year

#month enquired
df_enq['enquiry_month']=df_enq["enquiry_date"].dt.month

# day enquired
df_enq['enquiry_day']=df_enq["enquiry_date"].dt.day

In [47]:
df_enq["Avg_amount_enquired"] = df_enq["uid"].map(df_enq.groupby('uid')['enquiry_amt'].mean().round(2))
df_enq["max_amount_enquired"] = df_enq["uid"].map(df_enq.groupby('uid')['enquiry_amt'].max())
df_enq["min_amount_enquired"] = df_enq["uid"].map(df_enq.groupby('uid')['enquiry_amt'].min())

In [48]:
df_enq['total_enquiries_per_customer'] = df_enq.groupby('uid')['enquiry_type'].transform('count')

In [49]:
df_enq.head(10)

Unnamed: 0,enquiry_type,enquiry_amt,enquiry_date,uid,enquiry_year,enquiry_month,enquiry_day,Avg_amount_enquired,max_amount_enquired,min_amount_enquired,total_enquiries_per_customer
0,Car loan,143000,2020-12-13,AAA02107680,2020,12,13,143000.0,143000,143000,1
1,Real estate loan,174000,2020-12-01,AAA14437029,2020,12,1,92250.0,174000,12000,4
2,Loan for working capital replenishment,65000,2019-07-01,AAA14437029,2019,7,1,92250.0,174000,12000,4
3,Loan for working capital replenishment,118000,2020-08-05,AAA14437029,2020,8,5,92250.0,174000,12000,4
4,Car loan,12000,2020-02-28,AAA14437029,2020,2,28,92250.0,174000,12000,4
5,Real estate loan,137000,2019-11-11,AAB12915377,2019,11,11,137000.0,137000,137000,1
6,Unknown type of loan,129000,2020-09-06,AAB55088883,2020,9,6,78928.57,176000,6000,14
7,Loan for the purchase of equipment,30000,2018-11-28,AAB55088883,2018,11,28,78928.57,176000,6000,14
8,Real estate loan,6000,2020-09-15,AAB55088883,2020,9,15,78928.57,176000,6000,14
9,Mortgage,79000,2019-09-22,AAB55088883,2019,9,22,78928.57,176000,6000,14


In [50]:
# count of enquiry types per customer.
df_enq['unique_enquiry_types_per_customer'] = df_enq.groupby('uid')['enquiry_type'].transform('nunique')


In [51]:
df_enq[df_enq['uid'] == 'AAA14437029'][['enquiry_type', 'uid']].value_counts()

enquiry_type                            uid        
Loan for working capital replenishment  AAA14437029    2
Car loan                                AAA14437029    1
Real estate loan                        AAA14437029    1
Name: count, dtype: int64

In [52]:
df_enq['total_enquiries_per_type'] = df_enq.groupby(['enquiry_type'])['enquiry_type'].transform('count')

In [53]:
#Percentage of each type of enquiries per customer by the total no of enquiries for that type

df_enq['enquiry_percentage'] = (df_enq['total_enquiries_per_customer'] / df_enq['total_enquiries_per_type']) * 100

In [54]:
grouped = df_enq.groupby(['uid', 'enquiry_type']).size().reset_index(name='count')

# Pivot the DataFrame
pivot_table = grouped.pivot(index='uid', columns='enquiry_type', values='count').fillna(0).astype(int)

# Flatten the pivot table columns
pivot_table.columns = [f'{col}_total_enquiries' for col in pivot_table.columns]

# Reset index to prepare for merge
pivot_table.reset_index(inplace=True)

# Merge the pivoted data back into the main DataFrame (df_enq)
df_enq = pd.merge(df_enq, pivot_table, on='uid', how='left')


In [55]:
enquiry_percentage_aggregates = df_enq.groupby('enquiry_type')['enquiry_percentage'].agg(['mean', 'sum', 'min', 'max']).reset_index()
enquiry_percentage_aggregates.columns = ['enquiry_type', 'mean_enquiry_percentage', 'total_enquiry_percentage', 'min_enquiry_percentage', 'max_enquiry_percentage']

# Merge aggregated statistics back into df_enq
df_enq = df_enq.merge(enquiry_percentage_aggregates, on='enquiry_type', how='left')

In [56]:
# first enquiry date per type
df_enq['first_enquiry_date'] = df_enq.groupby(['uid', 'enquiry_type'])['enquiry_date'].transform('min')


In [57]:
# last enquiry date per type
df_enq['last_enquiry_date'] = df_enq.groupby(['uid', 'enquiry_type'])['enquiry_date'].transform('max')

In [58]:
max_date = df_enq['enquiry_date'].max()

In [59]:
max_date

Timestamp('2021-01-01 00:00:00')

In [60]:
df_enq.shape

(337662, 37)

###### COUNT OF ENQUIRIES ACC TO NO. OF MONTHS

In [61]:
#count of enquiries in the last 1 months
a_month_ago = max_date - pd.DateOffset(months=1)
df_enq['enquiries_last_1_month'] = df_enq.groupby('uid')['enquiry_date'].transform(lambda x: x.gt(a_month_ago).sum())

In [62]:
#count of enquiries in the last 3 months
three_months_ago = max_date - pd.DateOffset(months=3)
df_enq['enquiries_last_3_months'] = df_enq.groupby('uid')['enquiry_date'].transform(lambda x: x.gt(three_months_ago).sum())

In [63]:
#count of enquiries in the last 6 months
six_months_ago = max_date - pd.DateOffset(months=6)
df_enq['enquiries_last_6_months'] = df_enq.groupby('uid')['enquiry_date'].transform(lambda x: x.gt(six_months_ago).sum())

In [64]:
#count of enquiries in the last 9 months
nine_months_ago = max_date - pd.DateOffset(months=9)
df_enq['enquiries_last_9_months'] = df_enq.groupby('uid')['enquiry_date'].transform(lambda x: x.gt(nine_months_ago).sum())

In [65]:
#count of enquiries in the last year 
max_date = df_enq['enquiry_date'].max()
one_year_ago = max_date - pd.DateOffset(years=1)
df_enq['enquiries_last_year'] = df_enq.groupby('uid')['enquiry_date'].transform(lambda x: x.gt(one_year_ago).sum())

In [66]:
df_enq.shape

(337662, 42)

In [67]:
df_enq.head()

Unnamed: 0,enquiry_type,enquiry_amt,enquiry_date,uid,enquiry_year,enquiry_month,enquiry_day,Avg_amount_enquired,max_amount_enquired,min_amount_enquired,...,total_enquiry_percentage,min_enquiry_percentage,max_enquiry_percentage,first_enquiry_date,last_enquiry_date,enquiries_last_1_month,enquiries_last_3_months,enquiries_last_6_months,enquiries_last_9_months,enquiries_last_year
0,Car loan,143000,2020-12-13,AAA02107680,2020,12,13,143000.0,143000,143000,...,1398.278753,0.005588,0.312954,2020-12-13,2020-12-13,1,1,1,1,1
1,Real estate loan,174000,2020-12-01,AAA14437029,2020,12,1,92250.0,174000,12000,...,1410.216824,0.005676,0.317857,2020-12-01,2020-12-01,0,1,2,2,3
2,Loan for working capital replenishment,65000,2019-07-01,AAA14437029,2019,7,1,92250.0,174000,12000,...,1394.300725,0.005711,0.319799,2019-07-01,2020-08-05,0,1,2,2,3
3,Loan for working capital replenishment,118000,2020-08-05,AAA14437029,2020,8,5,92250.0,174000,12000,...,1394.300725,0.005711,0.319799,2019-07-01,2020-08-05,0,1,2,2,3
4,Car loan,12000,2020-02-28,AAA14437029,2020,2,28,92250.0,174000,12000,...,1398.278753,0.005588,0.312954,2020-02-28,2020-02-28,0,1,2,2,3


###### ENQUIRY AMOUNT

In [68]:
pivot_mean = pd.pivot_table(df_enq, 
                            index='uid', 
                            columns='enquiry_type', 
                            values='enquiry_amt', 
                            aggfunc='mean', 
                            fill_value=0)

# Flatten the columns of the pivot table
pivot_mean.columns = [f'{credit}_mean_enquiry_amt' for credit in pivot_mean.columns]

# Reset the index to merge with the original dataframe
pivot_mean.reset_index(inplace=True)

df_enq = df_enq.merge(pivot_mean, on='uid', how='left')

In [69]:
pivot_median = pd.pivot_table(df_enq, 
                            index='uid', 
                            columns='enquiry_type', 
                            values='enquiry_amt', 
                            aggfunc='median', 
                            fill_value=0)

# Flatten the columns of the pivot table
pivot_median.columns = [f'{credit}_median_enquiry_amt' for credit in pivot_median.columns]

# Reset the index to merge with the original dataframe
pivot_median.reset_index(inplace=True)

df_enq = df_enq.merge(pivot_median, on='uid', how='left')

In [70]:
pivot_sum = pd.pivot_table(df_enq, 
                            index='uid', 
                            columns='enquiry_type', 
                            values='enquiry_amt', 
                            aggfunc='sum', 
                            fill_value=0)

# Flatten the columns of the pivot table
pivot_sum.columns = [f'{credit}_total_enquiry_amt' for credit in pivot_sum.columns]

# Reset the index to merge with the original dataframe
pivot_sum.reset_index(inplace=True)

df_enq = df_enq.merge(pivot_sum, on='uid', how='left')

In [71]:
df_enq.head()

Unnamed: 0,enquiry_type,enquiry_amt,enquiry_date,uid,enquiry_year,enquiry_month,enquiry_day,Avg_amount_enquired,max_amount_enquired,min_amount_enquired,...,Loan for business development_total_enquiry_amt,Loan for purchase of shares (margin lending)_total_enquiry_amt,Loan for the purchase of equipment_total_enquiry_amt,Loan for working capital replenishment_total_enquiry_amt,Microloan_total_enquiry_amt,Mobile operator loan_total_enquiry_amt,Mortgage_total_enquiry_amt,Real estate loan_total_enquiry_amt,Revolving loans_total_enquiry_amt,Unknown type of loan_total_enquiry_amt
0,Car loan,143000,2020-12-13,AAA02107680,2020,12,13,143000.0,143000,143000,...,0,0,0,0,0,0,0,0,0,0
1,Real estate loan,174000,2020-12-01,AAA14437029,2020,12,1,92250.0,174000,12000,...,0,0,0,183000,0,0,0,174000,0,0
2,Loan for working capital replenishment,65000,2019-07-01,AAA14437029,2019,7,1,92250.0,174000,12000,...,0,0,0,183000,0,0,0,174000,0,0
3,Loan for working capital replenishment,118000,2020-08-05,AAA14437029,2020,8,5,92250.0,174000,12000,...,0,0,0,183000,0,0,0,174000,0,0
4,Car loan,12000,2020-02-28,AAA14437029,2020,2,28,92250.0,174000,12000,...,0,0,0,183000,0,0,0,174000,0,0


In [72]:

df_enq.loc[df_enq['uid'] == 'AAA08065248', ['enquiry_type', 'total_enquiries_per_type', 'uid']]

Unnamed: 0,enquiry_type,total_enquiries_per_type,uid


In [73]:
df_enq.shape

(337662, 93)

In [None]:
def get_user_info(df, user_id):
    user_df = df[df['uid'] == user_id]
    return user_df.to_dict(orient='records')

# Get the information for user ID 'AAA09044550'
user_info = get_user_info(df_with_total, 'AAA02107680')

user_info

### Feature Drops

In [None]:
df_enq.loc[1:3]

In [74]:
columns_to_drop_enq =[
    "enquiry_type",
    "enquiry_amt",
    "enquiry_date",
    "enquiry_year",
    "enquiry_month",
    "enquiry_day",
    "total_enquiries_per_type",
    "enquiry_percentage",
    "mean_enquiry_percentage",
    "total_enquiry_percentage",
    "min_enquiry_percentage",
    "max_enquiry_percentage",
    "first_enquiry_date",
    "last_enquiry_date"   
]
df_enq = df_enq.drop(columns=columns_to_drop_enq)

In [75]:
df_enq.shape

(337662, 79)

In [76]:
df_enq["uid"].nunique()

46127

In [77]:
df_enq["uid"].value_counts()

uid
KUU85591462    56
BPH76877133    56
LDV06779333    56
MHF18611738    54
BNW36481691    53
               ..
TEM25120273     1
HCV23098846     1
TEM32699013     1
TEM41519794     1
AAA02107680     1
Name: count, Length: 46127, dtype: int64

In [78]:
df_enq.duplicated(subset="uid")

0         False
1         False
2          True
3          True
4          True
          ...  
337657     True
337658     True
337659     True
337660     True
337661     True
Length: 337662, dtype: bool

In [79]:
df_enq = df_enq.drop_duplicates(subset="uid",keep="first")

In [80]:
df_enq.shape

(46127, 79)

In [81]:
enq_prefix = 'enqList_'

# Add prefix to all columns in df_enq except 'uid'
df_enq = df_enq.rename(columns=lambda x: enq_prefix + x if x != 'uid' else x)

In [88]:
df_enq.to_csv('df_enq_test.csv', index=False)

In [None]:
user_dict = df_enq[df_enq["uid"]=='KUU85591462'].to_dict('records')

In [None]:
user_dict

In [87]:
df_enq.columns.to_list()

['uid',
 'enqList_Avg_amount_enquired',
 'enqList_max_amount_enquired',
 'enqList_min_amount_enquired',
 'enqList_total_enquiries_per_customer',
 'enqList_unique_enquiry_types_per_customer',
 'enqList_Another type of loan_total_enquiries',
 'enqList_Car loan_total_enquiries',
 'enqList_Cash loan (non-earmarked)_total_enquiries',
 'enqList_Cash loans_total_enquiries',
 'enqList_Consumer credit_total_enquiries',
 'enqList_Credit card_total_enquiries',
 'enqList_Interbank credit_total_enquiries',
 'enqList_Loan for business development_total_enquiries',
 'enqList_Loan for purchase of shares (margin lending)_total_enquiries',
 'enqList_Loan for the purchase of equipment_total_enquiries',
 'enqList_Loan for working capital replenishment_total_enquiries',
 'enqList_Microloan_total_enquiries',
 'enqList_Mobile operator loan_total_enquiries',
 'enqList_Mortgage_total_enquiries',
 'enqList_Real estate loan_total_enquiries',
 'enqList_Revolving loans_total_enquiries',
 'enqList_Unknown type of l