# Credit Risk Modeling – Feature Engineering & Modeling

This notebook is part of an end-to-end credit risk modeling project completed
during my Data Science Internship at HDFC Capital Advisors Ltd.

⚠️ Note: Due to data confidentiality, raw datasets are not included.
The notebook demonstrates methodology, feature engineering logic,
modeling approach, and evaluation techniques.

In [3]:
import json
import pandas as pd

In [None]:
pd.set_option('display.max_rows', 100000)

In [None]:
pd.set_option('display.max_columns', 140)

In [9]:
LOAD_DATA = False  # Set to True only in secure local environment

if LOAD_DATA:
    with open('./data/senior_ds_test/data/test/accounts_data_test.json', 'r') as f:
        df = json.load(f)
else:
    df = []


flat_list = [item for sublist in df for item in sublist]
df_acc_test = pd.DataFrame(flat_list)

In [10]:
df_acc_test.shape

(220013, 7)

In [158]:
df_acc_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220013 entries, 0 to 220012
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   credit_type          220013 non-null  object 
 1   loan_amount          220013 non-null  float64
 2   amount_overdue       220013 non-null  float64
 3   open_date            220013 non-null  object 
 4   closed_date          138377 non-null  object 
 5   payment_hist_string  220013 non-null  object 
 6   uid                  220013 non-null  object 
dtypes: float64(2), object(5)
memory usage: 11.8+ MB


In [159]:
df_acc_test.describe()

Unnamed: 0,loan_amount,amount_overdue
count,220013.0,220013.0
mean,347921.9,29.252941
std,876513.3,3227.887998
min,0.0,0.0
25%,51250.5,0.0
50%,124843.5,0.0
75%,306000.0,0.0
max,58032000.0,959386.5


In [160]:
df_acc_test.nunique()

credit_type               12
loan_amount            58104
amount_overdue           317
open_date               2922
closed_date             2854
payment_hist_string    11948
uid                    39572
dtype: int64

In [161]:
df_acc_test.columns

Index(['credit_type', 'loan_amount', 'amount_overdue', 'open_date',
       'closed_date', 'payment_hist_string', 'uid'],
      dtype='object')

In [162]:
df_acc_test["credit_type"].value_counts()

credit_type
Consumer credit                           160868
Credit card                                51430
Car loan                                    3520
Mortgage                                    2411
Microloan                                   1239
Loan for business development                255
Another type of loan                         122
Unknown type of loan                          98
Loan for working capital replenishment        57
Cash loan (non-earmarked)                      6
Real estate loan                               6
Loan for the purchase of equipment             1
Name: count, dtype: int64

In [163]:
df_acc_test["open_date"]=pd.to_datetime(df_acc_test["open_date"],format='%Y-%m-%d')
df_acc_test["closed_date"]=pd.to_datetime(df_acc_test["closed_date"],format='%Y-%m-%d')

In [164]:
df_acc_test[df_acc_test.duplicated(subset=["uid",'open_date','closed_date',"loan_amount","credit_type"])].sort_values(by='uid').head(100)

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid
214,Credit card,0.0,0.0,2018-12-23,NaT,0000000000000000000000000000000000000000000000...,AAP65900343
292,Credit card,0.0,0.0,2020-03-21,NaT,000000000000000000000000000,AAV99121101
843,Consumer credit,4050000.0,0.0,2016-03-24,2017-01-14,000000000000000000000000000,ACS14933700
879,Consumer credit,26262.0,0.0,2019-03-27,2020-04-04,000000000000000000000000000000000000,ACV24888207
1245,Consumer credit,135000.0,0.0,2016-05-13,2017-05-09,000000000000000000000000000000000000,AEC50212068
1402,Consumer credit,270000.0,0.0,2013-04-15,2016-06-26,0000000000000000000000000000000000000000000000...,AEK88063607
1444,Consumer credit,162000.0,0.0,2017-11-20,NaT,0000000000000000000000000000000000000000000000...,AER58510201
1540,Consumer credit,18841.5,0.0,2017-04-11,2017-07-14,000000000,AFD28441871
1568,Credit card,0.0,0.0,2019-08-26,NaT,000000000000000000000000000000000000000000000000,AFF64963110
1592,Microloan,22500.0,0.0,2020-07-18,2020-08-05,,AFG91382199


In [165]:
df_acc_test.duplicated(subset=["uid", "open_date", "closed_date","loan_amount","credit_type"]).sum()

1343

In [166]:
df_acc_test = df_acc_test.sort_values(by="uid").drop_duplicates(subset=["uid", "open_date", "closed_date","loan_amount"],keep='first')


In [167]:
df_acc_test.shape

(218614, 7)

In [168]:
df_acc_test.loc[df_acc_test["loan_amount"].isnull()]

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid


In [169]:
df_acc_test[df_acc_test['loan_amount']==0].head()

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid
44,Credit card,0.0,0.0,2015-09-21,2017-10-24,0000000000000000000000000000000000000000000000...,AAC85763409
105,Credit card,0.0,0.0,2019-08-23,NaT,000000000000000000000000000000000000000000000000,AAG67018395
140,Credit card,0.0,0.0,2018-11-03,NaT,0000000000000000000000000000000000000000000000...,AAK79175387
158,Credit card,0.0,0.0,2019-06-28,NaT,0000000000000000000000000000000000000000000000...,AAL42245978
149,Credit card,0.0,0.0,2019-06-28,2020-06-27,000000000000000012042072000000000000,AAL42245978


In [170]:


(df_acc_test['loan_amount']==0).sum()

8006

In [171]:
df_acc_test = df_acc_test[df_acc_test['loan_amount']!=0]

In [172]:
df_acc_test.shape

(210608, 7)

In [173]:
#checking if closed date is earlier than the open date. 

df_acc_test[(df_acc_test['open_date'] > df_acc_test['closed_date'])]

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid
56901,Mortgage,2250000.0,0.0,2017-07-02,2013-05-27,,GTR00709853


In [174]:
df_acc_test[(df_acc_test['open_date'] > df_acc_test['closed_date'])].shape

(1, 7)

In [175]:
df_acc_test = df_acc_test[~(df_acc_test['open_date'] > df_acc_test['closed_date'])]


df_acc_test.shape
# drops 1 rows. 

(210607, 7)

In [176]:
#Feature 1 - Duration of account() in months. 

df_acc_test['duration_in_months'] = (df_acc_test['closed_date'] - df_acc_test['open_date']).dt.days / 30

df_acc_test.head()

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid,duration_in_months
0,Consumer credit,31630.5,0.0,2014-03-30,2014-11-29,000000000000000000000000,AAA14437029,8.133333
10,Consumer credit,1478322.0,0.0,2017-07-03,2019-11-11,0000000000000000000000000000000000000000000000...,AAA14437029,28.7
9,Credit card,67500.0,0.0,2014-09-18,2016-06-21,0000000000000000000000000000000000000000000000...,AAA14437029,21.4
8,Consumer credit,135000.0,0.0,2016-07-24,2016-12-05,000000000000,AAA14437029,4.466667
7,Credit card,42750.0,0.0,2019-10-26,NaT,000000000000000000000000000000000000000000,AAA14437029,


In [177]:
df_acc_test[(df_acc_test['duration_in_months'] > 1) & (df_acc_test['payment_hist_string'].isnull())]

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid,duration_in_months


In [178]:
df_acc_test[(df_acc_test['duration_in_months'] > 1) & (df_acc_test['payment_hist_string']=='')]

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid,duration_in_months


In [179]:
df_acc_test[df_acc_test['open_date'].isnull()]

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid,duration_in_months


In [180]:

print("Number of open accounts:")
df_acc_test['closed_date'].isnull().sum()

Number of open accounts:


76371

In [181]:
df_acc_test.shape

(210607, 8)

## Accounts Features

### Duration Aggregates

In [182]:
duration_aggregates = df_acc_test.groupby('uid')['duration_in_months'].agg(['mean', 'sum', 'min', 'max']).reset_index()
duration_aggregates.columns = ['uid', 'mean_duration_months', 'total_duration_months', 'min_duration_months', 'max_duration_months']

# Merge the aggregates back into df_acc
df_acc_test = df_acc_test.merge(duration_aggregates, on='uid', how='left')

In [183]:
df_acc_test.shape

(210607, 12)

### TIME BASED

In [184]:
print(df_acc_test["loan_amount"].describe())
print()
print('Maximum:',df_acc_test["loan_amount"].max())
print('Minimum:',df_acc_test["loan_amount"].min())

count    2.106070e+05
mean     3.600732e+05
std      8.879333e+05
min      4.500000e-01
25%      5.850000e+04
50%      1.350000e+05
75%      3.150000e+05
max      5.803200e+07
Name: loan_amount, dtype: float64

Maximum: 58032000.0
Minimum: 0.45


In [185]:
earliest_acc = df_acc_test.groupby('uid')['open_date'].min().reset_index()
earliest_acc.columns = ['uid', 'earliest_acc_date']

# Extract year, month, and day from earliest_acc_date
earliest_acc['earliest_acc_year'] = earliest_acc['earliest_acc_date'].dt.year
earliest_acc['earliest_acc_month'] = earliest_acc['earliest_acc_date'].dt.month
earliest_acc['earliest_acc_day'] = earliest_acc['earliest_acc_date'].dt.day

# Merge the extracted columns back into df_acc
df_acc_test = df_acc_test.merge(earliest_acc[['uid', 'earliest_acc_year', 'earliest_acc_month', 'earliest_acc_day']], on='uid', how='left')

In [186]:
latest_acc = df_acc_test.groupby('uid')['open_date'].max().reset_index()
latest_acc.columns = ['uid', 'latest_acc_date']

# Extract year, month, and day from latest_acc_date
latest_acc['latest_acc_year'] = latest_acc['latest_acc_date'].dt.year
latest_acc['latest_acc_month'] = latest_acc['latest_acc_date'].dt.month
latest_acc['latest_acc_day'] = latest_acc['latest_acc_date'].dt.day

df_acc_test = df_acc_test.merge(latest_acc[['uid', 'latest_acc_year', 'latest_acc_month', 'latest_acc_day']], on='uid', how='left')

In [187]:
df_acc_test.shape

(210607, 18)

### Loan Amount Categories

In [188]:
#loan_amount_categories.
bins = [0, 100000, 500000, 1000000, 5000000, 10000000, 50000000,100000000,500000000]

labels = ['0-100k', '100k-500k', '500k-1M', '1M-5M', '5M-10M', '10M-50M','50M-100M','100M-500M']

df_acc_test['loan_amount_category'] = pd.cut(df_acc_test['loan_amount'], bins=bins, labels=labels)
df_acc_test.head()

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid,duration_in_months,mean_duration_months,total_duration_months,min_duration_months,max_duration_months,earliest_acc_year,earliest_acc_month,earliest_acc_day,latest_acc_year,latest_acc_month,latest_acc_day,loan_amount_category
0,Consumer credit,31630.5,0.0,2014-03-30,2014-11-29,000000000000000000000000,AAA14437029,8.133333,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,0-100k
1,Consumer credit,1478322.0,0.0,2017-07-03,2019-11-11,0000000000000000000000000000000000000000000000...,AAA14437029,28.7,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,1M-5M
2,Credit card,67500.0,0.0,2014-09-18,2016-06-21,0000000000000000000000000000000000000000000000...,AAA14437029,21.4,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,0-100k
3,Consumer credit,135000.0,0.0,2016-07-24,2016-12-05,000000000000,AAA14437029,4.466667,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,100k-500k
4,Credit card,42750.0,0.0,2019-10-26,NaT,000000000000000000000000000000000000000000,AAA14437029,,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,0-100k


In [189]:
df_acc_test["loan_amount_category"].value_counts()

loan_amount_category
100k-500k    89055
0-100k       87842
500k-1M      17938
1M-5M        14481
5M-10M        1061
10M-50M        229
50M-100M         1
100M-500M        0
Name: count, dtype: int64

In [190]:
# Metrics on Loan Amount
df_acc_test["avg_loan_amount"] = df_acc_test["uid"].map(df_acc_test.groupby("uid")["loan_amount"].mean())
df_acc_test["median_loan_amount"] = df_acc_test["uid"].map(df_acc_test.groupby("uid")["loan_amount"].median())
df_acc_test["max_loan_amt"] = df_acc_test["uid"].map(df_acc_test.groupby("uid")["loan_amount"].max())
df_acc_test["min_loan_amt"] = df_acc_test["uid"].map(df_acc_test.groupby("uid")["loan_amount"].min())

In [191]:
df_acc_test["avg_monthly_payment"] = (df_acc_test["loan_amount"]/df_acc_test["duration_in_months"]).round(2)

In [192]:
df_acc_test.shape

(210607, 24)

### Overdues


In [193]:
df_acc_test["avg_overdues"] = df_acc_test["uid"].map(df_acc_test.groupby("uid")["amount_overdue"].mean())
df_acc_test["median_overdues"] = df_acc_test["uid"].map(df_acc_test.groupby("uid")["amount_overdue"].median())
df_acc_test["max_overdues"] = df_acc_test["uid"].map(df_acc_test.groupby("uid")["amount_overdue"].max())
df_acc_test["min_overdues"] = df_acc_test["uid"].map(df_acc_test.groupby("uid")["amount_overdue"].min())

### PAYMENT HISTORY

In [194]:
#Payment details
def analyze_payment_history(payment_history):
    months = [payment_history[i:i+3] for i in range(0, len(payment_history), 3)]

    late_payment_count = sum(1 for month in months if month != "000")
    max_consecutive_late_payments = 0
    consecutive_late_payments = 0
    last_payment_was_late = False
    
    for month in months:
        if month != "000":
            consecutive_late_payments = consecutive_late_payments + 1 if last_payment_was_late else 1
            last_payment_was_late = True
        else:
            max_consecutive_late_payments = max(max_consecutive_late_payments, consecutive_late_payments)
            consecutive_late_payments = 0
            last_payment_was_late = False
    
    max_consecutive_late_payments = max(max_consecutive_late_payments, consecutive_late_payments)
    on_time_payment_count = len(months) - late_payment_count
    time_since_last_late_payment = next((i for i, month in enumerate(reversed(months)) if month != "000"), len(months))

    return {
        'late_payments': late_payment_count,
        'on_time_payments': on_time_payment_count,
        'max_consecutive_late_payments': max_consecutive_late_payments,
        'time_since_last_late_payment': time_since_last_late_payment
    }
# Apply the function to the 'payment_hist_string' column and expand the result into separate columns
df_analysis = df_acc_test['payment_hist_string'].apply(analyze_payment_history).apply(pd.Series)

# Concatenate the results back to the original DataFrame
df_acc_test = pd.concat([df_acc_test, df_analysis], axis=1)

In [195]:
# metrics for on time payments 
df_acc_test["avg_ontime_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['on_time_payments'].mean().round(2))
df_acc_test["median_ontime_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['on_time_payments'].median())
df_acc_test["max_ontime_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['on_time_payments'].max())
df_acc_test["min_ontime_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['on_time_payments'].min())

In [196]:
# metrics for late payments 
df_acc_test["avg_late_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['late_payments'].mean().round(2))
df_acc_test["median_late_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['late_payments'].median())
df_acc_test["max_late_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['late_payments'].max())
df_acc_test["min_late_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['late_payments'].min())

In [197]:
# metrics for max_consecutive_late_payments
df_acc_test["avg_consecutive_late_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['max_consecutive_late_payments'].mean().round(2))
df_acc_test["median_consecutive_late_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['max_consecutive_late_payments'].median())
df_acc_test["max_consecutive_late_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['max_consecutive_late_payments'].max())
df_acc_test["min_consecutive_late_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['max_consecutive_late_payments'].min())

In [198]:
df_acc_test.shape

(210607, 43)

In [199]:
df_acc_test.head()

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid,duration_in_months,mean_duration_months,total_duration_months,min_duration_months,max_duration_months,earliest_acc_year,earliest_acc_month,earliest_acc_day,latest_acc_year,latest_acc_month,latest_acc_day,loan_amount_category,avg_loan_amount,median_loan_amount,max_loan_amt,min_loan_amt,avg_monthly_payment,avg_overdues,median_overdues,max_overdues,min_overdues,late_payments,on_time_payments,max_consecutive_late_payments,time_since_last_late_payment,avg_ontime_payments,median_ontime_payments,max_ontime_payments,min_ontime_payments,avg_late_payments,median_late_payments,max_late_payments,min_late_payments,avg_consecutive_late_payments,median_consecutive_late_payments,min_consecutive_late_payments
0,Consumer credit,31630.5,0.0,2014-03-30,2014-11-29,000000000000000000000000,AAA14437029,8.133333,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,0-100k,311359.6575,56173.5,1575000.0,14613.39,3889.0,0.0,0.0,0.0,0.0,0,8,10,8,19.08,13.0,55,4,0.83,0.0,10,0,0.83,0.0,10
1,Consumer credit,1478322.0,0.0,2017-07-03,2019-11-11,0000000000000000000000000000000000000000000000...,AAA14437029,28.7,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,1M-5M,311359.6575,56173.5,1575000.0,14613.39,51509.48,0.0,0.0,0.0,0.0,0,28,10,28,19.08,13.0,55,4,0.83,0.0,10,0,0.83,0.0,10
2,Credit card,67500.0,0.0,2014-09-18,2016-06-21,0000000000000000000000000000000000000000000000...,AAA14437029,21.4,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,0-100k,311359.6575,56173.5,1575000.0,14613.39,3154.21,0.0,0.0,0.0,0.0,0,21,10,21,19.08,13.0,55,4,0.83,0.0,10,0,0.83,0.0,10
3,Consumer credit,135000.0,0.0,2016-07-24,2016-12-05,000000000000,AAA14437029,4.466667,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,100k-500k,311359.6575,56173.5,1575000.0,14613.39,30223.88,0.0,0.0,0.0,0.0,0,4,10,4,19.08,13.0,55,4,0.83,0.0,10,0,0.83,0.0,10
4,Credit card,42750.0,0.0,2019-10-26,NaT,000000000000000000000000000000000000000000,AAA14437029,,17.4,174.0,4.466667,45.933333,2014,3,30,2019,10,26,0-100k,311359.6575,56173.5,1575000.0,14613.39,,0.0,0.0,0.0,0.0,0,14,10,14,19.08,13.0,55,4,0.83,0.0,10,0,0.83,0.0,10


### DElINQUENCY RATE

In [200]:
def calculate_delinquency_metrics(payment_history_string, delinquency_threshold=60):
    try:
        total_payments = len(payment_history_string) // 3
        if total_payments == 0:
            return None, None, None, None  # No payments to evaluate
        
        delinquent_payments = 0
        total_days_past_due = 0

        for i in range(0, len(payment_history_string), 3):
            payment_status = payment_history_string[i:i+3]
            days_past_due = int(payment_status)

            total_days_past_due += days_past_due

            if days_past_due > delinquency_threshold:
                delinquent_payments += 1

        delinquency_rate = delinquent_payments / total_payments
    except ZeroDivisionError:
        delinquency_rate = None
    except ValueError:  # if there's an invalid payment history format
        delinquency_rate = None
        total_payments = None
        delinquent_payments = None
        total_days_past_due = None

    delinquency_rate = delinquency_rate * 100 if delinquency_rate is not None else None
    
    return delinquency_rate, total_payments, delinquent_payments, total_days_past_due

# Apply the function to the 'payment_hist_string' column
df_acc_test[['delinquency_rate', 'total_payments', 'delinquent_payments', 'total_DPD']] = df_acc_test['payment_hist_string'].apply(
    lambda x: pd.Series(calculate_delinquency_metrics(x))
)



In [201]:
# metrics for Total Payments
df_acc_test["avg_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_payments'].mean().round(2))
df_acc_test["median_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_payments'].median())
df_acc_test["max_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_payments'].max())
df_acc_test["min_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_payments'].min())

In [202]:
df_acc_test["avg_del_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['delinquent_payments'].mean().round(2))
df_acc_test["median_del_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['delinquent_payments'].median())
df_acc_test["max_del_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['delinquent_payments'].max())
df_acc_test["min_del_payments"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['delinquent_payments'].min())

In [203]:
# metrics for Days Past Due
df_acc_test["avg_DPD"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_DPD'].mean().round(2))
df_acc_test["median_DPD"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_DPD'].median())
df_acc_test["max_DPD"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_DPD'].max())
df_acc_test["min_DPD"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_DPD'].min())

In [204]:
# metrics for Deliquency Rates
df_acc_test["avg_DR"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['delinquency_rate'].mean().round(2))
df_acc_test["median_DR"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['delinquency_rate'].median())
df_acc_test["max_DR"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['delinquency_rate'].max())
df_acc_test["min_DR"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['delinquency_rate'].min())

In [205]:
df_acc_test.shape

(210607, 63)

### MONTHLY SEGGREGATION OF PAYMENT HISTORY

In [206]:
# splitting payment history string acc to last 3 mons, 6 mons, and 9 mons. 
def split_payment_history(payment_hist_string):
    last_1_month = None
    last_3_months = None
    last_6_months = None
    last_9_months = None
    if len(payment_hist_string) >= 3:
        last_1_month = payment_hist_string[:3]
    if len(payment_hist_string) >= 3*3:
        last_3_months = payment_hist_string[:3*3]
    if len(payment_hist_string) >= 6*3:
        last_6_months = payment_hist_string[:6*3]
    if len(payment_hist_string) >= 9*3:
        last_9_months = payment_hist_string[:9*3]
    
    return pd.Series([last_1_month,last_3_months, last_6_months, last_9_months])

# Apply the function to the 'payment_hist_string' column
df_acc_test[["last_1_month",'last_3_months', 'last_6_months', 'last_9_months']] = df_acc_test['payment_hist_string'].apply(split_payment_history)


#### PAST 1 MONTH

In [207]:
#avg delinquency in latest month.
def past_month_delinquency(last_1_month):
    if last_1_month and last_1_month.isdigit():
        return int(last_1_month)
    return 0

df_acc_test['DPD_last_1_month'] = df_acc_test['last_1_month'].apply(past_month_delinquency)

In [208]:
#deliquency over the past month
df_acc_test["avg_DPD_last_1_month"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['DPD_last_1_month'].mean().round(2))
df_acc_test["median_DPD_last_1_month"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['DPD_last_1_month'].median())
df_acc_test["max_DPD_last_1_month"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['DPD_last_1_month'].max())
df_acc_test["min_DPD_last_1_month"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['DPD_last_1_month'].min())

In [209]:
df_acc_test.shape

(210607, 72)

#### PAST 3,6,9 MONTHS

In [210]:
#delinquency for last,3,6 and 9 months.

def delinquency(last_x_month):
    if pd.isna(last_x_month):
        return 0
    total_days = 0
    for i in range(0, len(last_x_month), 3):
        total_days += int(last_x_month[i:i+3])
    return total_days


df_acc_test["total_delinquency_3_mons"] = df_acc_test["last_3_months"].apply(delinquency)
df_acc_test["avg_DPD_3_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_3_mons'].mean().round(2))
df_acc_test["median_DPD_3_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_3_mons'].median())
df_acc_test["max_DPD_3_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_3_mons'].max())
df_acc_test["min_DPD_3_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_3_mons'].min())


df_acc_test["total_delinquency_6_mons"] = df_acc_test["last_6_months"].apply(delinquency)
df_acc_test["avg_DPD_6_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_6_mons'].mean().round(2))
df_acc_test["median_DPD_6_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_6_mons'].median())
df_acc_test["max_DPD_6_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_6_mons'].max())
df_acc_test["min_DPD_6_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_6_mons'].min())

df_acc_test["total_delinquency_9_mons"] = df_acc_test["last_9_months"].apply(delinquency)
df_acc_test["avg_DPD_9_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_9_mons'].mean().round(2))
df_acc_test["max_DPD_9_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_9_mons'].max())
df_acc_test["median_DPD_9_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_9_mons'].median())
df_acc_test["min_DPD_9_mons"] = df_acc_test["uid"].map(df_acc_test.groupby('uid')['total_delinquency_9_mons'].min())

In [211]:
print(df_acc_test["delinquency_rate"].describe())


print()
print("Max. Delinquency: ",df_acc_test["delinquency_rate"].max())
print("Min. Delinquency: ",df_acc_test["delinquency_rate"].min())

count    205848.000000
mean          0.491769
std           3.473184
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          94.117647
Name: delinquency_rate, dtype: float64

Max. Delinquency:  94.11764705882352
Min. Delinquency:  0.0


### CREDIT TYPES

In [212]:
pivot_mean = pd.pivot_table(df_acc_test, 
                            index='uid', 
                            columns='credit_type', 
                            values='loan_amount', 
                            aggfunc='mean', 
                            fill_value=0)

# Flatten the columns of the pivot table
pivot_mean.columns = [f'{credit}_mean_loan' for credit in pivot_mean.columns]

# Reset the index to merge with the original dataframe
pivot_mean.reset_index(inplace=True)

df_acc_test = df_acc_test.merge(pivot_mean, on='uid', how='left')

In [213]:
df_acc_test.shape

(210607, 99)

In [214]:
df_acc_test.head()

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid,duration_in_months,mean_duration_months,total_duration_months,min_duration_months,max_duration_months,earliest_acc_year,earliest_acc_month,earliest_acc_day,latest_acc_year,latest_acc_month,latest_acc_day,loan_amount_category,avg_loan_amount,median_loan_amount,max_loan_amt,min_loan_amt,avg_monthly_payment,avg_overdues,median_overdues,max_overdues,min_overdues,late_payments,on_time_payments,max_consecutive_late_payments,time_since_last_late_payment,avg_ontime_payments,median_ontime_payments,max_ontime_payments,min_ontime_payments,avg_late_payments,median_late_payments,max_late_payments,min_late_payments,avg_consecutive_late_payments,median_consecutive_late_payments,min_consecutive_late_payments,delinquency_rate,total_payments,delinquent_payments,total_DPD,avg_payments,median_payments,max_payments,min_payments,avg_del_payments,median_del_payments,max_del_payments,min_del_payments,avg_DPD,median_DPD,max_DPD,min_DPD,avg_DR,median_DR,max_DR,min_DR,last_1_month,last_3_months,last_6_months,last_9_months,DPD_last_1_month,avg_DPD_last_1_month,median_DPD_last_1_month,max_DPD_last_1_month,min_DPD_last_1_month,total_delinquency_3_mons,avg_DPD_3_mons,median_DPD_3_mons,max_DPD_3_mons,min_DPD_3_mons,total_delinquency_6_mons,avg_DPD_6_mons,median_DPD_6_mons,max_DPD_6_mons,min_DPD_6_mons,total_delinquency_9_mons,avg_DPD_9_mons,max_DPD_9_mons,median_DPD_9_mons,min_DPD_9_mons,Another type of loan_mean_loan,Car loan_mean_loan,Cash loan (non-earmarked)_mean_loan,Consumer credit_mean_loan,Credit card_mean_loan,Loan for business development_mean_loan,Loan for the purchase of equipment_mean_loan,Loan for working capital replenishment_mean_loan,Microloan_mean_loan,Mortgage_mean_loan,Real estate loan_mean_loan,Unknown type of loan_mean_loan
0,Consumer credit,31630.5,0.0,2014-03-30,2014-11-29,000000000000000000000000,AAA14437029,8.133333,17.4,174.0,4.466667,45.933333,2014.0,3.0,30.0,2019.0,10.0,26.0,0-100k,311359.6575,56173.5,1575000.0,14613.39,3889.0,0.0,0.0,0.0,0.0,0.0,8.0,10.0,8.0,19.08,13.0,55.0,4.0,0.83,0.0,10.0,0.0,0.83,0.0,10.0,0.0,8.0,0.0,0.0,19.92,13.0,55.0,4.0,0.67,0.0,8.0,0.0,125.0,0.0,1500.0,0.0,2.38,0.0,28.571429,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,469816.984286,89519.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Consumer credit,1478322.0,0.0,2017-07-03,2019-11-11,0000000000000000000000000000000000000000000000...,AAA14437029,28.7,17.4,174.0,4.466667,45.933333,2014.0,3.0,30.0,2019.0,10.0,26.0,1M-5M,311359.6575,56173.5,1575000.0,14613.39,51509.48,0.0,0.0,0.0,0.0,0.0,28.0,10.0,28.0,19.08,13.0,55.0,4.0,0.83,0.0,10.0,0.0,0.83,0.0,10.0,0.0,28.0,0.0,0.0,19.92,13.0,55.0,4.0,0.67,0.0,8.0,0.0,125.0,0.0,1500.0,0.0,2.38,0.0,28.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,469816.984286,89519.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Credit card,67500.0,0.0,2014-09-18,2016-06-21,0000000000000000000000000000000000000000000000...,AAA14437029,21.4,17.4,174.0,4.466667,45.933333,2014.0,3.0,30.0,2019.0,10.0,26.0,0-100k,311359.6575,56173.5,1575000.0,14613.39,3154.21,0.0,0.0,0.0,0.0,0.0,21.0,10.0,21.0,19.08,13.0,55.0,4.0,0.83,0.0,10.0,0.0,0.83,0.0,10.0,0.0,21.0,0.0,0.0,19.92,13.0,55.0,4.0,0.67,0.0,8.0,0.0,125.0,0.0,1500.0,0.0,2.38,0.0,28.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,469816.984286,89519.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Consumer credit,135000.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [215]:
pivot_median = pd.pivot_table(df_acc_test, 
                            index='uid', 
                            columns='credit_type', 
                            values='loan_amount', 
                            aggfunc='median', 
                            fill_value=0)

# Flatten the columns of the pivot table
pivot_median.columns = [f'{credit}_median_loan' for credit in pivot_median.columns]

# Reset the index to merge with the original dataframe
pivot_median.reset_index(inplace=True)

df_acc_test = df_acc_test.merge(pivot_median, on='uid', how='left')

In [216]:
df_acc_test.shape

(210607, 111)

In [217]:
pivot_sum = pd.pivot_table(df_acc_test, 
                            index='uid', 
                            columns='credit_type', 
                            values='loan_amount', 
                            aggfunc='sum', 
                            fill_value=0)

# Flatten the columns of the pivot table
pivot_sum.columns = [f'{credit}_total_loan' for credit in pivot_sum.columns]

# Reset the index to merge with the original dataframe
pivot_sum.reset_index(inplace=True)

df_acc_test = df_acc_test.merge(pivot_sum, on='uid', how='left')

In [219]:
df_acc_test.head()

Unnamed: 0,credit_type,loan_amount,amount_overdue,open_date,closed_date,payment_hist_string,uid,duration_in_months,mean_duration_months,total_duration_months,min_duration_months,max_duration_months,earliest_acc_year,earliest_acc_month,earliest_acc_day,latest_acc_year,latest_acc_month,latest_acc_day,loan_amount_category,avg_loan_amount,median_loan_amount,max_loan_amt,min_loan_amt,avg_monthly_payment,avg_overdues,median_overdues,max_overdues,min_overdues,late_payments,on_time_payments,max_consecutive_late_payments,time_since_last_late_payment,avg_ontime_payments,median_ontime_payments,max_ontime_payments,min_ontime_payments,avg_late_payments,median_late_payments,max_late_payments,min_late_payments,avg_consecutive_late_payments,median_consecutive_late_payments,min_consecutive_late_payments,delinquency_rate,total_payments,delinquent_payments,total_DPD,avg_payments,median_payments,max_payments,min_payments,avg_del_payments,median_del_payments,max_del_payments,min_del_payments,avg_DPD,median_DPD,max_DPD,min_DPD,avg_DR,median_DR,max_DR,min_DR,last_1_month,last_3_months,last_6_months,last_9_months,DPD_last_1_month,avg_DPD_last_1_month,median_DPD_last_1_month,max_DPD_last_1_month,min_DPD_last_1_month,total_delinquency_3_mons,avg_DPD_3_mons,median_DPD_3_mons,max_DPD_3_mons,min_DPD_3_mons,total_delinquency_6_mons,avg_DPD_6_mons,median_DPD_6_mons,max_DPD_6_mons,min_DPD_6_mons,total_delinquency_9_mons,avg_DPD_9_mons,max_DPD_9_mons,median_DPD_9_mons,min_DPD_9_mons,Another type of loan_mean_loan,Car loan_mean_loan,Cash loan (non-earmarked)_mean_loan,Consumer credit_mean_loan,Credit card_mean_loan,Loan for business development_mean_loan,Loan for the purchase of equipment_mean_loan,Loan for working capital replenishment_mean_loan,Microloan_mean_loan,Mortgage_mean_loan,Real estate loan_mean_loan,Unknown type of loan_mean_loan,Another type of loan_median_loan,Car loan_median_loan,Cash loan (non-earmarked)_median_loan,Consumer credit_median_loan,Credit card_median_loan,Loan for business development_median_loan,Loan for the purchase of equipment_median_loan,Loan for working capital replenishment_median_loan,Microloan_median_loan,Mortgage_median_loan,Real estate loan_median_loan,Unknown type of loan_median_loan,Another type of loan_total_loan,Car loan_total_loan,Cash loan (non-earmarked)_total_loan,Consumer credit_total_loan,Credit card_total_loan,Loan for business development_total_loan,Loan for the purchase of equipment_total_loan,Loan for working capital replenishment_total_loan,Microloan_total_loan,Mortgage_total_loan,Real estate loan_total_loan,Unknown type of loan_total_loan
0,Consumer credit,31630.5,0.0,2014-03-30,2014-11-29,000000000000000000000000,AAA14437029,8.133333,17.4,174.0,4.466667,45.933333,2014.0,3.0,30.0,2019.0,10.0,26.0,0-100k,311359.6575,56173.5,1575000.0,14613.39,3889.0,0.0,0.0,0.0,0.0,0.0,8.0,10.0,8.0,19.08,13.0,55.0,4.0,0.83,0.0,10.0,0.0,0.83,0.0,10.0,0.0,8.0,0.0,0.0,19.92,13.0,55.0,4.0,0.67,0.0,8.0,0.0,125.0,0.0,1500.0,0.0,2.38,0.0,28.571429,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,469816.984286,89519.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31630.5,58347.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3288718.89,447597.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Consumer credit,1478322.0,0.0,2017-07-03,2019-11-11,0000000000000000000000000000000000000000000000...,AAA14437029,28.7,17.4,174.0,4.466667,45.933333,2014.0,3.0,30.0,2019.0,10.0,26.0,1M-5M,311359.6575,56173.5,1575000.0,14613.39,51509.48,0.0,0.0,0.0,0.0,0.0,28.0,10.0,28.0,19.08,13.0,55.0,4.0,0.83,0.0,10.0,0.0,0.83,0.0,10.0,0.0,28.0,0.0,0.0,19.92,13.0,55.0,4.0,0.67,0.0,8.0,0.0,125.0,0.0,1500.0,0.0,2.38,0.0,28.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,469816.984286,89519.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31630.5,58347.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3288718.89,447597.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Credit card,67500.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [218]:
df_acc_test.shape

(210607, 123)

In [220]:
#Loan Status
df_acc_test['loan_status'] = df_acc_test['closed_date'].apply(lambda x: 'Open' if pd.isnull(x) else 'Closed')

In [221]:
most_frequent_credit_type = df_acc_test.groupby('uid')['credit_type'].agg(lambda x: x.value_counts().idxmax())

df_acc_test['most_frequent_credit_type'] = df_acc_test['uid'].map(most_frequent_credit_type)

## Feature Selection

In [222]:
columns_to_drop = [
    "credit_type",
    "loan_amount",
    "amount_overdue",
    "open_date",
    "closed_date",
    "payment_hist_string",
    "duration_in_months",
    "late_payments",
    "on_time_payments",
    "time_since_last_late_payment",
    "max_consecutive_late_payments",
    "delinquency_rate",
    "total_payments",
    "delinquent_payments",
    "total_DPD",
    "loan_amount_category",
    "median_ontime_payments",
    "median_late_payments",
    "median_consecutive_late_payments",
    "median_DR","max_DR","min_DR",
    "last_1_month",
    "last_3_months",
    "last_6_months",
    "last_9_months",
    "avg_monthly_payment",
    "total_delinquency_3_mons",
    "total_delinquency_6_mons",
    "total_delinquency_9_mons",
    "DPD_last_1_month",
    "median_DPD_last_1_month",
    "max_DPD_last_1_month",
    "min_DPD_last_1_month",
    "median_DPD_3_mons",
    "max_DPD_3_mons",
    "min_DPD_3_mons",
    "median_DPD_6_mons",
    "max_DPD_6_mons",
    "min_DPD_6_mons",
    "median_DPD_9_mons",
    "max_DPD_9_mons",
    "min_DPD_9_mons",
    "median_overdues"
    
]


df_acc_test = df_acc_test.drop(columns=columns_to_drop)


In [223]:
df_acc_test.shape

(210607, 81)

In [224]:
x = df_acc_test["uid"].value_counts()

In [None]:
x

In [226]:
df_acc_test["uid"].nunique()

39417

In [None]:
df_acc_test.duplicated(subset="uid").sum()

In [227]:
df_acc_test = df_acc_test.drop_duplicates(subset="uid",keep="first")

In [228]:
df_acc_test.shape

(39417, 81)

In [229]:
acc_prefix = 'accList_'
df_acc_test = df_acc_test.rename(columns=lambda x: acc_prefix + x if x != 'uid' else x)

In [230]:
user_dict = df_acc_test[df_acc_test["uid"]=='AAA14437029'].to_dict('records')

In [231]:
user_dict

[{'uid': 'AAA14437029',
  'accList_mean_duration_months': 17.4,
  'accList_total_duration_months': 174.0,
  'accList_min_duration_months': 4.466666666666667,
  'accList_max_duration_months': 45.93333333333333,
  'accList_earliest_acc_year': 2014,
  'accList_earliest_acc_month': 3,
  'accList_earliest_acc_day': 30,
  'accList_latest_acc_year': 2019,
  'accList_latest_acc_month': 10,
  'accList_latest_acc_day': 26,
  'accList_avg_loan_amount': 311359.6575,
  'accList_median_loan_amount': 56173.5,
  'accList_max_loan_amt': 1575000.0,
  'accList_min_loan_amt': 14613.39,
  'accList_avg_overdues': 0.0,
  'accList_max_overdues': 0.0,
  'accList_min_overdues': 0.0,
  'accList_avg_ontime_payments': 19.08,
  'accList_max_ontime_payments': 55,
  'accList_min_ontime_payments': 4,
  'accList_avg_late_payments': 0.83,
  'accList_max_late_payments': 10,
  'accList_min_late_payments': 0,
  'accList_avg_consecutive_late_payments': 0.83,
  'accList_min_consecutive_late_payments': 10,
  'accList_avg_paym

In [233]:
df_acc_test.to_csv('df_acc_test.csv', index=False)