In [12]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import OneHotEncoder  
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score  
from sklearn.preprocessing import LabelEncoder



In [13]:
os.chdir("source/")

In [14]:
df_app = pd.read_csv('ListP1.csv')
df_client = pd.read_csv('ListPI1.csv')
df_behav = pd.read_csv('ListB1.csv')

  df_client = pd.read_csv('ListPI1.csv')
  df_behav = pd.read_csv('ListB1.csv')


In [15]:
df = df_app.merge(df_client, on='Account ID').merge(df_behav, on='Account ID')


In [16]:
rename_dict = {
    'Customer ID_x': 'customer_id_x', 
    'Application ID_x': 'application_id_x',
    'Account ID': 'account_id',
    'Branch ID': 'branch_id',
    'Product ID': 'product_id',
    'Application date': 'application_date',
    'Date loan granted': 'date_loan_granted',
    'Loan Amount': 'loan_amount',
    'First instalment due date': 'first_instalment_due_date',
    'Interest rate': 'interest_rate',
    'Collateral type': 'collateral_type',
    'Value of collateral': 'value_of_collateral',
    'Property type/Collateral type': 'property_or_collateral_type',
    'Salary payment in bank account': 'salary_payment_in_bank_account',
    'Loan type': 'loan_type',
    '# of instalments': 'number_of_instalments',
    'Instalment amount/Min instalment amount': 'instalment_or_min_instalment_amount',
    'Customer ID_y': 'customer_id_y',
    'Application ID_y': 'application_id_y',
    'Date of birth': 'date_of_birth',
    'Gender': 'gender',
    'City of Living': 'city_of_living',
    'Region of living': 'region_of_living',
    'City of registration': 'city_of_registration',
    'Region of registration': 'region_of_registration',
    'Work phone number': 'work_phone_number',
    'Mobile phone number': 'mobile_phone_number',
    'Education': 'education',
    'Marital status': 'marital_status',
    '# Dependants': 'number_of_dependants',
    '# Children': 'number_of_children',
    '# months at current address': 'months_at_current_address',
    'Employment type': 'employment_type',
    'Employment sector': 'employment_sector',
    'Employment segment': 'employment_segment',
    '# months at job': 'months_at_job',
    'Net main income': 'net_main_income',
    'Source of main income': 'source_of_main_income',
    'Additional income': 'additional_income',
    'Source of additional income': 'source_of_additional_income',
    'Reported expenses': 'reported_expenses',
    'Months with bank': 'months_with_bank',
    'Current exposure': 'current_exposure',
    'Client type ': 'client_type',
    'Property object': 'property_object',
    'Наличие Эсхата онлайн': 'has_eschata_online',
    'Наличие пластиковых карт': 'has_plastic_cards',
    'Наличие депозита': 'has_deposit',
    'Состояние': 'state',
    'Причина отказа': 'rejection_reason',
    'Run Date': 'run_date',
    'Customer ID': 'customer_id',
    'Date account opened': 'date_account_opened',
    'Current balance': 'current_balance',
    'Date last payment': 'date_last_payment',
    'Date final payment': 'date_final_payment',
    'Due date': 'due_date',
    'Payment amount': 'payment_amount',
    'Account status': 'account_status',
    '# of payments in arrears': 'number_of_payments_in_arrears',
    'кумулятивная просрочка': 'cumulative_delinquency',
    'Amount due – Instalment': 'amount_due_instalment',
    'Principal amount': 'principal_amount',
    'Interest accrued': 'interest_accrued',
    'Outstanding balance': 'outstanding_balance',
    'Arrears amount': 'arrears_amount',
    'Current days past due': 'current_days_past_due',
    'Maximum days past due': 'maximum_days_past_due',
    'Maximum days past due lifetime': 'maximum_days_past_due_lifetime',
    'Default flag': 'default_flag',
    'Кол-во пролонгации': 'number_of_extensions',
    'Рейтинг БКИ': 'bki_rating',
    'Количество кредитов  в БКИ (заемщик)': 'number_of_loans_in_bki',
    'FTD-1': 'ftd_1',
    'FTD-2': 'ftd_2',
    'FTD-3': 'ftd_3',
    'FTD-4': 'ftd_4'
}



In [17]:
df = df.rename(columns=rename_dict)
df = df.drop(['application_id_y', 'customer_id', 'customer_id_y'], axis=1)
df = df.rename(columns={
    'customer_id_x': 'customer_id',
    'application_id_x': 'application_id'
})


In [18]:
df['application_date'] = pd.to_datetime(df['application_date'])
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')

df['age'] = (df['application_date'] - df['date_of_birth']) // pd.Timedelta(days=365.25)

df.loc[df['age'] == 17, 'age'] = 18
df.loc[df['age'] == 0, 'age'] = 27
df.loc[df['age'] == -1, 'age'] = 33
df['age'] = df['age'].fillna(df['age'].mean())

df.age = np.floor(df.age).astype(np.int64)

df.age

0         23
1         52
2         30
3         55
4         20
          ..
235693    43
235694    36
235695    44
235696    27
235697    36
Name: age, Length: 235698, dtype: int64

In [19]:
df['loan_month'] = df['application_date'].dt.year * 100 + df['application_date'].dt.month  
df['loan_month'] 

0         202110
1         202110
2         202110
3         202110
4         202110
           ...  
235693    202306
235694    202306
235695    202306
235696    202306
235697    202306
Name: loan_month, Length: 235698, dtype: int64

In [20]:
df['city_of_registration'] = (df['city_of_living'].astype(str) == df['city_of_registration'].astype(str)).astype(int) 
df['city_of_registration']

0         0
1         1
2         1
3         0
4         1
         ..
235693    1
235694    1
235695    1
235696    1
235697    1
Name: city_of_registration, Length: 235698, dtype: int32

In [21]:
df['is_collateral'] = np.where(df['collateral_type'].isna(), 0, 1)  
df['is_collateral'].head(10)


0    0
1    1
2    0
3    0
4    0
5    0
6    0
7    0
8    1
9    0
Name: is_collateral, dtype: int32

In [22]:
temp_df = (  
    df.merge(df,   
             left_on='customer_id',   
             right_on='customer_id',   
             suffixes=('', '_previous'))  
    .loc[lambda x: x['application_date'] > x['application_date_previous'],   
         ['account_id', 'customer_id', 'application_date', \
          'customer_id', 'application_date_previous', \
          'maximum_days_past_due_lifetime']]  
)  
 
temp_df.columns = ['account_id', 'customer_id', \
                   'application_date', 'cust_id_2', \
                   'app_date_2', 'maximum_days_past_due_lifetime']  

temp_df = temp_df.sort_values(by='customer_id')  


In [23]:
print("Количество строк в df:", len(df))  
print("Количество строк в temp_df:", len(temp_df))  

Количество строк в df: 235698
Количество строк в temp_df: 83105


In [24]:
display(temp_df) 

Unnamed: 0,account_id,customer_id,application_date,cust_id_2,app_date_2,maximum_days_past_due_lifetime
161174,54353388035,17948859,2023-06-21,17948859,2022-03-02,0
68011,38650448411,17949067,2022-01-26,17949067,2021-12-17,0
90963,44731003921,17950946,2022-08-17,17950946,2022-01-25,0
90962,44731003921,17950946,2022-08-17,17950946,2022-01-04,0
90959,38139613650,17950946,2022-01-25,17950946,2022-01-04,10
...,...,...,...,...,...,...
398942,54281933460,54137528550,2023-06-19,54137528550,2023-06-14,0
399664,54263156084,54202892480,2023-06-19,54202892480,2023-06-16,0
400105,54381456797,54286027956,2023-06-23,54286027956,2023-06-20,0
400826,54414824002,54325138662,2023-06-24,54325138662,2023-06-21,0


In [25]:
acc_numb_before = (  
    temp_df[temp_df['cust_id_2'].notnull()]  
    .groupby('account_id')  
    .agg(  
        cnt=('account_id', 'count'),  
        max_max_days_past_due=('maximum_days_past_due_lifetime', 'max')  
    )  
    .reset_index()  
)  

acc_numb_before.columns = ['account_id', 'previous_loans_count', \
                           'previous_loans_max_dpd']  

acc_numb_before['existing_client'] = \
    acc_numb_before['previous_loans_count'].apply(lambda x: 1 if x > 1 else 0)  


In [26]:
display(acc_numb_before)

Unnamed: 0,account_id,previous_loans_count,previous_loans_max_dpd,existing_client
0,36031268759,1,0,0
1,36060574691,1,0,0
2,36086460296,1,3,0
3,36086651513,1,3,0
4,36087793581,2,0,1
...,...,...,...,...
60587,54537845335,3,0,1
60588,54538111981,1,0,0
60589,54538122855,1,0,0
60590,54538175144,1,0,0


In [27]:
df_new = pd.merge(df, acc_numb_before, on='account_id', how='left')  

original_row_count = len(df)  
new_row_count = len(df_new)  

print(f"Оригинальное количество строк: {original_row_count}")  
print(f"Новое количество строк: {new_row_count}")  


Оригинальное количество строк: 235698
Новое количество строк: 235698


In [28]:
if original_row_count == new_row_count:  
    print("Количество строк не увеличилось")  
else:  
    print("Количество строк увеличилось.")  

df_new['previous_loans_count'] = df_new['previous_loans_count'].fillna(0)  

df = df_new  


Количество строк не увеличилось


In [29]:
display(df.head(5))

Unnamed: 0,customer_id,application_id,account_id,branch_id,product_id,application_date,date_loan_granted,loan_amount,first_instalment_due_date,interest_rate,...,ftd_1,ftd_2,ftd_3,ftd_4,age,loan_month,is_collateral,previous_loans_count,previous_loans_max_dpd,existing_client
0,25121517058,764446/КР,35619143897,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш.ДУШАНБЕ",Карзхои гуногунмаксад,2021-10-19,2021-11-02,12300.0,2021-12-02,31.0,...,0,0,0,0,23,202110,0,0.0,,
1,847140141,766801/КР,35733163635,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш.ИСТАРАВШАН",Карзхои гуногунмаксад,2021-10-25,2021-11-01,20000.0,2021-12-01,30.0,...,0,0,0,0,52,202110,1,0.0,,
2,6286580057,766319/КР,35736913632,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш.ИСТАРАВШАН",Карзхои гуногунмаксад,2021-10-25,2021-11-05,10000.0,2021-12-06,30.0,...,0,1,0,7,30,202110,0,0.0,,
3,14939826396,766446/КР,35741587858,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш. КУЛОБ",Карзхои гуногунмаксад,2021-10-25,2021-11-03,3300.0,2021-12-03,31.0,...,0,0,0,0,55,202110,0,0.0,,
4,32830136711,767392/КР,35754732217,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш.ДУШАНБЕ, Н.СИ...",Карзхои гуногунмаксад,2021-10-26,2021-11-03,5000.0,2021-12-03,31.0,...,0,0,0,0,20,202110,0,0.0,,


In [30]:
sorted(df.columns)

['account_id',
 'account_status',
 'additional_income',
 'age',
 'amount_due_instalment',
 'application_date',
 'application_id',
 'arrears_amount',
 'bki_rating',
 'branch_id',
 'city_of_living',
 'city_of_registration',
 'client_type',
 'collateral_type',
 'cumulative_delinquency',
 'current_balance',
 'current_days_past_due',
 'current_exposure',
 'customer_id',
 'date_account_opened',
 'date_final_payment',
 'date_last_payment',
 'date_loan_granted',
 'date_of_birth',
 'default_flag',
 'due_date',
 'education',
 'employment_sector',
 'employment_segment',
 'employment_type',
 'existing_client',
 'first_instalment_due_date',
 'ftd_1',
 'ftd_2',
 'ftd_3',
 'ftd_4',
 'gender',
 'has_deposit',
 'has_eschata_online',
 'has_plastic_cards',
 'instalment_or_min_instalment_amount',
 'interest_accrued',
 'interest_rate',
 'is_collateral',
 'loan_amount',
 'loan_month',
 'loan_type',
 'marital_status',
 'maximum_days_past_due',
 'maximum_days_past_due_lifetime',
 'mobile_phone_number',
 'mont

In [31]:
df['gb_90_ever'] = (df['maximum_days_past_due_lifetime'] >= 90).astype(int)  
df['gb_cum_slq90'] = (df['cumulative_delinquency'] >= 90).astype(int)  
df['gb_60_ever'] = (df['maximum_days_past_due_lifetime'] >= 60).astype(int)  


In [32]:
display(df.head(5))

Unnamed: 0,customer_id,application_id,account_id,branch_id,product_id,application_date,date_loan_granted,loan_amount,first_instalment_due_date,interest_rate,...,ftd_4,age,loan_month,is_collateral,previous_loans_count,previous_loans_max_dpd,existing_client,gb_90_ever,gb_cum_slq90,gb_60_ever
0,25121517058,764446/КР,35619143897,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш.ДУШАНБЕ",Карзхои гуногунмаксад,2021-10-19,2021-11-02,12300.0,2021-12-02,31.0,...,0,23,202110,0,0.0,,,0,0,0
1,847140141,766801/КР,35733163635,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш.ИСТАРАВШАН",Карзхои гуногунмаксад,2021-10-25,2021-11-01,20000.0,2021-12-01,30.0,...,0,52,202110,1,0.0,,,0,0,0
2,6286580057,766319/КР,35736913632,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш.ИСТАРАВШАН",Карзхои гуногунмаксад,2021-10-25,2021-11-05,10000.0,2021-12-06,30.0,...,7,30,202110,0,0.0,,,0,0,0
3,14939826396,766446/КР,35741587858,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш. КУЛОБ",Карзхои гуногунмаксад,2021-10-25,2021-11-03,3300.0,2021-12-03,31.0,...,0,55,202110,0,0.0,,,0,0,0
4,32830136711,767392/КР,35754732217,"ФИЛИАЛИ ЧСК ""БОНКИ ЭСХАТА"" ДАР Ш.ДУШАНБЕ, Н.СИ...",Карзхои гуногунмаксад,2021-10-26,2021-11-03,5000.0,2021-12-03,31.0,...,0,20,202110,0,0.0,,,0,0,0


In [33]:
# correlation_matrix = df.corr()

# plt.figure(figsize=(48, 40))

# sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5)

# plt.savefig("correlation_matrix_400.png", dpi=400, bbox_inches="tight")

# plt.title("Матрица корреляции")
# plt.show()

In [34]:
tables = '''current_balance
interest_accrued
outstanding_balance
current_days_past_due
maximum_days_past_due
maximum_days_past_due_lifetime
number_of_extensions
ftd_2
ftd_3
ftd_4
existing_client
customer_id
account_id
cumulative_delinquency'''.split('\n')

In [35]:
df_old = df.copy()
df = df.drop(tables, axis=1)
# corr_mat = df_new.corr()
# corr_mat.to_excel('corr_mat.xlsx')

In [36]:
# tables = '''date_loan_granted
# property_or_collateral_type
# loan_type
# city_of_living
# city_of_registration
# work_phone_number
# mobile_phone_number
# source_of_additional_income
# current_exposure
# state
# rejection_reason
# run_date
# date_account_opened
# current_balance
# date_last_payment
# date_final_payment
# due_date
# payment_amount
# account_status
# number_of_payments_in_arrears
# cumulative_delinquency
# amount_due_instalment
# principal_amount
# interest_accrued
# outstanding_balance
# arrears_amount
# current_days_past_due
# maximum_days_past_due
# ftd_1
# ftd_2
# ftd_3
# ftd_4
# application_id
# product_id
# branch_id
# application_date
# first_instalment_due_date
# interest_rate
# collateral_type
# value_of_collateral
# date_of_birth
# additional_income'''.split('\n')

In [37]:
# df_old = df.copy()
# df = df.drop(tables, axis=1)

In [38]:
df = df.drop(columns=['application_id', 'branch_id', 'product_id'])
df = df.drop(columns=["application_date"])
df = df.drop(columns=["date_loan_granted"])
df = df.drop(columns=["loan_amount"])
df = df.drop(columns=["first_instalment_due_date"])
df = df.drop(columns=["value_of_collateral"])




In [27]:
df = df.drop(columns=['ftd_1'])

In [28]:
df = df.drop(columns=['rejection_reason'])

In [29]:
df = df.drop(columns=['previous_loans_max_dpd'])

In [30]:
for col in df.columns:
    print(col)

interest_rate
collateral_type
property_or_collateral_type
salary_payment_in_bank_account
loan_type
number_of_instalments
instalment_or_min_instalment_amount
date_of_birth
gender
city_of_living
region_of_living
city_of_registration
region_of_registration
work_phone_number
mobile_phone_number
education
marital_status
number_of_dependants
number_of_children
months_at_current_address
employment_type
employment_sector
employment_segment
months_at_job
net_main_income
source_of_main_income
additional_income
source_of_additional_income
reported_expenses
months_with_bank
current_exposure
client_type
property_object
has_eschata_online
has_plastic_cards
has_deposit
state
run_date
date_account_opened
date_last_payment
date_final_payment
due_date
payment_amount
account_status
number_of_payments_in_arrears
amount_due_instalment
principal_amount
arrears_amount
default_flag
bki_rating
number_of_loans_in_bki
age
loan_month
is_collateral
previous_loans_count
gb_90_ever
gb_cum_slq90
gb_60_ever


In [31]:
tables = '''amount_due_instalment
default_flag
principal_amount
arrears_amount
number_of_payments_in_arrears
amount_due_instalment
due_date'''.split('\n')

df = df.drop(tables, axis=1)

In [None]:
"Customer.ID.x",
"Account.ID",
"Loan.Amount",
"IsCollateral",
"Salary.payment.in.bank.account",
"Age",
"Gender",
"Region.of.living",
"Region.of.registration",
"CityOfLivingEqRegistration",
"Education",
"Marital.status",
"Dependants",
"Months.at.current.address",
"Employment.type",
"Employment.segment",
"Months.at.job",
"Net.main.income",
"Source.of.main.income",
"Additional.income",
"Reported.expenses",
"Months.with.bank",
"Client.type",
"Property.object",
"Eskhata.Online",
"Plastic.Cards",
"Deposit",
"GB_90Ever",
"GB_CumDlq90",
"GB_60Ever",
"GB",
"BKI.Rating",
"BKI.Number.of.Loans",
"LoanMonth",
"PreviousLoans_Count",
"PreviousLoans_MaxDPD"


In [32]:
df.isnull().sum() 


interest_rate                               0
collateral_type                        209221
property_or_collateral_type            209221
salary_payment_in_bank_account              0
loan_type                                   0
number_of_instalments                       0
instalment_or_min_instalment_amount      1208
date_of_birth                               1
gender                                      0
city_of_living                              0
region_of_living                            0
city_of_registration                        0
region_of_registration                      0
work_phone_number                       10282
mobile_phone_number                    221156
education                               12191
marital_status                            275
number_of_dependants                        0
number_of_children                          0
months_at_current_address                   0
employment_type                           293
employment_sector                 

In [33]:
df.dtypes

interest_rate                                 float64
collateral_type                                object
property_or_collateral_type                    object
salary_payment_in_bank_account                 object
loan_type                                      object
number_of_instalments                           int64
instalment_or_min_instalment_amount           float64
date_of_birth                          datetime64[ns]
gender                                         object
city_of_living                                 object
region_of_living                               object
city_of_registration                            int32
region_of_registration                         object
work_phone_number                              object
mobile_phone_number                            object
education                                      object
marital_status                                 object
number_of_dependants                            int64
number_of_children          

In [34]:
def fill_na(df):  
    for column in df.columns:  
        miss_perс = df[column].isnull().mean() 
        if miss_perс > 0.05: 
            if df[column].dtype == 'object':  
                df[column].fillna('Unknown', inplace=True)  
            else:  
                df[column].fillna(-1, inplace=True)  
        else:  
            if df[column].dtype == 'object':  
                mode_value = df[column].mode()  
                if not mode_value.empty:  
                    df[column].fillna(mode_value[0], inplace=True)  
            else:  
                mean_value = df[column].mean()  
                median_value = df[column].median()  
                if not pd.isnull(mean_value) and not pd.isnull(median_value):  
                    fill_value = median_value if median_value <= mean_value else mean_value  
                    df[column].fillna(fill_value, inplace=True)  

fill_na(df)  

In [35]:
df.isnull().sum() 

interest_rate                          0
collateral_type                        0
property_or_collateral_type            0
salary_payment_in_bank_account         0
loan_type                              0
number_of_instalments                  0
instalment_or_min_instalment_amount    0
date_of_birth                          0
gender                                 0
city_of_living                         0
region_of_living                       0
city_of_registration                   0
region_of_registration                 0
work_phone_number                      0
mobile_phone_number                    0
education                              0
marital_status                         0
number_of_dependants                   0
number_of_children                     0
months_at_current_address              0
employment_type                        0
employment_sector                      0
employment_segment                     0
months_at_job                          0
net_main_income 

In [36]:
df.dtypes

interest_rate                                 float64
collateral_type                                object
property_or_collateral_type                    object
salary_payment_in_bank_account                 object
loan_type                                      object
number_of_instalments                           int64
instalment_or_min_instalment_amount           float64
date_of_birth                          datetime64[ns]
gender                                         object
city_of_living                                 object
region_of_living                               object
city_of_registration                            int32
region_of_registration                         object
work_phone_number                              object
mobile_phone_number                            object
education                                      object
marital_status                                 object
number_of_dependants                            int64
number_of_children          

In [37]:
df.columns

Index(['interest_rate', 'collateral_type', 'property_or_collateral_type',
       'salary_payment_in_bank_account', 'loan_type', 'number_of_instalments',
       'instalment_or_min_instalment_amount', 'date_of_birth', 'gender',
       'city_of_living', 'region_of_living', 'city_of_registration',
       'region_of_registration', 'work_phone_number', 'mobile_phone_number',
       'education', 'marital_status', 'number_of_dependants',
       'number_of_children', 'months_at_current_address', 'employment_type',
       'employment_sector', 'employment_segment', 'months_at_job',
       'net_main_income', 'source_of_main_income', 'additional_income',
       'source_of_additional_income', 'reported_expenses', 'months_with_bank',
       'current_exposure', 'client_type', 'property_object',
       'has_eschata_online', 'has_plastic_cards', 'has_deposit', 'state',
       'run_date', 'date_account_opened', 'date_last_payment',
       'date_final_payment', 'payment_amount', 'account_status', 'bki_rat

In [53]:
X = df.drop(columns=['gb_60_ever', 'gb_90_ever', 'gb_cum_slq90'])  
y = df['gb_60_ever']


In [62]:
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

for col in categorical_columns:
    X[col] = X[col].astype(str)
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

display(X.head())


Unnamed: 0,interest_rate,collateral_type,property_or_collateral_type,salary_payment_in_bank_account,loan_type,number_of_instalments,instalment_or_min_instalment_amount,gender,city_of_living,region_of_living,...,bki_rating,number_of_loans_in_bki,age,loan_month,is_collateral,previous_loans_count,date_of_birth_year,date_of_birth_month,date_of_birth_day,date_of_birth_weekday
0,31.0,97,97,1,6,18,863.0,0,259,3,...,10,0,23,202110,0,0.0,1998,1,7,2
1,30.0,58,58,1,6,24,1118.0,1,125,1,...,10,5,52,202110,1,0.0,1969,5,16,4
2,30.0,97,97,1,6,24,559.0,1,125,1,...,10,2,30,202110,0,0.0,1991,3,2,5
3,31.0,97,97,1,6,12,323.0,1,196,2,...,10,0,55,202110,0,0.0,1966,10,23,6
4,31.0,97,97,1,6,6,910.0,1,265,4,...,10,1,20,202110,0,0.0,2001,6,7,3


In [63]:
datetime_columns = X.select_dtypes(include=['datetime64']).columns

for col in datetime_columns:
    X[f"{col}_year"] = X[col].dt.year
    X[f"{col}_month"] = X[col].dt.month
    X[f"{col}_day"] = X[col].dt.day
    X[f"{col}_weekday"] = X[col].dt.weekday
    X.drop(col, axis=1, inplace=True)


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)  


model = LogisticRegression(max_iter=1000, class_weight='balance')  

In [65]:
model.fit(X_train, y_train)  

InvalidParameterError: The 'class_weight' parameter of LogisticRegression must be an instance of 'dict', a str among {'balanced'} or None. Got 'balance' instead.

In [66]:
y_pred = model.predict(X_test)  
y_proba = model.predict_proba(X_test)[:, 1]  


NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
print("Confusion Matrix:")  
print(confusion_matrix(y_test, y_pred))  
print("\nClassification Report:")  
print(classification_report(y_test, y_pred))  
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

In [67]:
X_train.columns

Index(['interest_rate', 'collateral_type', 'property_or_collateral_type',
       'salary_payment_in_bank_account', 'loan_type', 'number_of_instalments',
       'instalment_or_min_instalment_amount', 'gender', 'city_of_living',
       'region_of_living', 'city_of_registration', 'region_of_registration',
       'work_phone_number', 'mobile_phone_number', 'education',
       'marital_status', 'number_of_dependants', 'number_of_children',
       'months_at_current_address', 'employment_type', 'employment_sector',
       'employment_segment', 'months_at_job', 'net_main_income',
       'source_of_main_income', 'additional_income',
       'source_of_additional_income', 'reported_expenses', 'months_with_bank',
       'current_exposure', 'client_type', 'property_object',
       'has_eschata_online', 'has_plastic_cards', 'has_deposit', 'state',
       'run_date', 'date_account_opened', 'date_last_payment',
       'date_final_payment', 'payment_amount', 'account_status', 'bki_rating',
       'num