In [1]:
import pandas as pd
import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score, recall_score, f1_score,roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import numpy as np

In [2]:
# Read the CSV file
data = pd.read_csv("application_train_sample.csv")
data = data.drop('CODE_GENDER', axis=1)
# Replace 'Y' with 1 and 'N' with 0 in the FLAG_OWN_CAR and FLAG_OWN_REALTY columns
data['FLAG_OWN_CAR'] = data['FLAG_OWN_CAR'].replace({'Y': 1, 'N': 0})
data['FLAG_OWN_REALTY'] = data['FLAG_OWN_REALTY'].replace({'Y': 1, 'N': 0})

# Fill missing values with the median for the specified columns
data['EXT_SOURCE_1'] = data['EXT_SOURCE_1'].fillna(data['EXT_SOURCE_1'].median())
data['EXT_SOURCE_2'] = data['EXT_SOURCE_2'].fillna(data['EXT_SOURCE_2'].median())
data['EXT_SOURCE_3'] = data['EXT_SOURCE_3'].fillna(data['EXT_SOURCE_3'].median())



In [3]:
# Preprocess the data
data['FLAG_OWN_CAR'] = data['FLAG_OWN_CAR'].replace({'Y': 1, 'N': 0})
data['FLAG_OWN_REALTY'] = data['FLAG_OWN_REALTY'].replace({'Y': 1, 'N': 0})

# Fill missing values with the median for the specified columns
data['EXT_SOURCE_1'] = data['EXT_SOURCE_1'].fillna(data['EXT_SOURCE_1'].median())
data['EXT_SOURCE_2'] = data['EXT_SOURCE_2'].fillna(data['EXT_SOURCE_2'].median())
data['EXT_SOURCE_3'] = data['EXT_SOURCE_3'].fillna(data['EXT_SOURCE_3'].median())
data["NAME_EDUCATION_TYPE"] = data["NAME_EDUCATION_TYPE"].replace({
    "Lower secondary": 1,
    "Secondary / secondary special": 2,
    "Incomplete higher": 3,
    "Higher education": 4,
    "Academic degree": 5
})
# Create a dictionary to map category names to numbers
category_dict = {'State servant': 1, 'Working': 2, 'Commercial associate': 3, 'Pensioner': 4, 'Businessman': 5, 'Unemployed': 6, 'Maternity leave': 7, 'Student': 8}

# Map the category names to numbers
data['NAME_INCOME_TYPE'] = data['NAME_INCOME_TYPE'].map(category_dict)
data['NAME_INCOME_TYPE'] = data['NAME_INCOME_TYPE'].fillna(0)

# Define a dictionary to map category names to numbers
family_status_map = {
    'Married': 1, 
    'Single / not married': 2, 
    'Civil marriage': 3, 
    'Separated': 4, 
    'Widow': 5
}

# Use the map function to convert the category names to numbers
data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].map(family_status_map)
data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].fillna(0)

# Assign numeric values to categories in NAME_HOUSING_TYPE column
housing_type_mapping = {
    'House / apartment': 1,
    'With parents': 2,
    'Office apartment': 3,
    'Municipal apartment': 4,
    'Rented apartment': 5,
    'Co-op apartment': 6
}
data['NAME_HOUSING_TYPE'] = data['NAME_HOUSING_TYPE'].map(housing_type_mapping)

data["OCCUPATION_TYPE"] = data["OCCUPATION_TYPE"].replace({
    "Core staff": 1,
    "Laborers": 2,
    "High skill tech staff": 3,
    "Cleaning staff": 4,
    "Sales staff": 5,
    "Waiters/barmen staff": 6,
    "Managers": 7,
    "Private service staff": 8,
    "Accountants": 9,
    "Low-skill Laborers": 10,
    "Medicine staff": 11,
    "Drivers": 12,
    "Security staff": 13,
    "Cooking staff": 14,
    "Realty agents": 15,
    "HR staff": 16,
    "IT staff": 17,
    "Secretaries": 18,
    "nan": 0  # Replace missing values with 0
})
# Calculate median values for specified columns
median_annuity = data["AMT_ANNUITY"].median()
median_goods_price = data["AMT_GOODS_PRICE"].median()
median_own_car_age = data["OWN_CAR_AGE"].median()
median_obs_30 = data["OBS_30_CNT_SOCIAL_CIRCLE"].median()
median_def_30 = data["DEF_30_CNT_SOCIAL_CIRCLE"].median()
median_obs_60 = data["OBS_60_CNT_SOCIAL_CIRCLE"].median()
median_def_60 = data["DEF_60_CNT_SOCIAL_CIRCLE"].median()

# Replace NaN values with median values
data["AMT_ANNUITY"].fillna(median_annuity, inplace=True)
data["AMT_GOODS_PRICE"].fillna(median_goods_price, inplace=True)
data["OWN_CAR_AGE"].fillna(median_own_car_age, inplace=True)
data["OBS_30_CNT_SOCIAL_CIRCLE"].fillna(median_obs_30, inplace=True)
data["DEF_30_CNT_SOCIAL_CIRCLE"].fillna(median_def_30, inplace=True)
data["OBS_60_CNT_SOCIAL_CIRCLE"].fillna(median_obs_60, inplace=True)
data["DEF_60_CNT_SOCIAL_CIRCLE"].fillna(median_def_60, inplace=True)
data["OCCUPATION_TYPE"].fillna(0, inplace=True)
data['NAME_CONTRACT_TYPE'] = data['NAME_CONTRACT_TYPE'].replace({'Cash loans': 1, 'Revolving loans': 2})
name_type_suite_mapping = {
    "Unaccompanied": 1,
    "Family": 2,
    "Spouse, partner": 3,
    "Children": 4,
    "Other_B": 5,
    "Other_A": 6,
    "Group of people": 7
}

data['NAME_TYPE_SUITE'] = data['NAME_TYPE_SUITE'].replace(name_type_suite_mapping)
weekday_mapping = {
    "MONDAY": 1,
    "TUESDAY": 2,
    "WEDNESDAY": 3,
    "THURSDAY": 4,
    "FRIDAY": 5,
    "SATURDAY": 6,
    "SUNDAY": 7
}


data['WEEKDAY_APPR_PROCESS_START'] = data['WEEKDAY_APPR_PROCESS_START'].replace(weekday_mapping)
organization_mapping = {
    'Business Entity Type 3': 1,
    'XNA': 2,
    'Self-employed': 3,
    'Other': 4,
    'Business Entity Type 2': 1,
    'Medicine': 5,
    'Government': 6,
    'Trade: type 7': 7,
    'School': 8,
    'Construction': 9,
    'Kindergarten': 8,
    'Transport: type 4': 10,
    'Business Entity Type 1': 1,
    'Trade: type 3': 7,
    'Industry: type 3': 11,
    'Security': 12,
    'Industry: type 9': 11,
    'Agriculture': 13,
    'Housing': 14,
    'Industry: type 11': 11,
    'Postal': 15,
    'Restaurant': 16,
    'Military': 17,
    'Transport: type 2': 10,
    'Bank': 18,
    'Trade: type 2': 7,
    'Police': 19,
    'Security Ministries': 12,
    'Transport: type 3': 10,
    'Industry: type 7': 11,
    'Services': 20,
    'Industry: type 1': 11,
    'University': 8,
    'Industry: type 4': 11,
    'Electricity': 21,
    'Hotel': 22,
    'Telecom': 23,
    'Industry: type 5': 11,
    'Emergency': 24,
    'Insurance': 25,
    'Trade: type 6': 7,
    'Advertising': 26,
    'Industry: type 2': 11,
    'Realtor': 27,
    'Trade: type 1': 7,
    'Culture': 28,
    'Mobile': 29,
    'Legal Services': 30,
    'Cleaning': 31,
    'Industry: type 12': 11,
    'Transport: type 1': 10,
    'Industry: type 6': 11,
    'Industry: type 13': 11,
    'Industry: type 10': 11,
    'Religion': 32,
    'Trade: type 5': 7,
    'Trade: type 4': 7,
    'Industry: type 8': 11
}

data['ORGANIZATION_TYPE'] = data['ORGANIZATION_TYPE'].replace(organization_mapping)
data['NAME_INCOME_TYPE'] = data['NAME_INCOME_TYPE'].fillna(0)
fondkapremont_mapping = {
    'reg oper account': 1,
    'reg oper spec account': 2,
    'not specified': 3,
    'org spec account': 4
}

data['FONDKAPREMONT_MODE'] = data['FONDKAPREMONT_MODE'].replace(fondkapremont_mapping)
data['FONDKAPREMONT_MODE'] = data['FONDKAPREMONT_MODE'].fillna(0)
housetype_mapping = {
    'block of flats': 1,
    'specific housing': 2,
    'terraced house': 3
}

data['HOUSETYPE_MODE'] = data['HOUSETYPE_MODE'].replace(housetype_mapping)
data['HOUSETYPE_MODE'] = data['HOUSETYPE_MODE'].fillna(0)
wallsmaterial_mapping = {
    'Stone, brick': 1,
    'Panel': 2,
    'Block': 3,
    'Wooden': 4,
    'Mixed': 5,
    'Others': 6,
    'Monolithic': 7
}

data['WALLSMATERIAL_MODE'] = data['WALLSMATERIAL_MODE'].replace(wallsmaterial_mapping)
data['WALLSMATERIAL_MODE'] = data['WALLSMATERIAL_MODE'].fillna(0)
emergency_state_mapping = {
    'No': 1,
    'Yes':2
}

data['EMERGENCYSTATE_MODE'] = data['EMERGENCYSTATE_MODE'].replace(emergency_state_mapping)
data['EMERGENCYSTATE_MODE'] = data['EMERGENCYSTATE_MODE'].fillna(0)

all_variables = ["TOTALAREA_MODE","CNT_CHILDREN", "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE", "REGION_POPULATION_RELATIVE", "DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH", "OWN_CAR_AGE", "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "APARTMENTS_AVG", "BASEMENTAREA_AVG", 'CNT_FAM_MEMBERS',"YEARS_BEGINEXPLUATATION_AVG", "YEARS_BUILD_AVG", "COMMONAREA_AVG", "ELEVATORS_AVG", "ENTRANCES_AVG", "FLOORSMAX_AVG", "FLOORSMIN_AVG", "LANDAREA_AVG", "LIVINGAPARTMENTS_AVG", "LIVINGAREA_AVG", "NONLIVINGAPARTMENTS_AVG", "NONLIVINGAREA_AVG", "APARTMENTS_MODE", "BASEMENTAREA_MODE", "YEARS_BEGINEXPLUATATION_MODE", "YEARS_BUILD_MODE", "COMMONAREA_MODE", "ELEVATORS_MODE", "ENTRANCES_MODE", "FLOORSMAX_MODE", "FLOORSMIN_MODE", "LANDAREA_MODE", "LIVINGAPARTMENTS_MODE", "LIVINGAREA_MODE", "NONLIVINGAPARTMENTS_MODE", "NONLIVINGAREA_MODE", "APARTMENTS_MEDI", "BASEMENTAREA_MEDI", "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BUILD_MEDI", "COMMONAREA_MEDI", "ELEVATORS_MEDI", "ENTRANCES_MEDI", "FLOORSMAX_MEDI", "FLOORSMIN_MEDI", "LANDAREA_MEDI", "LIVINGAPARTMENTS_MEDI", "LIVINGAREA_MEDI", "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAREA_MEDI", "OBS_30_CNT_SOCIAL_CIRCLE", "DEF_30_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE", "DEF_60_CNT_SOCIAL_CIRCLE", "DAYS_LAST_PHONE_CHANGE", "AMT_REQ_CREDIT_BUREAU_HOUR", "AMT_REQ_CREDIT_BUREAU_DAY", "AMT_REQ_CREDIT_BUREAU_WEEK", "AMT_REQ_CREDIT_BUREAU_MON", "AMT_REQ_CREDIT_BUREAU_QRT", "AMT_REQ_CREDIT_BUREAU_YEAR",'TARGET', 'NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',  'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
for var in all_variables:
    
    data[var] = data[var].fillna(0)
    
# Split the data into train and test sets
X = data.drop('TARGET', axis=1)
y = data['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Logistic Regression

# Create a logistic regression model
model = LogisticRegression(random_state=42)
start_time = time.time()
model.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Predict on the testing data
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# Print the confusion matrix and accuracy
print('Logistic Regression Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')


Logistic Regression Results: 
Accuracy: 0.5796093122825796
Precision: 0.5828884325804243
Recall: 0.5681120747164776
F1 Score: 0.5754054054054053
AUC:  0.6108992201697684
  Elapsed Time: 0.67 seconds
Confusion Matrix:
[[4406 3047]
 [3237 4258]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# Fit the SVM model
svm_model = SVC(kernel='rbf', probability=True)

start_time = time.time()
svm_model.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Make predictions and evaluate the model
y_pred = svm_model.predict(X_test)
y_pred_proba = svm_model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy
print('SVM Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')


SVM Results: 
Accuracy: 0.5679020604763179
Precision: 0.5533031487960486
Recall: 0.7174116077384923
F1 Score: 0.6247603555452275
AUC:  0.6076063052724356
  Elapsed Time: 757.50 seconds
Confusion Matrix:
[[3112 4341]
 [2118 5377]]



In [6]:
# Create Gradient Boosting Classifier model


gb_clf = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)

# Fit the model on the training data
gb_start_time = time.time()
gb_clf.fit(X_train, y_train)
gb_elapsed_time = time.time() - gb_start_time


# Make predictions on the testing data
y_pred = gb_clf.predict(X_test)

# Evaluate the model
gb_accuracy = accuracy_score(y_test, y_pred)
gb_conf_matrix = confusion_matrix(y_test, y_pred)
gb_precision = precision_score(y_test, y_pred)
gb_recall = recall_score(y_test, y_pred)
gb_f1 = f1_score(y_test, y_pred)
gb_auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('Gradient Boosting Results: ')
print('Accuracy:', gb_accuracy)
print('Precision:', gb_precision)
print('Recall:', gb_recall)
print('F1 Score:', gb_f1)
print('AUC: ', gb_auc)
print(f'  Elapsed Time: {gb_elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{gb_conf_matrix}\n')

Gradient Boosting Results: 
Accuracy: 0.6835028097404335
Precision: 0.6848582129481006
Recall: 0.6831220813875917
F1 Score: 0.6839890454879434
AUC:  0.6076063052724356
  Elapsed Time: 19.06 seconds
Confusion Matrix:
[[5097 2356]
 [2375 5120]]



In [7]:
# Standardize the data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Create a deep learning model using Keras
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_std.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model

start_time = time.time()
model.fit(X_train_std, y_train, epochs=10, batch_size=32, verbose=1)
elapsed_time = time.time() - start_time


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('Deep Learning Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')

Metal device set to: Apple M2
Epoch 1/10


2023-05-05 04:25:27.344907: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning Results: 
Accuracy: 0.6835028097404335
Precision: 0.6848582129481006
Recall: 0.6831220813875917
F1 Score: 0.6839890454879434
AUC:  0.6076063052724356
  Elapsed Time: 46.54 seconds
Confusion Matrix:
[[5097 2356]
 [2375 5120]]



In [8]:
# Read the CSV file
df = pd.read_csv("application_train_sample.csv")
df = df.drop('CODE_GENDER', axis=1)


df['FLAG_OWN_CAR'] = df['FLAG_OWN_CAR'].replace({'Y': 1, 'N': 0})
df['FLAG_OWN_REALTY'] = df['FLAG_OWN_REALTY'].replace({'Y': 1, 'N': 0})

# Fill missing values with the median for the specified columns
df['EXT_SOURCE_1'] = df['EXT_SOURCE_1'].fillna(df['EXT_SOURCE_1'].median())
df['EXT_SOURCE_2'] = df['EXT_SOURCE_2'].fillna(df['EXT_SOURCE_2'].median())
df['EXT_SOURCE_3'] = df['EXT_SOURCE_3'].fillna(df['EXT_SOURCE_3'].median())
df["NAME_EDUCATION_TYPE"] = df["NAME_EDUCATION_TYPE"].replace({
    "Lower secondary": 1,
    "Secondary / secondary special": 2,
    "Incomplete higher": 3,
    "Higher education": 4,
    "Academic degree": 5
})
# Create a dictionary to map category names to numbers
category_dict = {'State servant': 1, 'Working': 2, 'Commercial associate': 3, 'Pensioner': 4, 'Businessman': 5, 'Unemployed': 6, 'Maternity leave': 7, 'Student': 8}

# Map the category names to numbers
df['NAME_INCOME_TYPE'] = df['NAME_INCOME_TYPE'].map(category_dict)

# Define a dictionary to map category names to numbers
family_status_map = {
    'Married': 1, 
    'Single / not married': 2, 
    'Civil marriage': 3, 
    'Separated': 4, 
    'Widow': 5
}

# Use the map function to convert the category names to numbers
df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].map(family_status_map)

# Assign numeric values to categories in NAME_HOUSING_TYPE column
housing_type_mapping = {
    'House / apartment': 1,
    'With parents': 2,
    'Office apartment': 3,
    'Municipal apartment': 4,
    'Rented apartment': 5,
    'Co-op apartment': 6
}
df['NAME_HOUSING_TYPE'] = df['NAME_HOUSING_TYPE'].map(housing_type_mapping)

df["OCCUPATION_TYPE"] = df["OCCUPATION_TYPE"].replace({
    "Core staff": 1,
    "Laborers": 2,
    "High skill tech staff": 3,
    "Cleaning staff": 4,
    "Sales staff": 5,
    "Waiters/barmen staff": 6,
    "Managers": 7,
    "Private service staff": 8,
    "Accountants": 9,
    "Low-skill Laborers": 10,
    "Medicine staff": 11,
    "Drivers": 12,
    "Security staff": 13,
    "Cooking staff": 14,
    "Realty agents": 15,
    "HR staff": 16,
    "IT staff": 17,
    "Secretaries": 18,
    "nan": 0  # Replace missing values with 0
})


# Print the first 100 rows of the preprocessed data
#print(preprocessed_data.head(100))

# Calculate median values for specified columns
median_annuity = df["AMT_ANNUITY"].median()
median_goods_price = df["AMT_GOODS_PRICE"].median()
median_own_car_age = df["OWN_CAR_AGE"].median()
median_obs_30 = df["OBS_30_CNT_SOCIAL_CIRCLE"].median()
median_def_30 = df["DEF_30_CNT_SOCIAL_CIRCLE"].median()
median_obs_60 = df["OBS_60_CNT_SOCIAL_CIRCLE"].median()
median_def_60 = df["DEF_60_CNT_SOCIAL_CIRCLE"].median()


# Replace NaN values with median values
df["AMT_ANNUITY"].fillna(median_annuity, inplace=True)
df["AMT_GOODS_PRICE"].fillna(median_goods_price, inplace=True)
df["OWN_CAR_AGE"].fillna(median_own_car_age, inplace=True)
df["OBS_30_CNT_SOCIAL_CIRCLE"].fillna(median_obs_30, inplace=True)
df["DEF_30_CNT_SOCIAL_CIRCLE"].fillna(median_def_30, inplace=True)
df["OBS_60_CNT_SOCIAL_CIRCLE"].fillna(median_obs_60, inplace=True)
df["DEF_60_CNT_SOCIAL_CIRCLE"].fillna(median_def_60, inplace=True)
df["OCCUPATION_TYPE"].fillna(0, inplace=True)
df['NAME_CONTRACT_TYPE'] = df['NAME_CONTRACT_TYPE'].replace({'Cash loans': 1, 'Revolving loans': 2})
name_type_suite_mapping = {
    "Unaccompanied": 1,
    "Family": 2,
    "Spouse, partner": 3,
    "Children": 4,
    "Other_B": 5,
    "Other_A": 6,
    "Group of people": 7
}

df['NAME_TYPE_SUITE'] = df['NAME_TYPE_SUITE'].replace(name_type_suite_mapping)
df['NAME_TYPE_SUITE'] = df['NAME_TYPE_SUITE'].fillna(0)
weekday_mapping = {
    "Monday": 1,
    "Tuesday": 2,
    "Wednesday": 3,
    "Thursday": 4,
    "Friday": 5,
    "Saturday": 6,
    "Sunday": 7
}


df['WEEKDAY_APPR_PROCESS_START'] = df['WEEKDAY_APPR_PROCESS_START'].replace(weekday_mapping)
organization_mapping = {
    'Business Entity Type 3': 1,
    'XNA': 2,
    'Self-employed': 3,
    'Other': 4,
    'Business Entity Type 2': 1,
    'Medicine': 5,
    'Government': 6,
    'Trade: type 7': 7,
    'School': 8,
    'Construction': 9,
    'Kindergarten': 8,
    'Transport: type 4': 10,
    'Business Entity Type 1': 1,
    'Trade: type 3': 7,
    'Industry: type 3': 11,
    'Security': 12,
    'Industry: type 9': 11,
    'Agriculture': 13,
    'Housing': 14,
    'Industry: type 11': 11,
    'Postal': 15,
    'Restaurant': 16,
    'Military': 17,
    'Transport: type 2': 10,
    'Bank': 18,
    'Trade: type 2': 7,
    'Police': 19,
    'Security Ministries': 12,
    'Transport: type 3': 10,
    'Industry: type 7': 11,
    'Services': 20,
    'Industry: type 1': 11,
    'University': 8,
    'Industry: type 4': 11,
    'Electricity': 21,
    'Hotel': 22,
    'Telecom': 23,
    'Industry: type 5': 11,
    'Emergency': 24,
    'Insurance': 25,
    'Trade: type 6': 7,
    'Advertising': 26,
    'Industry: type 2': 11,
    'Realtor': 27,
    'Trade: type 1': 7,
    'Culture': 28,
    'Mobile': 29,
    'Legal Services': 30,
    'Cleaning': 31,
    'Industry: type 12': 11,
    'Transport: type 1': 10,
    'Industry: type 6': 11,
    'Industry: type 13': 11,
    'Industry: type 10': 11,
    'Religion': 32,
    'Trade: type 5': 7,
    'Trade: type 4': 7,
    'Industry: type 8': 11
}

df['ORGANIZATION_TYPE'] = df['ORGANIZATION_TYPE'].replace(organization_mapping)

fondkapremont_mapping = {
    'reg oper account': 1,
    'reg oper spec account': 2,
    'not specified': 3,
    'org spec account': 4
}

df['FONDKAPREMONT_MODE'] = df['FONDKAPREMONT_MODE'].replace(fondkapremont_mapping)
df['FONDKAPREMONT_MODE'] = df['FONDKAPREMONT_MODE'].fillna(0)
housetype_mapping = {
    'block of flats': 1,
    'specific housing': 2,
    'terraced house': 3
}

df['HOUSETYPE_MODE'] = df['HOUSETYPE_MODE'].replace(housetype_mapping)
df['HOUSETYPE_MODE'] = df['HOUSETYPE_MODE'].fillna(0)
wallsmaterial_mapping = {
    'Stone, brick': 1,
    'Panel': 2,
    'Block': 3,
    'Wooden': 4,
    'Mixed': 5,
    'Others': 6,
    'Monolithic': 7
}

df['WALLSMATERIAL_MODE'] = df['WALLSMATERIAL_MODE'].replace(wallsmaterial_mapping)
df['WALLSMATERIAL_MODE'] = df['WALLSMATERIAL_MODE'].fillna(0)
emergency_state_mapping = {
    'No': 1
}

df['EMERGENCYSTATE_MODE'] = df['EMERGENCYSTATE_MODE'].replace(emergency_state_mapping)
df['EMERGENCYSTATE_MODE'] = df['EMERGENCYSTATE_MODE'].fillna(0)

continuous_variables = ["CNT_CHILDREN", "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE", "REGION_POPULATION_RELATIVE", "DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH", "OWN_CAR_AGE", "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "APARTMENTS_AVG", "BASEMENTAREA_AVG", 'CNT_FAM_MEMBERS',"YEARS_BEGINEXPLUATATION_AVG", "YEARS_BUILD_AVG", "COMMONAREA_AVG", "ELEVATORS_AVG", "ENTRANCES_AVG", "FLOORSMAX_AVG", "FLOORSMIN_AVG", "LANDAREA_AVG", "LIVINGAPARTMENTS_AVG", "LIVINGAREA_AVG", "NONLIVINGAPARTMENTS_AVG", "NONLIVINGAREA_AVG", "APARTMENTS_MODE", "BASEMENTAREA_MODE", "YEARS_BEGINEXPLUATATION_MODE", "YEARS_BUILD_MODE", "COMMONAREA_MODE", "ELEVATORS_MODE", "ENTRANCES_MODE", "FLOORSMAX_MODE", "FLOORSMIN_MODE", "LANDAREA_MODE", "LIVINGAPARTMENTS_MODE", "LIVINGAREA_MODE", "NONLIVINGAPARTMENTS_MODE", "NONLIVINGAREA_MODE", "APARTMENTS_MEDI", "BASEMENTAREA_MEDI", "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BUILD_MEDI", "COMMONAREA_MEDI", "ELEVATORS_MEDI", "ENTRANCES_MEDI", "FLOORSMAX_MEDI", "FLOORSMIN_MEDI", "LANDAREA_MEDI", "LIVINGAPARTMENTS_MEDI", "LIVINGAREA_MEDI", "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAREA_MEDI", "OBS_30_CNT_SOCIAL_CIRCLE", "DEF_30_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE", "DEF_60_CNT_SOCIAL_CIRCLE", "DAYS_LAST_PHONE_CHANGE", "AMT_REQ_CREDIT_BUREAU_HOUR", "AMT_REQ_CREDIT_BUREAU_DAY", "AMT_REQ_CREDIT_BUREAU_WEEK", "AMT_REQ_CREDIT_BUREAU_MON", "AMT_REQ_CREDIT_BUREAU_QRT", "AMT_REQ_CREDIT_BUREAU_YEAR"]

categorical_variables = [
    'TARGET', 'NAME_CONTRACT_TYPE',  'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',  'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'
]

In [9]:
cat_df = df[categorical_variables].copy()

for var in continuous_variables:
    # Calculate the quantiles and create a new categorical variable
    cat_df[var] = pd.qcut(df[var], 10, labels=False, duplicates='drop')

    # Replace NaN values with -1 to indicate missing data
    cat_df[var] = cat_df[var].fillna(-1)


In [10]:
print(cat_df.head())

   TARGET  NAME_CONTRACT_TYPE  FLAG_OWN_CAR  FLAG_OWN_REALTY  NAME_TYPE_SUITE   
0       1                   1             0                0              1.0  \
1       0                   1             0                0              1.0   
2       1                   1             1                0              1.0   
3       1                   1             0                0              1.0   
4       0                   1             0                1              2.0   

   NAME_INCOME_TYPE  NAME_EDUCATION_TYPE  NAME_FAMILY_STATUS   
0                 3                    2                   1  \
1                 3                    2                   1   
2                 2                    4                   1   
3                 2                    2                   2   
4                 2                    2                   1   

   NAME_HOUSING_TYPE  FLAG_MOBIL  ...  DEF_30_CNT_SOCIAL_CIRCLE   
0                  3           1  ...                        

In [11]:
for variable in categorical_variables:
    print(f"{variable} unique values, their counts, and missing values:")
    print(cat_df[variable].value_counts(dropna=False))
    print(f"Missing values: {cat_df[variable].isna().sum()}")
    print("\n")

TARGET unique values, their counts, and missing values:
TARGET
0    25000
1    24825
Name: count, dtype: int64
Missing values: 0


NAME_CONTRACT_TYPE unique values, their counts, and missing values:
NAME_CONTRACT_TYPE
1    45726
2     4099
Name: count, dtype: int64
Missing values: 0


FLAG_OWN_CAR unique values, their counts, and missing values:
FLAG_OWN_CAR
0    33693
1    16132
Name: count, dtype: int64
Missing values: 0


FLAG_OWN_REALTY unique values, their counts, and missing values:
FLAG_OWN_REALTY
1    34343
0    15482
Name: count, dtype: int64
Missing values: 0


NAME_TYPE_SUITE unique values, their counts, and missing values:
NAME_TYPE_SUITE
1.0    40665
2.0     6140
3.0     1816
4.0      519
5.0      317
0.0      172
6.0      147
7.0       49
Name: count, dtype: int64
Missing values: 0


NAME_INCOME_TYPE unique values, their counts, and missing values:
NAME_INCOME_TYPE
2    27870
3    11149
4     7721
1     3070
6        9
8        3
7        2
5        1
Name: count, dtype: 

In [12]:
def calculate_woe_iv(data, feature, target):
    lst = []
    for i in range(data[feature].nunique()):
        val = list(data[feature].unique())[i]
        lst.append({
            'Value': val,
            'All': data[data[feature] == val].count()[feature],
            'Good': data[(data[feature] == val) & (data[target] == 0)].count()[feature],
            'Bad': data[(data[feature] == val) & (data[target] == 1)].count()[feature]
        })

    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()

    return dset, iv


In [13]:
all_variables = ["CNT_CHILDREN", "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE", "REGION_POPULATION_RELATIVE", "DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH", "OWN_CAR_AGE", "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "APARTMENTS_AVG", "BASEMENTAREA_AVG", 'CNT_FAM_MEMBERS',"YEARS_BEGINEXPLUATATION_AVG", "YEARS_BUILD_AVG", "COMMONAREA_AVG", "ELEVATORS_AVG", "ENTRANCES_AVG", "FLOORSMAX_AVG", "FLOORSMIN_AVG", "LANDAREA_AVG", "LIVINGAPARTMENTS_AVG", "LIVINGAREA_AVG", "NONLIVINGAPARTMENTS_AVG", "NONLIVINGAREA_AVG", "APARTMENTS_MODE", "BASEMENTAREA_MODE", "YEARS_BEGINEXPLUATATION_MODE", "YEARS_BUILD_MODE", "COMMONAREA_MODE", "ELEVATORS_MODE", "ENTRANCES_MODE", "FLOORSMAX_MODE", "FLOORSMIN_MODE", "LANDAREA_MODE", "LIVINGAPARTMENTS_MODE", "LIVINGAREA_MODE", "NONLIVINGAPARTMENTS_MODE", "NONLIVINGAREA_MODE", "APARTMENTS_MEDI", "BASEMENTAREA_MEDI", "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BUILD_MEDI", "COMMONAREA_MEDI", "ELEVATORS_MEDI", "ENTRANCES_MEDI", "FLOORSMAX_MEDI", "FLOORSMIN_MEDI", "LANDAREA_MEDI", "LIVINGAPARTMENTS_MEDI", "LIVINGAREA_MEDI", "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAREA_MEDI", "OBS_30_CNT_SOCIAL_CIRCLE", "DEF_30_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE", "DEF_60_CNT_SOCIAL_CIRCLE", "DAYS_LAST_PHONE_CHANGE", "AMT_REQ_CREDIT_BUREAU_HOUR", "AMT_REQ_CREDIT_BUREAU_DAY", "AMT_REQ_CREDIT_BUREAU_WEEK", "AMT_REQ_CREDIT_BUREAU_MON", "AMT_REQ_CREDIT_BUREAU_QRT", "AMT_REQ_CREDIT_BUREAU_YEAR",'TARGET', 'NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',  'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
cat_df['NAME_INCOME_TYPE'] = cat_df['NAME_INCOME_TYPE'].apply(lambda x: x if x <5  else 5)
cat_df = cat_df.drop('FLAG_DOCUMENT_10', axis=1)

for feature in all_variables:
    print(f"Calculating WoE and IV for {feature}")
    dset, iv = calculate_woe_iv(cat_df, feature, 'TARGET')
    print(dset)
    print(f"IV score: {iv}\n")

Calculating WoE and IV for CNT_CHILDREN
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  44588  22525  22063     0.90100   0.888741  0.013699  0.000168
1      1   4477   2144   2333     0.08576   0.093978 -0.091506  0.000752
2      2    760    331    429     0.01324   0.017281 -0.266363  0.001076
IV score: 0.001996285591339281

Calculating WoE and IV for AMT_INCOME_TOTAL
   Value   All  Good   Bad  Distr_Good  Distr_Bad       WoE        IV
0      1  4937  2447  2490     0.09788   0.100302 -0.024445  0.000059
1      5  5499  2645  2854     0.10580   0.114965 -0.083075  0.000761
2      8  3069  1664  1405     0.06656   0.056596  0.162162  0.001616
3      7  7119  3621  3498     0.14484   0.140906  0.027534  0.000108
4      2  6243  3025  3218     0.12100   0.129627 -0.068874  0.000594
5      0  5523  2788  2735     0.11152   0.110171  0.012168  0.000016
6      6  5389  2665  2724     0.10660   0.109728 -0.028922  0.000090
7      3  8078  3931  4147     0

   Value   All  Good   Bad  Distr_Good  Distr_Bad       WoE        IV
0      3  4982  2258  2724     0.09032   0.109728 -0.194647  0.003778
1      2  4918  1982  2936     0.07928   0.118268 -0.399966  0.015594
2      8  4981  3213  1768     0.12852   0.071219  0.590332  0.033827
3      0  4983  1231  3752     0.04924   0.151138 -1.121487  0.114277
4      4  5045  2486  2559     0.09944   0.103082 -0.035966  0.000131
5      7  4984  3001  1983     0.12004   0.079879  0.407310  0.016358
6      6  4982  2769  2213     0.11076   0.089144  0.217113  0.004693
7      9  4983  3625  1358     0.14500   0.054703  0.974817  0.088023
8      5  4920  2606  2314     0.10424   0.093212  0.111814  0.001233
9      1  5047  1829  3218     0.07316   0.129627 -0.572015  0.032300
IV score: 0.3102139767999645

Calculating WoE and IV for EXT_SOURCE_3
   Value    All  Good   Bad  Distr_Good  Distr_Bad       WoE        IV
0      1   5031  1748  3283     0.06992   0.132246 -0.637310  0.039721
1      4   4760  2

   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE            IV
0   -1.0  30705  14601  16104     0.58404   0.648701 -0.105002  6.789543e-03
1    8.0   1912   1127    785     0.04508   0.031621  0.354606  4.772521e-03
2    0.0   3832   2054   1778     0.08216   0.071621  0.137275  1.446697e-03
3    1.0   1907    958    949     0.03832   0.038228  0.002414  2.231038e-07
4    3.0   1913   1041    872     0.04164   0.035126  0.170123  1.108202e-03
5    4.0   1917   1056    861     0.04224   0.034683  0.197124  1.489712e-03
6    2.0   1909    972    937     0.03888   0.037744  0.029648  3.367381e-05
7    7.0   1911   1081    830     0.04324   0.033434  0.257192  2.522010e-03
8    5.0   1906   1026    880     0.04104   0.035448  0.146477  8.190765e-04
9    6.0   1913   1084    829     0.04336   0.033394  0.261168  2.602868e-03
IV score: 0.021584525867302518

Calculating WoE and IV for LIVINGAPARTMENTS_AVG
    Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV

   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0   -1.0  28021  13106  14915     0.52424   0.600806 -0.136322  0.010438
1    0.0  16434   8605   7829     0.34420   0.315368  0.087484  0.002522
2    1.0   2268   1365    903     0.05460   0.036375  0.406163  0.007402
3    2.0   1424    873    551     0.03492   0.022195  0.453176  0.005766
4    3.0   1678   1051    627     0.04204   0.025257  0.509526  0.008551
IV score: 0.034680396645136016

Calculating WoE and IV for ENTRANCES_MODE
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0   -1.0  26515  12304  14211     0.49216   0.572447 -0.151117  0.012133
1    3.0   5407   2965   2442     0.11860   0.098369  0.187035  0.003784
2    1.0   4296   2204   2092     0.08816   0.084270  0.045129  0.000176
3    0.0   3136   1627   1509     0.06508   0.060785  0.068266  0.000293
4    7.0   1297    755    542     0.03020   0.021833  0.324427  0.002715
5    5.0   2956   1684   1272     0.06736   0.0512

    Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE            IV
0    -1.0  25770  11962  13808     0.47848   0.556213 -0.150538  1.170183e-02
1     7.0   2549   1388   1161     0.05552   0.046767  0.171558  1.501579e-03
2     6.0   2381   1329   1052     0.05316   0.042377  0.226709  2.444686e-03
3     9.0   2401   1463    938     0.05852   0.037784  0.437470  9.071160e-03
4     3.0   2339   1218   1121     0.04872   0.045156  0.075964  2.707301e-04
5     8.0   1980   1130    850     0.04520   0.034240  0.277712  3.043812e-03
6     0.0   2587   1289   1298     0.05156   0.052286 -0.013983  1.015133e-05
7     2.0   2475   1340   1135     0.05360   0.045720  0.159012  1.253011e-03
8     4.0   2474   1322   1152     0.05288   0.046405  0.130622  8.457963e-04
9     5.0   2325   1282   1043     0.05128   0.042014  0.199296  1.846653e-03
10    1.0   2544   1277   1267     0.05108   0.051037  0.000837  3.577529e-08
IV score: 0.03198944356571321

Calculating WoE and IV for YEARS_

   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  34250  17369  16881     0.69476   0.680000  0.021474  0.000317
1      1   4892   2434   2458     0.09736   0.099013 -0.016837  0.000028
2      4   4974   2411   2563     0.09644   0.103243 -0.068161  0.000464
3      2   3339   1664   1675     0.06656   0.067472 -0.013613  0.000012
4      3   2370   1122   1248     0.04488   0.050272 -0.113454  0.000612
IV score: 0.001432618727345162

Calculating WoE and IV for DEF_30_CNT_SOCIAL_CIRCLE
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  48469  24461  24008     0.97844    0.96709  0.011668  0.000132
1      1   1356    539    817     0.02156    0.03291 -0.422948  0.004801
IV score: 0.00493305840299642

Calculating WoE and IV for OBS_60_CNT_SOCIAL_CIRCLE
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  34394  17443  16951     0.69772   0.682820  0.021587  0.000322
1      1   4867   2419   2448 

  result = getattr(ufunc, method)(*inputs, **kwargs)


   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  15482   7640   7842      0.3056   0.315891 -0.033121  0.000341
1      1  34343  17360  16983      0.6944   0.684109  0.014931  0.000154
IV score: 0.0004945163110212953

Calculating WoE and IV for NAME_TYPE_SUITE
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0    1.0  40665  20328  20337     0.81312   0.819215 -0.007467  0.000046
1    2.0   6140   3131   3009     0.12524   0.121208  0.032720  0.000132
2    6.0    147     71     76     0.00284   0.003061 -0.075078  0.000017
3    4.0    519    278    241     0.01112   0.009708  0.135800  0.000192
4    3.0   1816    921    895     0.03684   0.036052  0.021612  0.000017
5    5.0    317    143    174     0.00572   0.007009 -0.203235  0.000262
6    0.0    172    102     70     0.00408   0.002820  0.369453  0.000466
7    7.0     49     26     23     0.00104   0.000926  0.115578  0.000013
IV score: 0.0011435333729329247

Calculating WoE

   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      1   4845   1906   2939     0.07624   0.118389 -0.440087  0.018549
1      0  44980  23094  21886     0.92376   0.881611  0.046701  0.001968
IV score: 0.020517502695266702

Calculating WoE and IV for REG_CITY_NOT_WORK_CITY
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      1  13105   5585   7520      0.2234    0.30292 -0.304506  0.024214
1      0  36720  19415  17305      0.7766    0.69708  0.108026  0.008590
IV score: 0.032804742664735216

Calculating WoE and IV for LIVE_CITY_NOT_WORK_CITY
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  39953  20631  19322     0.82524   0.778328  0.058526  0.002746
1      1   9872   4369   5503     0.17476   0.221672 -0.237784  0.011155
IV score: 0.013900392508976522

Calculating WoE and IV for ORGANIZATION_TYPE
    Value    All  Good   Bad  Distr_Good  Distr_Bad       WoE        IV
0       3   6852  2944  3908 

   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  49705  24910  24795      0.9964   0.998792 -0.002397  0.000006
1      1    120     90     30      0.0036   0.001208  1.091588  0.002611
IV score: 0.0026163096968026566

Calculating WoE and IV for FLAG_DOCUMENT_14
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  49718  24923  24795     0.99692   0.998792 -0.001876  0.000004
1      1    107     77     30     0.00308   0.001208  0.935583  0.001751
IV score: 0.0017544927311977602

Calculating WoE and IV for FLAG_DOCUMENT_15
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE            IV
0      0  49781  24967  24814     0.99868   0.999557 -0.000878  7.696291e-07
1      1     44     33     11     0.00132   0.000443  1.091588  9.572114e-04
IV score: 0.0009579809914384317

Calculating WoE and IV for FLAG_DOCUMENT_16
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  49419  24744  246

In [14]:
def woe_transform(df, target_col):
    woe_df = pd.DataFrame()
    
    for col in df.columns:
        if col != target_col:
            # Calculate the number of events and non-events for each category in the column
            temp_df = df.groupby([col, target_col]).size().unstack(target_col).reset_index().fillna(0)
            temp_df.columns = [col, 'non_event', 'event']
            
            # Calculate WoE for each category in the column
            temp_df['woe'] = np.log((temp_df['event'] / temp_df['event'].sum()) / (temp_df['non_event'] / temp_df['non_event'].sum()))
            
            # Create a dictionary to map original values to WoE values
            woe_dict = temp_df[[col, 'woe']].set_index(col).to_dict()['woe']
            
            # Replace the original values in the dataframe with their corresponding WoE values
            woe_df[col] = df[col].map(woe_dict)
    
    return woe_df

# Usage example:
# cat_df is the categorical dataframe
# 'TARGET' is the target column

woe_cat_df = woe_transform(cat_df, 'TARGET')


  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)
  woe_df[col] = df[col].map(woe_dict)


In [15]:
print(woe_cat_df.head())
def calculate_iv(data, target):
    iv_list = []
    features = data.columns.tolist()
    features.remove(target)

    for feature in features:
        lst = []
        for i in range(data[feature].nunique()):
            val = list(data[feature].unique())[i]
            lst.append({
                'Variable': feature,
                'Value': val,
                'All': data[data[feature] == val].count()[feature],
                'Good': data[(data[feature] == val) & (data[target] == 0)].count()[feature],
                'Bad': data[(data[feature] == val) & (data[target] == 1)].count()[feature]
            })

        dset = pd.DataFrame(lst)
        dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
        dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
        dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
        dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
        iv = dset['IV'].sum()
        iv_list.append({'Variable': feature, 'IV': iv})

    iv_table = pd.DataFrame(iv_list)
    return iv_table

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

iv_table = calculate_iv(cat_df, 'TARGET')
print(iv_table)

   NAME_CONTRACT_TYPE  FLAG_OWN_CAR  FLAG_OWN_REALTY  NAME_TYPE_SUITE   
0            0.038344      0.054818         0.033121         0.007467  \
1            0.038344      0.054818         0.033121         0.007467   
2            0.038344     -0.114623         0.033121         0.007467   
3            0.038344      0.054818         0.033121         0.007467   
4            0.038344      0.054818        -0.014931        -0.032720   

   NAME_INCOME_TYPE  NAME_EDUCATION_TYPE  NAME_FAMILY_STATUS   
0         -0.069971             0.118757           -0.063117  \
1         -0.069971             0.118757           -0.063117   
2          0.192557            -0.441644           -0.063117   
3          0.192557             0.118757            0.204037   
4          0.192557             0.118757           -0.063117   

   NAME_HOUSING_TYPE  FLAG_MOBIL  FLAG_EMP_PHONE  ...   
0          -0.243644         0.0        0.081845  ...  \
1           0.399897         0.0        0.081845  ...   
2    

In [16]:
# Split the data into train and test sets
X = woe_cat_df
y = cat_df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)




In [17]:
# Logistic Regression

# Create a logistic regression model
model = LogisticRegression(random_state=42)
start_time = time.time()
model.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Predict on the testing data
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# Print the confusion matrix and accuracy
print('Logistic Regression Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')


Logistic Regression Results: 
Accuracy: 0.6802916778164303
Precision: 0.6795821211319757
Recall: 0.685657104736491
F1 Score: 0.6826060968320383
AUC:  0.7443388843602251
  Elapsed Time: 0.60 seconds
Confusion Matrix:
[[5030 2423]
 [2356 5139]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Fit the SVM model

svm_model = SVC(kernel='rbf')

start_time = time.time()
svm_model.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Make predictions and evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('SVM Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')

SVM Results: 
Accuracy: 0.6800909820711801
Precision: 0.6782288792537118
Recall: 0.6887258172114743
F1 Score: 0.683437044882828
AUC:  0.7443388843602251
  Elapsed Time: 122.09 seconds
Confusion Matrix:
[[5004 2449]
 [2333 5162]]



In [19]:
# Create Gradient Boosting Classifier model


gb_clf = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)

# Fit the model on the training data
gb_start_time = time.time()
gb_clf.fit(X_train, y_train)
gb_elapsed_time = time.time() - gb_start_time


# Make predictions on the testing data
y_pred = gb_clf.predict(X_test)

# Evaluate the model
gb_accuracy = accuracy_score(y_test, y_pred)
gb_conf_matrix = confusion_matrix(y_test, y_pred)
gb_precision = precision_score(y_test, y_pred)
gb_recall = recall_score(y_test, y_pred)
gb_f1 = f1_score(y_test, y_pred)
gb_auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('Gradient Boosting Results: ')
print('Accuracy:', gb_accuracy)
print('Precision:', gb_precision)
print('Recall:', gb_recall)
print('F1 Score:', gb_f1)
print('AUC: ', gb_auc)
print(f'  Elapsed Time: {gb_elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{gb_conf_matrix}\n')

Gradient Boosting Results: 
Accuracy: 0.6844393898849345
Precision: 0.6853482786228983
Recall: 0.6852568378919279
F1 Score: 0.6853025552071519
AUC:  0.7443388843602251
  Elapsed Time: 9.35 seconds
Confusion Matrix:
[[5095 2358]
 [2359 5136]]



In [20]:
# Standardize the data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Create a deep learning model using Keras
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_std.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
start_time = time.time()
model.fit(X_train_std, y_train, epochs=10, batch_size=32, verbose=1)
elapsed_time = time.time() - start_time



# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('Deep Learning Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning Results: 
Accuracy: 0.6844393898849345
Precision: 0.6853482786228983
Recall: 0.6852568378919279
F1 Score: 0.6853025552071519
AUC:  0.7443388843602251
  Elapsed Time: 46.04 seconds
Confusion Matrix:
[[5095 2358]
 [2359 5136]]



In [21]:
medium_iv_variables = [
    'NAME_CONTRACT_TYPE',
    'FLAG_OWN_CAR',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'FLAG_EMP_PHONE',
    'FLAG_WORK_PHONE',
    'FLAG_PHONE',
    'OCCUPATION_TYPE',
    'REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY',
    'HOUR_APPR_PROCESS_START',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
    'FLAG_DOCUMENT_3',
    'FLAG_DOCUMENT_6',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'AMT_GOODS_PRICE',
    'REGION_POPULATION_RELATIVE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'DAYS_REGISTRATION',
    'DAYS_ID_PUBLISH',
    'OWN_CAR_AGE',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'APARTMENTS_AVG',
    'BASEMENTAREA_AVG',
    'CNT_FAM_MEMBERS',
    'YEARS_BEGINEXPLUATATION_AVG',
    'YEARS_BUILD_AVG',
    'COMMONAREA_AVG',
    'ELEVATORS_AVG',
    'ENTRANCES_AVG',
    'FLOORSMAX_AVG',
    'FLOORSMIN_AVG',
    'LANDAREA_AVG',
    'LIVINGAPARTMENTS_AVG',
    'LIVINGAREA_AVG',
    'NONLIVINGAPARTMENTS_AVG',
    'NONLIVINGAREA_AVG',
    'APARTMENTS_MODE',
    'BASEMENTAREA_MODE',
    'YEARS_BEGINEXPLUATATION_MODE',
    'YEARS_BUILD_MODE',
    'COMMONAREA_MODE',
    'ELEVATORS_MODE',
    'ENTRANCES_MODE',
    'FLOORSMAX_MODE',
    'FLOORSMIN_MODE',
    'LANDAREA_MODE',
    'LIVINGAPARTMENTS_MODE',
    'LIVINGAREA_MODE',
    'NONLIVINGAPARTMENTS_MODE',
    'NONLIVINGAREA_MODE',
    'APARTMENTS_MEDI',
    'BASEMENTAREA_MEDI',
    'YEARS_BEGINEXPLUATATION_MEDI',
    'YEARS_BUILD_MEDI',
    'COMMONAREA_MEDI',
    'ELEVATORS_MEDI',
    'ENTRANCES_MEDI',
    'FLOORSMAX_MEDI',
    'FLOORSMIN_MEDI',
    'LANDAREA_MEDI',
    'LIVINGAPARTMENTS_MEDI',
    'LIVINGAREA_MEDI',
    'NONLIVINGAPARTMENTS_MEDI',
    'NONLIVINGAREA_MEDI',
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
        'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'AMT_REQ_CREDIT_BUREAU_YEAR'
]
medium_data = woe_cat_df[medium_iv_variables]
# Split the data into train and test sets
X = medium_data
y = cat_df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
# Logistic Regression

# Create a logistic regression model
model = LogisticRegression(random_state=42)
start_time = time.time()
model.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Predict on the testing data
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# Print the confusion matrix and accuracy
print('Logistic Regression Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')


Logistic Regression Results: 
Accuracy: 0.6807599678886808
Precision: 0.680259499536608
Recall: 0.68552368245497
F1 Score: 0.6828814460393409
AUC:  0.7436183002094423
  Elapsed Time: 0.69 seconds
Confusion Matrix:
[[5038 2415]
 [2357 5138]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# Fit the SVM model

svm_model = SVC(kernel='rbf')

start_time = time.time()
svm_model.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Make predictions and evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('SVM Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')

SVM Results: 
Accuracy: 0.6802916778164303
Precision: 0.6780750065565172
Recall: 0.6899266177451634
F1 Score: 0.6839494742411216
AUC:  0.7436183002094423
  Elapsed Time: 79.09 seconds
Confusion Matrix:
[[4998 2455]
 [2324 5171]]



In [24]:
# Create Gradient Boosting Classifier model


gb_clf = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)

# Fit the model on the training data
gb_start_time = time.time()
gb_clf.fit(X_train, y_train)
gb_elapsed_time = time.time() - gb_start_time


# Make predictions on the testing data
y_pred = gb_clf.predict(X_test)

# Evaluate the model
gb_accuracy = accuracy_score(y_test, y_pred)
gb_conf_matrix = confusion_matrix(y_test, y_pred)
gb_precision = precision_score(y_test, y_pred)
gb_recall = recall_score(y_test, y_pred)
gb_f1 = f1_score(y_test, y_pred)
gb_auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('Gradient Boosting Results: ')
print('Accuracy:', gb_accuracy)
print('Precision:', gb_precision)
print('Recall:', gb_recall)
print('F1 Score:', gb_f1)
print('AUC: ', gb_auc)
print(f'  Elapsed Time: {gb_elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{gb_conf_matrix}\n')

Gradient Boosting Results: 
Accuracy: 0.6855097671929355
Precision: 0.6865153538050734
Recall: 0.6860573715810541
F1 Score: 0.6862862862862863
AUC:  0.7436183002094423
  Elapsed Time: 8.66 seconds
Confusion Matrix:
[[5105 2348]
 [2353 5142]]



In [25]:
# Standardize the data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Create a deep learning model using Keras
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_std.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
start_time = time.time()
model.fit(X_train_std, y_train, epochs=10, batch_size=32, verbose=1)
elapsed_time = time.time() - start_time

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('Deep Learning Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning Results: 
Accuracy: 0.6855097671929355
Precision: 0.6865153538050734
Recall: 0.6860573715810541
F1 Score: 0.6862862862862863
AUC:  0.7436183002094423
  Elapsed Time: 45.28 seconds
Confusion Matrix:
[[5105 2348]
 [2353 5142]]



In [26]:
high_iv_variables = [
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'AMT_GOODS_PRICE'
]
high_data = woe_cat_df[high_iv_variables]
# Split the data into train and test sets
X = high_data
y = cat_df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
# Logistic Regression

# Create a logistic regression model
model = LogisticRegression(random_state=42)
start_time = time.time()
model.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Predict on the testing data
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# Print the confusion matrix and accuracy
print('Logistic Regression Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')


Logistic Regression Results: 
Accuracy: 0.6677816430291678
Precision: 0.6677278153601274
Recall: 0.6716477651767845
F1 Score: 0.6696820540109086
AUC:  0.7265399885983292
  Elapsed Time: 0.02 seconds
Confusion Matrix:
[[4948 2505]
 [2461 5034]]



In [28]:
# Fit the SVM model

svm_model = SVC(kernel='rbf')

start_time = time.time()
svm_model.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Make predictions and evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('SVM Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')

SVM Results: 
Accuracy: 0.6679154401926679
Precision: 0.6719662997689904
Recall: 0.6597731821214142
F1 Score: 0.6658139221758448
AUC:  0.7265399885983292
  Elapsed Time: 20.93 seconds
Confusion Matrix:
[[5039 2414]
 [2550 4945]]



In [29]:
# Create Gradient Boosting Classifier model


gb_clf = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)

# Fit the model on the training data
gb_start_time = time.time()
gb_clf.fit(X_train, y_train)
gb_elapsed_time = time.time() - gb_start_time


# Make predictions on the testing data
y_pred = gb_clf.predict(X_test)

# Evaluate the model
gb_accuracy = accuracy_score(y_test, y_pred)
gb_conf_matrix = confusion_matrix(y_test, y_pred)
gb_precision = precision_score(y_test, y_pred)
gb_recall = recall_score(y_test, y_pred)
gb_f1 = f1_score(y_test, y_pred)
gb_auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('Gradient Boosting Results: ')
print('Accuracy:', gb_accuracy)
print('Precision:', gb_precision)
print('Recall:', gb_recall)
print('F1 Score:', gb_f1)
print('AUC: ', gb_auc)
print(f'  Elapsed Time: {gb_elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{gb_conf_matrix}\n')

Gradient Boosting Results: 
Accuracy: 0.6677816430291678
Precision: 0.670578713071631
Recall: 0.6632421614409606
F1 Score: 0.6668902602629462
AUC:  0.7265399885983292
  Elapsed Time: 0.92 seconds
Confusion Matrix:
[[5011 2442]
 [2524 4971]]



In [30]:
# Standardize the data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Create a deep learning model using Keras
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_std.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
start_time = time.time()
model.fit(X_train_std, y_train, epochs=10, batch_size=32, verbose=1)
elapsed_time = time.time() - start_time
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
    
# Print the confusion matrix and accuracy

print('Deep Learning Results: ')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')
print(f'Confusion Matrix:\n{conf_matrix}\n')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning Results: 
Accuracy: 0.6677816430291678
Precision: 0.670578713071631
Recall: 0.6632421614409606
F1 Score: 0.6668902602629462
AUC:  0.7265399885983292
  Elapsed Time: 45.72 seconds
Confusion Matrix:
[[5011 2442]
 [2524 4971]]

