In [109]:
import pandas as pd

df = pd.read_parquet('../data/processed/application_clean.parquet')

df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,HAS_CHILDREN,EXT_SOURCE_MEAN,EXT_SOURCE_MAX,EXT_SOURCE_MIN,EMPLOYMENT_RATIO,INCOME_PER_PERSON
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,1.0,2.007889,0.121978,0,0.161787,0.262949,0.083037,0.067329,101250.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,4.790750,0.132217,0,0.489596,0.622246,0.311267,0.070862,90000.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,2.000000,0.100000,0,0.597159,0.729567,0.505998,0.011814,33750.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,1.0,2.316167,0.219900,0,0.563905,0.650442,0.505998,0.159905,45000.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,4.222222,0.179963,0,0.454671,0.535276,0.322738,0.152418,60750.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0.0,1.0,1.617143,0.174971,0,0.454160,0.681632,0.145570,0.025303,78750.0
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0.0,1.0,3.743750,0.166687,0,0.385755,0.535276,0.115992,-17.580890,36000.0
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0.0,1.0,4.429176,0.195941,0,0.499536,0.744026,0.218859,0.529266,76500.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0.0,0.0,2.164368,0.118158,0,0.560395,0.661024,0.505998,0.400134,57000.0


In [110]:
df.select_dtypes(include=["object", "string"]).columns.tolist()



['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

In [111]:
df.isna().sum()

SK_ID_CURR            0
TARGET                0
NAME_CONTRACT_TYPE    0
CODE_GENDER           0
FLAG_OWN_CAR          0
                     ..
EXT_SOURCE_MEAN       0
EXT_SOURCE_MAX        0
EXT_SOURCE_MIN        0
EMPLOYMENT_RATIO      0
INCOME_PER_PERSON     0
Length: 109, dtype: int64

In [112]:
X = df.drop(columns=['TARGET'])
y = df['TARGET']

In [113]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
    stratify = y
)

In [114]:
num_cols = X_train.select_dtypes(exclude=['object', 'str']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'str']).columns.tolist()

print("Num cols:", num_cols[:5])
print("Cat cols:", cat_cols[:5])

Num cols: ['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY']
Cat cols: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE']


In [115]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, accuracy_score,f1_score,recall_score, precision_score

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', LogisticRegression(max_iter=7000, class_weight='balanced'))
])

pipe.fit(X_train, y_train)

y_val_pred = pipe.predict_proba(X_val)[:, 1]

y_val_class = (y_val_pred >= 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_val_pred)
accuracy = accuracy_score(y_val, y_val_class)
f1 = f1_score(y_val, y_val_class)
recall = recall_score(y_val, y_val_class)
precision = precision_score(y_val, y_val_class)


In [116]:
print("ROC-AUC:", roc_auc, "\naccuracy:", accuracy, "\nF1 :", f1, "\nrecall:", recall, "\nprecision:", precision)

ROC-AUC: 0.7478943890975198 
accuracy: 0.688584296700974 
F1 : 0.2609869969518077 
recall: 0.6811681772406848 
precision: 0.16141657121038563
