In [2]:
# Parameters
DATA_DIR = './data' 
OUTPUT_DIR = './midsem_output'
NROWS = 20000 # 20000
MODEL = 'lgb' 
USE_ONEHOT = True

In [3]:
import os
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report, confusion_matrix, roc_curve


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib

In [5]:
HAS_LGB = False

In [7]:
# Utility functions


def optimize_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    for c in df.select_dtypes(include=['int64']).columns:
        df[c] = pd.to_numeric(df[c], downcast='integer')
    for c in df.select_dtypes(include=['float64']).columns:
        df[c] = pd.to_numeric(df[c], downcast='float')
    for c in df.select_dtypes(include=['object']).columns:
        if df[c].nunique(dropna=False) < 50:
            df[c] = df[c].astype('category')
    return df




def save_fig(fig, path: Path):
    fig.tight_layout()
    fig.savefig(path, bbox_inches='tight')
    plt.close(fig)




def pr_auc_score(y_true, y_scores):
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    return auc(recall, precision)

In [10]:
DATA_DIR = Path(DATA_DIR)
OUTPUT_DIR = Path(OUTPUT_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


train_path = DATA_DIR / 'application_train.csv'



app = pd.read_csv(train_path, nrows=NROWS)
app = optimize_dtypes(app)
print('shape: ', app.shape)


print('TARGET distribution:')
print(app['TARGET'].value_counts(normalize=True))

shape:  (20000, 122)
TARGET distribution:
TARGET
0    0.9211
1    0.0789
Name: proportion, dtype: float64


In [12]:
if 'DAYS_EMPLOYED' in app.columns:
    app.loc[app['DAYS_EMPLOYED'] > 200000, 'DAYS_EMPLOYED'] = np.nan


# Target distribution plot
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x='TARGET', data=app, ax=ax)
ax.set_title('Target distribution')
save_fig(fig, OUTPUT_DIR / 'target_distribution.png')


# Missingness (top 40 cols)
missing = app.isna().mean().sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(x=missing.head(40).values, y=missing.head(40).index, ax=ax)
ax.set_xlabel('Fraction missing')
ax.set_title('Top 40 columns by missing fraction')
save_fig(fig, OUTPUT_DIR / 'missing_top40.png')

In [14]:
numeric_samples = [c for c in ['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','AMT_CREDIT'] if c in app.columns]
if numeric_samples:
    fig, axes = plt.subplots(nrows=len(numeric_samples), figsize=(8,3*len(numeric_samples)))
    if len(numeric_samples) == 1:
        axes = [axes]
    for ax, col in zip(axes, numeric_samples):
        sns.histplot(app[col].dropna(), ax=ax, kde=True)
        ax.set_title(col)
    save_fig(fig, OUTPUT_DIR / 'numeric_samples.png')

In [16]:
if 'NAME_CONTRACT_TYPE' in app.columns:
    grp = app.groupby('NAME_CONTRACT_TYPE')['TARGET'].mean().sort_values()
    fig, ax = plt.subplots(figsize=(6,3))
    sns.barplot(x=grp.values, y=grp.index, ax=ax)
    ax.set_xlabel('Default rate')
    ax.set_title('Default rate by NAME_CONTRACT_TYPE')
    save_fig(fig, OUTPUT_DIR / 'target_rate_by_contract_type.png')

  grp = app.groupby('NAME_CONTRACT_TYPE')['TARGET'].mean().sort_values()


In [17]:
app.describe().to_csv(OUTPUT_DIR / 'app_describe.csv')
app.isna().mean().sort_values(ascending=False).to_csv(OUTPUT_DIR / 'missing_fraction.csv')

In [19]:
X = app.drop(columns=['TARGET','SK_ID_CURR'], errors='ignore')
y = app['TARGET']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print('Train/Valid sizes', X_train.shape, X_valid.shape)


# Build preprocessor
num_cols = X_train.select_dtypes(include=['int','float']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['category','object']).columns.tolist()
print('Num cols', len(num_cols), 'Cat cols', len(cat_cols))


num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
if USE_ONEHOT and cat_cols:
    cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
else:
    cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])


transformers = []
if num_cols:
    transformers.append(('num', num_pipe, num_cols))
if cat_cols:
    transformers.append(('cat', cat_pipe, cat_cols))


preproc = ColumnTransformer(transformers=transformers, remainder='drop')

Train/Valid sizes (16000, 120) (4000, 120)
Num cols 66 Cat cols 16


In [23]:
def get_model(name='lgb'):
    name = name.lower()
    if name in ('lgb','lightgbm') and HAS_LGB:
        return lgb.LGBMClassifier(n_estimators=1000, n_jobs=-1, random_state=42)
    elif name in ('rf','randomforest'):
        return RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
    else:
        return LogisticRegression(class_weight='balanced', max_iter=1000, solver='lbfgs')


clf = get_model(MODEL)
imb_pipeline = ImbPipeline([('preproc', preproc), ('smote', SMOTE(random_state=42)), ('clf', clf)])


print('why you no run, you stupid?')
imb_pipeline.fit(X_train, y_train)

why you no run, you stupid?


0,1,2
,steps,"[('preproc', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [24]:
proba = imb_pipeline.predict_proba(X_valid)[:,1]
pred = imb_pipeline.predict(X_valid)
roc = roc_auc_score(y_valid, proba)
pr = pr_auc_score(y_valid, proba)


print('Validation ROC AUC:', roc)
print('Validation PR AUC:', pr)
print('\nClassification report:\n', classification_report(y_valid, pred))
print('\nConfusion matrix:\n', confusion_matrix(y_valid, pred))

Validation ROC AUC: 0.7184154193982876
Validation PR AUC: 0.18188522469663643

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.70      0.81      3684
           1       0.15      0.62      0.24       316

    accuracy                           0.69      4000
   macro avg       0.55      0.66      0.52      4000
weighted avg       0.89      0.69      0.76      4000


Confusion matrix:
 [[2566 1118]
 [ 119  197]]


In [25]:
# ROC curve
fpr, tpr, _ = roc_curve(y_valid, proba)
fig, ax = plt.subplots(figsize=(6,6))
ax.plot(fpr, tpr, label=f'AUC={roc:.4f}')
ax.plot([0,1],[0,1], linestyle='--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (validation)')
ax.legend()
save_fig(fig, OUTPUT_DIR / 'roc_curve.png')


# PR curve
precision, recall, _ = precision_recall_curve(y_valid, proba)
fig, ax = plt.subplots(figsize=(6,6))
ax.plot(recall, precision, label=f'PR AUC={pr:.4f}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve (validation)')
ax.legend()
save_fig(fig, OUTPUT_DIR / 'pr_curve.png')


# Confusion matrix heatmap
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(confusion_matrix(y_valid, pred), annot=True, fmt='d', ax=ax, cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
save_fig(fig, OUTPUT_DIR / 'confusion_matrix.png')

In [27]:
metrics = {'roc_auc': float(roc), 'pr_auc': float(pr), 'confusion_matrix': confusion_matrix(y_valid, pred).tolist()}
with open(OUTPUT_DIR / 'metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)


joblib.dump(imb_pipeline, OUTPUT_DIR / 'imb_pipeline.joblib')

['midsem_output/imb_pipeline.joblib']