<a href="https://colab.research.google.com/github/avionerman/machine_learning_2025/blob/main/Exercise_7_Data_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Βιβλιοθήκες

In [1]:
# !pip install ydata-profiling --quiet

import pandas as pd
import numpy as np

# sklearn for preprocessing & metrics
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# For Random Forest for feature importance, etc.
from sklearn.ensemble import RandomForestClassifier

# PyTorch for GPU-based classifier
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

# ydata-profiling
# from ydata_profiling import ProfileReport

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


# Ερώτημα 1

In [32]:
data_path = "/content/bankloan.csv"

df = pd.read_csv(data_path)

# df.shape
#df.info()
# display(df.describe().T)
#categorical_columns = df.select_dtypes(include=['object']).columns
# print(categorical_columns.size)
#numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
# print(numerical_columns)
#nan_total_ratio = df.isna().mean().sort_values(ascending=False)
# display(nan_total_ratio)

#profile = ProfileReport(df, title="Bank loan dataset", explorative=True)
#profile.to_file('report.json')

# Ερώτημα 2 - Basic with Mean imputer

*   Θα αφαιρεσω features οπως τα IDs (member_id, raw_id, κλπ_ διοτι ειναι μοναδικα και δεν συμβαλουν καπως σε συμπερασματα ποιοτικα. Υπαρχει μεγαλο ρισκο για overfitting αν τα κρατησω.
*   Μεσω του nan_total_ratio παρατηρησα οτι υπαρχουν κενα σε πολλα features και θα δημιουργησει προβληματα. Θα τα αντικαταστησω με τον μεσο ορο (mean) το οποιο βεβαια θα μου μειωσει το variance ειναι ενα ρισκο που θα παρω, καθως το μειωμενο variance < nan values). Για αυτο θα δοκιμασω να εισαγω και τον kNN imputator που θα εχει καλυτερα αποτελεσματα αλλα θα ειναι πιο αργος (στο δευτερο σκελος - πραγματικα πολυ πολυ αργος!!)
*   Τελος θα μετατρεψω την ιδεα της λυσης απο πολλες κλασεις, σε δυο (binary) για να το κανω πιο απλο και ευκολα κατανοητο προς παραπανω αναλυση.



In [28]:
"""# normal imputer (Mean)

# 2.1 Μέσο, Μέγιστο, Ελάχιστο loan_amnt
min_loan = df['loan_amnt'].min()
max_loan = df['loan_amnt'].max()
mean_loan = df['loan_amnt'].mean()

print(f"\nLoan Amount - Min: {min_loan}, Max: {max_loan}, Mean: {mean_loan:.2f}")

# 2.2 Feature Selection/Cleaning
cols_to_drop = ['id', 'member_id', 'Row ID', 'title']
df_clean = df.drop(columns=cols_to_drop)

# 2.3 handle missing values combined with encoding
numerical_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df_clean.select_dtypes(include=['object']).columns

# imputation with the mean
df_clean[numerical_cols] = df_clean[numerical_cols].fillna(df_clean[numerical_cols].mean())

# we drop NaN values from categorical > replace them (due to too many NaNs)
df_clean.dropna(subset=categorical_cols, inplace=True)

def define_target(sub_grade):
    if pd.isna(sub_grade): return 0
    if sub_grade.startswith('A'): return 1
    if sub_grade in ['B1', 'B2']: return 1
    return 0

df_clean['Target'] = df_clean['sub_grade'].apply(define_target)

print("\ntarget: ", df_clean['Target'].value_counts(normalize=True))

# bins for the loan amount (looking for the prob. < 15%)
df_clean['loan_bin'] = pd.cut(df_clean['loan_amnt'], bins=range(0, 40000, 5000))
prob_per_bin = df_clean.groupby('loan_bin', observed=False)['Target'].mean()

#print("\naccepted prob per bin: ", prob_per_bin)
for x in prob_per_bin.index.tolist():
  print(prob_per_bin[x])

# range of probs
valid_bins = prob_per_bin[prob_per_bin > 0.15]
print("\nΕύρη με πιθανότητα > 15%:", valid_bins.index.tolist())"""


Loan Amount - Min: 1000, Max: 35000, Mean: 15257.97

target:  Target
0    0.708492
1    0.291508
Name: proportion, dtype: float64
0.25518882251695857
0.35874252938987283
0.29317818316682503
0.2851569588096534
0.2881468966743731
0.28300173991981237
0.1279341776236186

Εύρη με πιθανότητα > 15%: [Interval(0, 5000, closed='right'), Interval(5000, 10000, closed='right'), Interval(10000, 15000, closed='right'), Interval(15000, 20000, closed='right'), Interval(20000, 25000, closed='right'), Interval(25000, 30000, closed='right')]


# Ερώτημα 2 - Alt. with kNN imputer

# Ερώτημα 2 - Simple imputer

In [33]:
if "loan_amnt" not in df.columns:
    raise ValueError("Column 'loan_amnt' not found in dataset!")

loan_stats = df["loan_amnt"].describe()
print("\n[2.1] loan_amnt stats:")
print(loan_stats[["min", "mean", "max"]])

# ------------------------------------------------------------
# 2.2  Ποιες μεταβλητές αφαιρούμε και ποιες κρατάμε για το μοντέλο
#      (κρατάμε ένα καθαρό σύνολο features για classification)
# ------------------------------------------------------------

# Features που ΘΑ ΧΡΗΣΙΜΟΠΟΙΗΣΟΥΜΕ στο μοντέλο (numeric + categorical)
numeric_features = [
    "loan_amnt",
    "funded_amnt",
    "annual_inc",
    "dti",
    "delinq_2yrs",
    "inq_last_6mths",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "revol_util",
    "total_acc",
]

categorical_features = [
    "term",
    "home_ownership",
    "verification_status",
    "purpose",
    "initial_list_status",
    "application_type",
]

# Στήλες που ΓΕΝΙΚΑ δε θα χρησιμοποιήσουμε ως features (για αναφορά)
#columns_to_drop_example = [
#    "id", "member_id", "emp_title", "title", "loan_status",
#    "grade", "sub_grade",  # επειδή ορίζουμε από αυτά το target
#    "Unnamed: 0", "Row ID", "Unnamed: 50"
#]
#print("\n[2.2] Example columns to drop / not use as X:", columns_to_drop_example)

# ------------------------------------------------------------
# 2.3 + 2.4  Ορισμός target (good_loan) και basic preprocessing
# ------------------------------------------------------------

# Πρέπει να υπάρχει η στήλη sub_grade για να ορίσουμε το target
if "sub_grade" not in df.columns:
    raise ValueError("Column 'sub_grade' not found in dataset!")

# 2.4: Ορισμός των good grades (A1–A5, B1, B2)
good_grades = ['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2']
df["good_loan"] = df["sub_grade"].isin(good_grades).astype(int)

print("\n[2.4] Target 'good_loan' value counts:")
print(df["good_loan"].value_counts())
print("\n[2.4] Target 'good_loan' distribution (ratio):")
print(df["good_loan"].value_counts(normalize=True))

# Δημιουργούμε df_model μόνο με τα features που θέλουμε + target
all_feature_cols = numeric_features + categorical_features
missing_features = [c for c in all_feature_cols if c not in df.columns]
if missing_features:
    raise ValueError(f"Missing expected feature columns: {missing_features}")

df_model = df[all_feature_cols + ["good_loan"]].copy()

X = df_model[all_feature_cols]
y = df_model["good_loan"]

print("\n[2.3] X shape:", X.shape)
print("[2.3] y positive ratio (good_loan=1):", y.mean())

# Preprocessing pipelines
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit μόνο στο X (train/test split θα γίνει στο 3.x)
X_processed_sample = preprocess.fit_transform(X)
print("\n[2.3] Preprocessing fitted on X. Processed shape (for all data):", X_processed_sample.shape)

# ------------------------------------------------------------
# 2.5  loan_amnt ranges με P(good_loan=1) >= 15%
# ------------------------------------------------------------

# Ομαδοποίηση σε bins 5.000€
bins = list(range(0, 40001, 5000))  # 0–5k, 5–10k, ..., 35–40k
df["loan_bin_5k"] = pd.cut(df["loan_amnt"], bins=bins)

prob_by_bin = df.groupby("loan_bin_5k", observed=True)["good_loan"].mean()
count_by_bin = df.groupby("loan_bin_5k", observed=True)["good_loan"].count()

result_25 = pd.DataFrame(
    {
        "n_samples": count_by_bin,
        "good_loan_prob": prob_by_bin,
    }
).sort_index()

print("\n[2.5] P(good_loan=1) ανά εύρος loan_amnt (bins 5.000€):")
print(result_25)

# Φιλτράρουμε μόνο τα bins με prob >= 0.15 (15%)
valid_bins = result_25[result_25["good_loan_prob"] >= 0.15]
print("\n[2.5] Εύρη loan_amnt με P(good_loan=1) >= 15%:")
print(valid_bins if not valid_bins.empty else "Κανένα εύρος δεν ικανοποιεί το κριτήριο.")


[2.1] loan_amnt stats:
min      1000.00000
mean    15257.96553
max     35000.00000
Name: loan_amnt, dtype: float64

[2.4] Target 'good_loan' value counts:
good_loan
0    151709
1     61290
Name: count, dtype: int64

[2.4] Target 'good_loan' distribution (ratio):
good_loan
0    0.712252
1    0.287748
Name: proportion, dtype: float64

[2.3] X shape: (212999, 17)
[2.3] y positive ratio (good_loan=1): 0.28774782980201785

[2.3] Preprocessing fitted on X. Processed shape (for all data): (212999, 37)

[2.5] P(good_loan=1) ανά εύρος loan_amnt (bins 5.000€):
                n_samples  good_loan_prob
loan_bin_5k                              
(0, 5000]           23591        0.248018
(5000, 10000]       52221        0.352425
(10000, 15000]      48125        0.290306
(15000, 20000]      36573        0.282640
(20000, 25000]      24675        0.283728
(25000, 30000]      14418        0.279096
(30000, 35000]      13396        0.127053

[2.5] Εύρη loan_amnt με P(good_loan=1) >= 15%:
                

# Ερώτημα 3

Θα πειραματηστω με το RF γιατι μπορει και παραγει πολλα πιθανα δεντρα, αρα πολλα πιθανα αποτελεσματα και εχει τεχνικες να αγνοει τα μη ρεαλιστικα/καλα "κλαδια" του. Ετσι θεωρω πως στο τελος, ο ΜΟ των απαντησεων των θετικων κλαδιων θα μας δωσει αυτο που επιθυμουμε (δε θα εχω overfitting, δε θα επηρεαστει το αποτελεσμα τοσο πολυ απο τους outliers)

*   Αρχικα αφαιρεσα τα grade, sub_grade και int_rate. Το int_rate γτ αρχικα το βαζει η ιδια η τραπεζα και δε θελω να κλεψω τον εαυτο μου. Μετα εβγαλα τα grades γιατι αν ξερουμε ηδη τον βαθμο του καθε πελατη, τοτε ειναι biased το μοντελο.
*   Πρεπει να κανω κανονικοποιηση στα δεδομενα γιατι δε γινεται για παραδειγμα ενα feature με παραδειγμα των 100,000 να ερθει σε σωστη αναλογια αντιστοιχα με ενα feature του οποιου ο αριθμος ειναι 50, 100 ή κατι πιο μικρο. Δε θα παω σε MinMax γιατι αν εχω καποιον μεγαλο outlier δε θα εχει το καλυτερο αποτελεσμα.
*    Θα βαλω οπως σε ολες τις ασκησεις το stratify για να υποστηριξω την αδυναμη κλαση σε περιπτωση που θα εχουμε inbalanced set.
*    Τελος, θα προσπαθησω να βγαλω το precision απο το μοντελο για να δω οταν το μοντελο εγκρινει καποιον ποσο συχνα εχει δικιο (ποιοτικο metric). Και το recall για να καταλαβω ποσους καταφερε να εντωπισει απο τον συνολικο κουβα (ισως ποσοτικο metric).

In [34]:
try:
    X
    y
    preprocess
except NameError as e:
    raise RuntimeError(
        "X, y, or preprocess not found. Please run the 2.x preprocessing cell first."
    ) from e

# ------------------------------------------------------------
# 3.1  Normalization is already defined inside 'preprocess'
#      (StandardScaler for numeric, OneHotEncoder for categorical)
# ------------------------------------------------------------
# Nothing extra to do here in code – we just *use* 'preprocess' properly
# by fitting it ONLY on the training data (see below).

# ------------------------------------------------------------
# 3.2  Train/Test split (70/30, stratified)
# ------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
#    stratify=y,
    random_state=42
)

print("Train size:", X_train.shape[0])
print("Test size :", X_test.shape[0])
print("Train positive ratio:", y_train.mean())
print("Test positive ratio :", y_test.mean())

# ------------------------------------------------------------
# 3.3  Preprocess + convert to GPU tensors + MLP model
# ------------------------------------------------------------

# Fit preprocessing ONLY on train
X_train_proc = preprocess.fit_transform(X_train)
X_test_proc = preprocess.transform(X_test)

# Convert sparse matrices to dense if needed
if hasattr(X_train_proc, "toarray"):
    X_train_proc = X_train_proc.toarray()
    X_test_proc = X_test_proc.toarray()

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("\nUsing device:", device)

# Convert to tensors
X_train_tensor = torch.tensor(X_train_proc, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_proc, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

# Dataset & DataLoader
train_ds = TensorDataset(X_train_tensor, y_train_tensor)
batch_size = 1024
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

input_dim = X_train_tensor.shape[1]
print("Input dimension:", input_dim)

# Define MLP model
class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

model = MLPClassifier(input_dim).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

print("\nModel architecture:")
print(model)

# ------------------------------------------------------------
# Training loop on GPU
# ------------------------------------------------------------
n_epochs = 8  # you can tune

for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    for xb, yb in train_dl:
        yb = yb.unsqueeze(1)  # shape (batch, 1)

        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * xb.size(0)

    epoch_loss = running_loss / len(train_ds)
    print(f"Epoch {epoch+1}/{n_epochs} - loss: {epoch_loss:.4f}")

# ------------------------------------------------------------
# 3.4  Evaluation: Accuracy, Precision, Recall, F1
# ------------------------------------------------------------
model.eval()
with torch.no_grad():
    test_probs = model(X_test_tensor).cpu().numpy().ravel()

test_pred = (test_probs >= 0.5).astype(int)

acc = accuracy_score(y_test, test_pred)
prec = precision_score(y_test, test_pred, zero_division=0)
rec = recall_score(y_test, test_pred, zero_division=0)
f1 = f1_score(y_test, test_pred, zero_division=0)

print("\n=== Test Metrics ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")

Train size: 149099
Test size : 63900
Train positive ratio: 0.28835203455422237
Test positive ratio : 0.2863380281690141

Using device: cuda
Input dimension: 37

Model architecture:
MLPClassifier(
  (net): Sequential(
    (0): Linear(in_features=37, out_features=64, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Linear(in_features=32, out_features=1, bias=True)
    (8): Sigmoid()
  )
)
Epoch 1/8 - loss: 0.4895
Epoch 2/8 - loss: 0.4076
Epoch 3/8 - loss: 0.3940
Epoch 4/8 - loss: 0.3888
Epoch 5/8 - loss: 0.3863
Epoch 6/8 - loss: 0.3850
Epoch 7/8 - loss: 0.3832
Epoch 8/8 - loss: 0.3822

=== Test Metrics ===
Accuracy : 0.8231
Precision: 0.7314
Recall   : 0.6040
F1-score : 0.6617
