In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Data Loading

In [None]:
# Mount Google Drive at /content/drive so the CSV files stored on Drive
# can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Folder on the mounted Drive that holds every CSV split used in this notebook
output_dir = '/content/drive/MyDrive/Data/ML_Project_Data'


def _read_split(filename):
    """Read the CSV called ``filename`` from ``output_dir`` into a DataFrame."""
    return pd.read_csv(os.path.join(output_dir, filename))


# Cross-validation features / labels
X_CV, y_CV = _read_split('X_CV.csv'), _read_split('y_CV.csv')

# "_W_outliers" variant of the cross-validation features / labels
X_CV_W_outliers, y_CV_W_outliers = (
    _read_split('X_CV_W_outliers.csv'),
    _read_split('y_CV_W_outliers.csv'),
)

# Training features / labels
X_train, y_train = _read_split('X_train.csv'), _read_split('y_train.csv')

# Currently disabled: "_W_outliers" variant of the training split
# X_train_W_outliers = _read_split('X_train_W_outliers.csv')
# y_train_W_outliers = _read_split('y_train_W_outliers.csv')

# Validation features / labels
X_val, y_val = _read_split('X_val.csv'), _read_split('y_val.csv')

# Test features only (no label file is loaded for the test set)
X_test = _read_split('X_test.csv')


In [None]:
# Print the shape of each loaded split once, in load order.
loaded_splits = {
    "X_CV": X_CV,
    "y_CV": y_CV,
    "X_train": X_train,
    "y_train": y_train,
    "X_val": X_val,
    "y_val": y_val,
    "X_test": X_test,
}
for split_name, frame in loaded_splits.items():
    print(f"{split_name} shape:", frame.shape)

# Features and labels must stay row-aligned; fail early here instead of
# deep inside a long-running fit.
for split_name, features, labels in (
    ("CV", X_CV, y_CV),
    ("train", X_train, y_train),
    ("val", X_val, y_val),
):
    assert len(features) == len(labels), f"{split_name}: X and y row counts differ"

X_CV shape: (3116945, 13)
y_CV shape: (3116945, 1)
X_train shape: (2334900, 13)
y_train shape: (2334900, 1)
X_train shape: (2334900, 13)
X_val shape: (623389, 13)
y_train shape: (2334900, 1)
y_val shape: (623389, 1)
X_test shape: (2077964, 14)


# Logistic Regression

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    """Print classification metrics for ``model`` on the train and validation data.

    A banner containing ``name`` is printed first. Then, for the training
    split followed by the validation split, the model's predictions are
    computed and the confusion matrix, classification report, accuracy and
    Matthews correlation coefficient are printed.

    Parameters
    ----------
    name : label shown in the banner line.
    model : fitted estimator exposing ``predict``.
    X_train, y_train : features and true labels of the training split.
    X_val, y_val : features and true labels of the validation split.

    Returns
    -------
    None. All results are written to stdout.
    """
    print(f"\n=== {name} ===")

    splits = (
        ("Train", X_train, y_train),
        ("Validation", X_val, y_val),
    )
    for split_label, features, y_true in splits:
        # Predictions are computed before the section header is printed
        y_pred = model.predict(features)
        print(f"---- {split_label} Metrics ----")
        print(confusion_matrix(y_true, y_pred))
        print(classification_report(y_true, y_pred))
        print(f"{split_label} Accuracy: {accuracy_score(y_true, y_pred):.4f}")
        print(f"{split_label} MCC: {matthews_corrcoef(y_true, y_pred):.4f}")

In [None]:
# Standardize the features: the scaler is fitted on the training split only and
# the same statistics are then applied to the validation split.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# max_iter=1000 because of the big data size; class_weight=None because the
# class balance differs by less than 10%.
logreg_model = LogisticRegression(random_state=42, max_iter=1000, n_jobs=-1, C=1, class_weight=None)

# y_train is a single-column DataFrame; flatten it to 1-D so fit() does not
# raise a DataConversionWarning ("y = column_or_1d(y, warn=True)").
logreg_model.fit(X_train_scaled, y_train.to_numpy().ravel())
evaluate_model("Logistic Regression", logreg_model, X_train_scaled, y_train, X_val_scaled, y_val)

  y = column_or_1d(y, warn=True)



=== Logistic Regression ===
---- Train Metrics ----
[[754808 316765]
 [319804 943523]]
              precision    recall  f1-score   support

           0       0.70      0.70      0.70   1071573
           1       0.75      0.75      0.75   1263327

    accuracy                           0.73   2334900
   macro avg       0.73      0.73      0.73   2334900
weighted avg       0.73      0.73      0.73   2334900

Train Accuracy: 0.7274
Train MCC: 0.4512
---- Validation Metrics ----
[[202443  79867]
 [ 98546 242533]]
              precision    recall  f1-score   support

           0       0.67      0.72      0.69    282310
           1       0.75      0.71      0.73    341079

    accuracy                           0.71    623389
   macro avg       0.71      0.71      0.71    623389
weighted avg       0.72      0.71      0.71    623389

Validation Accuracy: 0.7138
Validation MCC: 0.4265


In [None]:
##### Low variance but high bias (train and validation scores are close, but both are low). C (the inverse of regularization strength) was left at its default of 1, which is moderate, so next we try a higher value (weaker regularization).

In [None]:
# Same setup as the baseline logistic regression but with C=10 (weaker regularization).
# max_iter=1000 because of the big data size; class_weight=None because the
# class balance differs by less than 10%.
# NOTE(review): 'lbfgs' is also scikit-learn's default solver, so only C changes
# relative to the baseline run — confirm whether a different solver was intended.
logreg_model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    n_jobs=-1,
    solver='lbfgs',
    C=10,
    class_weight=None,
)

# Flatten the single-column label DataFrame to 1-D to avoid DataConversionWarning.
logreg_model.fit(X_train_scaled, y_train.to_numpy().ravel())
evaluate_model("Logistic Regression (C=10 , new solver algorithm)", logreg_model, X_train_scaled, y_train, X_val_scaled, y_val)

  y = column_or_1d(y, warn=True)



=== Logistic Regression (C=10 , new solver algorithm) ===
---- Train Metrics ----
[[754808 316765]
 [319804 943523]]
              precision    recall  f1-score   support

           0       0.70      0.70      0.70   1071573
           1       0.75      0.75      0.75   1263327

    accuracy                           0.73   2334900
   macro avg       0.73      0.73      0.73   2334900
weighted avg       0.73      0.73      0.73   2334900

Train Accuracy: 0.7274
Train MCC: 0.4512
---- Validation Metrics ----
[[202442  79868]
 [ 98546 242533]]
              precision    recall  f1-score   support

           0       0.67      0.72      0.69    282310
           1       0.75      0.71      0.73    341079

    accuracy                           0.71    623389
   macro avg       0.71      0.71      0.71    623389
weighted avg       0.72      0.71      0.71    623389

Validation Accuracy: 0.7138
Validation MCC: 0.4265


In [None]:
# no Improvement  -> try polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [None]:
# Degree-2 polynomial expansion -> standardization -> logistic regression.
# The pipeline takes the unscaled features because scaling happens inside it,
# after the polynomial terms have been generated.
# NOTE(review): the recorded run stopped at the max_iter limit without converging
# ("TOTAL NO. OF ITERATIONS REACHED LIMIT"); consider a larger max_iter if the
# extra training time is acceptable.
poly_logreg_pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000, random_state=42)),
])

# Flatten the single-column label DataFrame to 1-D to avoid DataConversionWarning.
poly_logreg_pipeline.fit(X_train, y_train.to_numpy().ravel())
evaluate_model("Logistic Regression with Polynomial Features", poly_logreg_pipeline, X_train, y_train, X_val, y_val)


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Logistic Regression with Polynomial Features ===
---- Train Metrics ----
[[ 846059  225514]
 [ 202853 1060474]]
              precision    recall  f1-score   support

           0       0.81      0.79      0.80   1071573
           1       0.82      0.84      0.83   1263327

    accuracy                           0.82   2334900
   macro avg       0.82      0.81      0.81   2334900
weighted avg       0.82      0.82      0.82   2334900

Train Accuracy: 0.8165
Train MCC: 0.6301
---- Validation Metrics ----
[[214695  67615]
 [ 55411 285668]]
              precision    recall  f1-score   support

           0       0.79      0.76      0.78    282310
           1       0.81      0.84      0.82    341079

    accuracy                           0.80    623389
   macro avg       0.80      0.80      0.80    623389
weighted avg       0.80      0.80      0.80    623389

Validation Accuracy: 0.8026
Validation MCC: 0.6007


In [None]:
# With higher polynomial degrees it may improve further, but it takes a lot of time to train

# SVM

In [None]:
from sklearn.svm import SVC


In [None]:
# SVC(kernel='linear') has a fit time that grows at least quadratically with the
# number of samples, which is impractical on the full training set. LinearSVC
# trains a linear SVM with a solver that scales to very large sample counts.
# It is not numerically identical to SVC(kernel='linear'): by default it uses
# the squared hinge loss and also regularizes the intercept.
from sklearn.svm import LinearSVC

# dual=False solves the primal problem, the recommended setting when there are
# far more samples than features.
svm_model = LinearSVC(C=1.0, class_weight=None, dual=False)

# Flatten the single-column label DataFrame to 1-D to avoid DataConversionWarning.
svm_model.fit(X_train_scaled, y_train.to_numpy().ravel())
evaluate_model("SVM", svm_model, X_train_scaled, y_train, X_val_scaled, y_val)

  y = column_or_1d(y, warn=True)


In [None]:
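# SVC training on the full training set takes a really long time -> apply it only to a sample of the data (a stratified sample, which keeps the original class ratio rather than balancing the classes)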

In [None]:
# Rebuild the standardized feature matrices for the SVM experiments:
# statistics are learned from the training split and applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled = scaler.transform(X_train), scaler.transform(X_val)

In [None]:
# Draw a 10,000-row subsample of the scaled training data, stratified on the
# label; the remaining rows returned by the split are discarded.
X_small, _, y_small, _ = train_test_split(
    X_train_scaled,
    y_train,
    train_size=10_000,
    stratify=y_train,
    random_state=42,
)

In [None]:
# Linear-kernel SVM trained on the subsample (X_small, y_small) and evaluated
# on the subsample itself plus the full scaled validation set.
svm_model = SVC(kernel='linear')

# y_small is a single-column DataFrame; flatten it to 1-D so fit() does not
# raise a DataConversionWarning.
svm_model.fit(X_small, y_small.to_numpy().ravel())

evaluate_model("SVM (Balanced Sample)", svm_model, X_small, y_small, X_val_scaled, y_val)

  y = column_or_1d(y, warn=True)



=== SVM (Balanced Sample) ===
---- Train Metrics ----
[[3370 1219]
 [1400 4011]]
              precision    recall  f1-score   support

           0       0.71      0.73      0.72      4589
           1       0.77      0.74      0.75      5411

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000

Train Accuracy: 0.7381
Train MCC: 0.4745
---- Validation Metrics ----
[[210982  71328]
 [104306 236773]]
              precision    recall  f1-score   support

           0       0.67      0.75      0.71    282310
           1       0.77      0.69      0.73    341079

    accuracy                           0.72    623389
   macro avg       0.72      0.72      0.72    623389
weighted avg       0.72      0.72      0.72    623389

Validation Accuracy: 0.7183
Validation MCC: 0.4396


In [None]:
# change the kernel

In [None]:
# RBF-kernel SVM trained on the subsample (X_small, y_small) and evaluated on
# the subsample itself plus the full scaled validation set.
svm_model = SVC(kernel='rbf')

# y_small is a single-column DataFrame; flatten it to 1-D so fit() does not
# raise a DataConversionWarning.
svm_model.fit(X_small, y_small.to_numpy().ravel())

evaluate_model("SVM (Balanced Sample (rbf) )", svm_model, X_small, y_small, X_val_scaled, y_val)

  y = column_or_1d(y, warn=True)



=== SVM (Balanced Sample (rbf) ) ===
---- Train Metrics ----
[[4256  333]
 [ 279 5132]]
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      4589
           1       0.94      0.95      0.94      5411

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000

Train Accuracy: 0.9388
Train MCC: 0.8767
---- Validation Metrics ----
[[250070  32240]
 [ 17811 323268]]
              precision    recall  f1-score   support

           0       0.93      0.89      0.91    282310
           1       0.91      0.95      0.93    341079

    accuracy                           0.92    623389
   macro avg       0.92      0.92      0.92    623389
weighted avg       0.92      0.92      0.92    623389

Validation Accuracy: 0.9197
Validation MCC: 0.8382


In [None]:
# increase the sample size

In [None]:
# Draw a larger, 50,000-row subsample of the scaled training data, stratified
# on the label; the remaining rows returned by the split are discarded.
X_small, _, y_small, _ = train_test_split(
    X_train_scaled,
    y_train,
    train_size=50_000,
    stratify=y_train,
    random_state=42,
)

In [None]:
# RBF-kernel SVM retrained on the current (X_small, y_small) subsample and
# evaluated on that subsample plus the full scaled validation set.
svm_model = SVC(kernel='rbf')

# y_small is a single-column DataFrame; flatten it to 1-D so fit() does not
# raise a DataConversionWarning.
svm_model.fit(X_small, y_small.to_numpy().ravel())

evaluate_model("SVM (Balanced Sample (rbf & increased samples))", svm_model, X_small, y_small, X_val_scaled, y_val)

  y = column_or_1d(y, warn=True)



=== SVM (Balanced Sample (rbf & increased samples)) ===
---- Train Metrics ----
[[21712  1235]
 [ 1002 26051]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.95     22947
           1       0.95      0.96      0.96     27053

    accuracy                           0.96     50000
   macro avg       0.96      0.95      0.95     50000
weighted avg       0.96      0.96      0.96     50000

Train Accuracy: 0.9553
Train MCC: 0.9099
---- Validation Metrics ----
[[259352  22958]
 [ 12530 328549]]
              precision    recall  f1-score   support

           0       0.95      0.92      0.94    282310
           1       0.93      0.96      0.95    341079

    accuracy                           0.94    623389
   macro avg       0.94      0.94      0.94    623389
weighted avg       0.94      0.94      0.94    623389

Validation Accuracy: 0.9431
Validation MCC: 0.8853


# Cross validation

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, matthews_corrcoef

In [None]:
# Base estimator for the hyper-parameter search.
# - `use_label_encoder` is omitted: the installed xgboost reports it as unused.
# - random_state is fixed so the row/column subsampling is seeded the same way
#   as in the final model.
# - XGBoost gets all cores (n_jobs=-1); GridSearchCV below runs its candidates
#   sequentially so the two levels of parallelism do not fight over threads.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    verbosity=0,
    n_jobs=-1,
    random_state=42,
)

# Candidate values; every combination is evaluated (3 * 2 * 2 * 2 = 24 candidates).
param_grid = {
    'max_depth': [6, 10, 14],
    'min_child_weight': [5, 7],
    'gamma': [1e-6, 1e-4],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.6],
    'reg_alpha': [0.1],
    'n_estimators': [50],
}

# 3-fold stratified CV with a fixed shuffle seed, scored by Matthews correlation coefficient.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
mcc_scorer = make_scorer(matthews_corrcoef)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=mcc_scorer,
    n_jobs=1,  # parallelism is delegated to XGBoost (see above)
    cv=cv,
    verbose=2,
)

In [None]:
# Run the grid search on the cross-validation features/labels (X_CV, y_CV).
grid_search.fit(X_CV, y_CV)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
# Report the winning hyper-parameter combination and its mean CV score.
search_summary = (
    ("Best parameters:", grid_search.best_params_),
    ("Best cross-validation score:", grid_search.best_score_),
)
for summary_label, summary_value in search_summary:
    print(summary_label, summary_value)

Best parameters: {'colsample_bytree': 0.6, 'gamma': 0.0001, 'max_depth': 14, 'min_child_weight': 5, 'n_estimators': 50, 'reg_alpha': 0.1, 'subsample': 0.8}
Best cross-validation score: 0.9797767095341166


In [None]:
# Read the "_W_outliers" cross-validation features and labels from output_dir
# (features first, then labels).
X_CV_W_outliers, y_CV_W_outliers = (
    pd.read_csv(os.path.join(output_dir, csv_name))
    for csv_name in ('X_CV_W_outliers.csv', 'y_CV_W_outliers.csv')
)

In [None]:
# Final model: the hyper-parameters reported as best by the grid search,
# trained on the "_W_outliers" cross-validation data.
# `use_label_encoder` is omitted because the installed xgboost reports it as
# unused ('Parameters: { "use_label_encoder" } are not used.').
# NOTE(review): the grid search tuned these values with n_estimators=50, while
# this model uses 200 trees — confirm the other settings still hold at that size.
final_xgb_model = XGBClassifier(
    colsample_bytree=0.6,
    gamma=0.0001,
    max_depth=14,
    min_child_weight=5,
    n_estimators=200,
    reg_alpha=0.1,
    subsample=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
)
final_xgb_model.fit(X_CV_W_outliers, y_CV_W_outliers)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Predict on the test features; 'id' is an identifier, not a model input.
test_predictions = final_xgb_model.predict(X_test.drop(columns=['id']))

# Translate the numeric class predictions to the submission labels.
# The mapping is applied to the prediction column only: a frame-wide
# replace({0: 'e', 1: 'p'}) would also rewrite any 'id' equal to 0 or 1.
label_names = {0: 'e', 1: 'p'}
submission = pd.DataFrame({
    'id': X_test['id'],
    # Reuse X_test's index so predictions stay aligned with their ids.
    'prediction': pd.Series(test_predictions, index=X_test.index).map(label_names),
})
submission.to_csv("submission_CV_best_all_W_outliers.csv", index=False)