In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix, f1_score

In [3]:
# 1. Load datasets
train_df = pd.read_csv("../yoges/Desktop/Deerwalk/dataset/TASK_2/TASK_2/train_set.csv")
test_df = pd.read_csv("../yoges/Desktop/Deerwalk/dataset/TASK_2/TASK_2/test_set.csv")
blind_df = pd.read_csv("../yoges/Desktop/Deerwalk/dataset/TASK_2/TASK_2/blinded_test_set.csv")

In [5]:
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Blind shape: {blind_df.shape}")

Train shape: (315, 3240)
Test shape: (100, 3240)
Blind shape: (36, 3239)


In [7]:
# 2. Missing and infinite value summary function
def summary_missing_inf(df, df_name):
    print(f"--- {df_name} ---")
    total_elements = df.size
    missing = df.isnull().sum().sum()
    infinite = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    print(f"Missing values: {missing} ({missing/total_elements*100:.2f}%)")
    print(f"Infinite values: {infinite}\n")

In [9]:
summary_missing_inf(train_df, "Train")
summary_missing_inf(test_df, "Test")
summary_missing_inf(blind_df, "Blind")

--- Train ---
Missing values: 2668 (0.26%)
Infinite values: 4

--- Test ---
Missing values: 1127 (0.35%)
Infinite values: 0

--- Blind ---
Missing values: 276 (0.24%)
Infinite values: 0



In [11]:
# 3. Separate features and target
X_train = train_df.drop(columns=['ID', 'CLASS'])
y_train = train_df['CLASS']
X_test = test_df.drop(columns=['ID', 'CLASS'])
y_test = test_df['CLASS']
X_blind = blind_df.drop(columns=['ID'])

In [13]:
# 4. Replace infinite values with NaN for all datasets to handle later
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_blind.replace([np.inf, -np.inf], np.nan, inplace=True)

In [15]:
# 5. KNN imputation for selected features that need it (customize list per your data)
knn_features = ['Feature_1712', 'Feature_1713', 'Feature_1714', 'Feature_1715',
                'Feature_1716', 'Feature_1717', 'Feature_1718', 'Feature_1719',
                'Feature_1725', 'Feature_1729']
scaler_knn = StandardScaler()
knn_imputer = KNNImputer(n_neighbors=5)

In [17]:
# Impute train KNN features
subset_train = X_train[knn_features]
subset_train_scaled = scaler_knn.fit_transform(subset_train)
subset_train_imputed = scaler_knn.inverse_transform(knn_imputer.fit_transform(subset_train_scaled))
X_train.loc[:, knn_features] = subset_train_imputed


In [19]:
# Impute test and blind KNN features
for X in [X_test, X_blind]:
    subset = X[knn_features]
    subset_scaled = scaler_knn.transform(subset)
    subset_imputed = scaler_knn.inverse_transform(knn_imputer.transform(subset_scaled))
    X.loc[:, knn_features] = subset_imputed


In [21]:
# 6. Mean imputation for remaining features
num_cols = X_train.select_dtypes(include=[np.number]).columns
mean_imputer = SimpleImputer(strategy='mean')
X_train.loc[:, num_cols] = mean_imputer.fit_transform(X_train[num_cols])
X_test.loc[:, num_cols] = mean_imputer.transform(X_test[num_cols])
X_blind.loc[:, num_cols] = mean_imputer.transform(X_blind[num_cols])


In [23]:
# Verify no missing
print(f"Missing after imputation - Train: {X_train.isnull().sum().sum()}, Test: {X_test.isnull().sum().sum()}, Blind: {X_blind.isnull().sum().sum()}")


Missing after imputation - Train: 0, Test: 0, Blind: 0


In [25]:
# 7. Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_blind_scaled = scaler.transform(X_blind)

In [27]:
# 8. Feature selection - Select top 100 features by mutual information
selector = SelectKBest(mutual_info_classif, k=100)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)
X_blind_selected = selector.transform(X_blind_scaled)


In [29]:
# 9. Handle class imbalance with SMOTE on train data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_selected, y_train)

In [31]:
# 10. Models and hyperparameter grids
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)
}
param_grids = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l2']},
    'Random Forest': {'n_estimators': [50,100,200], 'max_depth': [3,5,10], 'min_samples_split':[2,5]},
    'XGBoost': {
        'n_estimators': [50,100,200],
        'max_depth': [3,5,7],
        'learning_rate': [0.01,0.1,0.3],
        'gamma': [0,0.1],
        'reg_lambda': [1,10]
    }
}

In [33]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [35]:
# 11. Function for specificity
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp) if (tn + fp) > 0 else 0

In [37]:
# 12. Metric evaluation function
def evaluate_metrics(y_true, y_pred, y_proba):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'AUROC': roc_auc_score(y_true, y_proba),
        'Sensitivity': recall_score(y_true, y_pred),
        'Specificity': specificity_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred)
    }

In [45]:
# 13. Train / tune / evaluate models and save predictions
results = []
output_dir = 'predictions'
os.makedirs(output_dir, exist_ok=True)
for name, model in models.items():
    print(f"\nTraining and tuning {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=cv, scoring='f1', n_jobs=-1, verbose=1)
    grid.fit(X_train_res, y_train_res)
    best_model = grid.best_estimator_
    print(f"Best params for {name}: {grid.best_params_}")
    
    # Evaluate on train (resampled)
    y_train_pred = best_model.predict(X_train_res)
    y_train_proba = best_model.predict_proba(X_train_res)[:, 1]
    train_metrics = evaluate_metrics(y_train_res, y_train_pred, y_train_proba)
    train_metrics.update({'Model': name, 'Dataset': 'Train (Resampled)', 'Best_Params': grid.best_params_})
    
    # Evaluate on test
    y_test_pred = best_model.predict(X_test_selected)
    y_test_proba = best_model.predict_proba(X_test_selected)[:, 1]
    test_metrics = evaluate_metrics(y_test, y_test_pred, y_test_proba)
    test_metrics.update({'Model': name, 'Dataset': 'Test', 'Best_Params': grid.best_params_})
    
    results.append(train_metrics)
    results.append(test_metrics)
    


Training and tuning Logistic Regression...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params for Logistic Regression: {'C': 0.01, 'penalty': 'l2'}

Training and tuning Random Forest...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best params for Random Forest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}

Training and tuning XGBoost...
Fitting 5 folds for each of 108 candidates, totalling 540 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for XGBoost: {'gamma': 0, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 200, 'reg_lambda': 1}


In [48]:
# Save predictions for train, test, and blinded test datasets
for dataset_label, X_data_sel, orig_df in [('train', X_train_selected, train_df),
                                               ('test', X_test_selected, test_df),
                                               ('blinded_test', X_blind_selected, blind_df)]:
        proba = best_model.predict_proba(X_data_sel)
        proba_df = pd.DataFrame(proba, columns=[f'Class_{i}_Prob' for i in range(proba.shape[1])])
        proba_df['ID'] = orig_df['ID'].reset_index(drop=True)
        proba_df = proba_df[['ID'] + [f'Class_{i}_Prob' for i in range(proba.shape[1])]]
        file_path = os.path.join(output_dir, f"{name.lower().replace(' ', '_')}_{dataset_label}_predictions.csv")
        proba_df.to_csv(file_path, index=False)
        print(f"Saved predictions: {file_path}")

Saved predictions: predictions\xgboost_train_predictions.csv
Saved predictions: predictions\xgboost_test_predictions.csv
Saved predictions: predictions\xgboost_blinded_test_predictions.csv


In [50]:
# 14. Save metrics results
results_df = pd.DataFrame(results)
results_csv_path = 'model_evaluation_metrics.csv'
results_df.to_csv(results_csv_path, index=False)
print("\nMetrics evaluation summary saved to:", results_csv_path)
print(results_df)


Metrics evaluation summary saved to: model_evaluation_metrics.csv
   Accuracy     AUROC  Sensitivity  Specificity  F1-Score  \
0  0.675393  0.741071     0.654450     0.696335  0.668449   
1  0.620000  0.689245     0.595238     0.637931  0.568182   
2  0.997382  1.000000     0.994764     1.000000  0.997375   
3  0.590000  0.659278     0.404762     0.724138  0.453333   
4  1.000000  1.000000     1.000000     1.000000  1.000000   
5  0.590000  0.595238     0.357143     0.758621  0.422535   

                 Model            Dataset  \
0  Logistic Regression  Train (Resampled)   
1  Logistic Regression               Test   
2        Random Forest  Train (Resampled)   
3        Random Forest               Test   
4              XGBoost  Train (Resampled)   
5              XGBoost               Test   

                                         Best_Params  
0                       {'C': 0.01, 'penalty': 'l2'}  
1                       {'C': 0.01, 'penalty': 'l2'}  
2  {'max_depth': 10, 'mi