# Hyper-Parameter Tuning

# Imports

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
import joblib
import os

# Load Datasets

In [19]:
datasets = {
    'Frequency': pd.read_csv('Data/Processed Data/frequency_encoded_data.csv'),
    'One-Hot': pd.read_csv('Data/Processed Data/one_hot_encoded_data.csv'),
    'Target': pd.read_csv('Data/Processed Data/target_encoded_data.csv')
}

# Tuning

## Param-grid

In [20]:
xgb_params = {
    'scale_pos_weight': [1, 3, 6, 9, 12, 15],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [50,100, 200]
}

ada_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'n_estimators': [50,100, 200]
}

svm_params = {
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1],
    'class_weight': [{0:1, 1:w} for w in [1, 3, 6, 9, 12, 15]]
}

## Tuning function

In [21]:
def tune_models(X_train, X_test, y_train, y_test, use_smote=False):
    if use_smote:
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)
    
    models = {
        'XGBoost': (xgb.XGBClassifier(random_state=42), xgb_params),
        'AdaBoost': (AdaBoostClassifier(algorithm='SAMME',random_state=42), ada_params),
        'SVM': (SVC(kernel='linear',probability=True, random_state=42), svm_params)
    }
    
    results = {}
    for name, (model, params) in models.items():
        grid = GridSearchCV(model, params, scoring='f1', cv=5)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)
        results[name] = {
            'best_params': grid.best_params_,
            'best_score': grid.best_score_,
            'test_f1': f1_score(y_test, y_pred)
        }
    
    return results

## Execution

In [22]:
# Store best models and scores
best_overall = {
    'score': 0,
    'model': None,
    'params': None,
    'encoding': None,
    'smote': None
}

# Evaluate all combinations
for enc_name, data in datasets.items():
    X = data.drop('Class/ASD', axis=1)
    y = data['Class/ASD']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for smote in [False, True]:
        results = tune_models(X_train, X_test, y_train, y_test, smote)
        
        print(f"\nResults for {enc_name} encoding {'with' if smote else 'without'} SMOTE:")
        for model_name, result in results.items():
            print(f"\n{model_name}:")
            print(f"Best Parameters: {result['best_params']}")
            print(f"CV F1-Score: {result['best_score']:.4f}")
            print(f"Test F1-Score: {result['test_f1']:.4f}")
            
            # Update best overall model
            if result['test_f1'] > best_overall['score']:
                best_overall.update({
                    'score': result['test_f1'],
                    'model': model_name,
                    'params': result['best_params'],
                    'encoding': enc_name,
                    'smote': smote
                })

# Save best model
print("\nBest Overall Model:")
print(f"Model: {best_overall['model']}")
print(f"Encoding: {best_overall['encoding']}")
print(f"SMOTE: {best_overall['smote']}")
print(f"F1-Score: {best_overall['score']:.4f}")

os.makedirs('Models', exist_ok=True)

# Train and save the best model
X = datasets[best_overall['encoding']].drop('Class/ASD', axis=1)
y = datasets[best_overall['encoding']]['Class/ASD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

if best_overall['model'] == 'XGBoost':
    best_model = xgb.XGBClassifier(random_state=42, **best_overall['params'])
elif best_overall['model'] == 'AdaBoost':
    best_model = AdaBoostClassifier(algorithm='SAMME',random_state=42, **best_overall['params'])
else:
    best_model = SVC(kernel='linear',probability=True, random_state=42, **best_overall['params'])

if best_overall['smote']:
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

best_model.fit(X_train, y_train)

# Save model and configuration
model_info = {
    'model': best_model,
    'encoding': best_overall['encoding'],
    'smote': best_overall['smote'],
    'params': best_overall['params']
}
joblib.dump(model_info, 'Models/best_model.joblib')


Results for Frequency encoding without SMOTE:

XGBoost:
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'scale_pos_weight': 3}
CV F1-Score: 0.6824
Test F1-Score: 0.6842

AdaBoost:
Best Parameters: {'learning_rate': 1.0, 'n_estimators': 100}
CV F1-Score: 0.6457
Test F1-Score: 0.6667

SVM:
Best Parameters: {'C': 0.1, 'class_weight': {0: 1, 1: 3}, 'gamma': 0.001}
CV F1-Score: 0.6540
Test F1-Score: 0.7059

Results for Frequency encoding with SMOTE:

XGBoost:
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'scale_pos_weight': 6}
CV F1-Score: 0.9002
Test F1-Score: 0.6234

AdaBoost:
Best Parameters: {'learning_rate': 1.0, 'n_estimators': 50}
CV F1-Score: 0.8638
Test F1-Score: 0.7059

SVM:
Best Parameters: {'C': 10, 'class_weight': {0: 1, 1: 3}, 'gamma': 0.001}
CV F1-Score: 0.8546
Test F1-Score: 0.6286

Results for One-Hot encoding without SMOTE:

XGBoost:
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'sc

['Models/best_model.joblib']