# Best SVC Model for ASD Detection 
### based on our Target Metric F2-Score

### Import Libraries needed for Modelling

In [1]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 

from xgboost import XGBClassifier

# import own modules
sys.path.append("..")  # Adds higher directory to python modules path.
from scripts import features as ft
from scripts import preprocessing as pp
from scripts import evaluate_models as em

# plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')

import pickle

### Load the Dataframe
- Calculate the DF or load it form path
- Drop first batch of useless columns
- Set id as an index
- distinguish between numerical and categorical Columns

In [2]:
# path to csv file
path_df = os.path.join("..", "data", "df_sam_resnet.csv")

# get features - or recalculate
recalculate_df = False
if os.path.isfile(path_df) and not recalculate_df:
    df = pd.read_csv(path_df)
else:
    df = ft.get_features()
    df.to_csv(path_df, index=False)

# set id as index
df = df.set_index("id", drop=True)

# drop first batch of useless variables
df = df.drop(columns=['img', 'sp_idx'])
df = df.drop(columns=[col for col in df.columns if "_obj" in col])  # drop 'object' columns

# find numerical and categorical columns
num_cols = df.columns[df.dtypes != "object"]
cat_cols = df.columns[df.dtypes == "object"]

# print info
print(f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} columns")
print(f" -> there are {len(num_cols)} numerical columns")
print(f" -> there are {len(cat_cols)} categoricals columns")

 -> dataframe has 7598 instances and 34 columns
 -> there are 34 numerical columns
 -> there are 0 categoricals columns


### Alter the DF a bit more
- We only take Participants lower equal 5000 (5 Seconds of Fixation Duration per Picture usually the Picture is only shown for 3 Seconds to the Participant)

- Get rid of some Features, which have proven to not give us any value for the Research

In [3]:
# processing
df = df[df["sp_fix_duration_ms_total"] <= 5000]
df = df.drop(columns=['sal_first_above_0.75*max_rank', 'sal_first_above_0.9*max_rank'])

# find numerical and categorical columns
num_cols = df.columns[df.dtypes != "object"]
cat_cols = df.columns[df.dtypes == "object"]

# print info
print(f" -> dataframe has {df.shape[0]} instances and {df.shape[1]} columns")
print(f" -> there are {len(num_cols)} numerical columns")
print(f" -> there are {len(cat_cols)} categoricals columns")

 -> dataframe has 7217 instances and 32 columns
 -> there are 32 numerical columns
 -> there are 0 categoricals columns


### The Features for Best SVC Model

- I put them in here Hardcoded
- all the selection and Research Work was done in other Notebooks
- I Provide 2 lists of Features which u could choose 1 from
- because Model runs equally good with 11 or 10 Features

In [4]:
# 11 Features List (optional)

feature_list = ["sp_fix_duration_ms_total","sp_fix_duration_ms_mean","sp_fix_duration_ms_var", "sal_first_fixation","sal_sum","sal_KLD", "obj_t_abs_on_background","obj_t_abs_on_animate", "obj_n_fix_background","obj_n_fix_inanimate","obj_n_fix_animate","asd"]

# 10 Features List same as above without "sp_fix_duration_ms_var"

# feature_list = ["sp_fix_duration_ms_total","sp_fix_duration_ms_mean", "sal_first_fixation","sal_sum","sal_KLD", "obj_t_abs_on_background","obj_t_abs_on_animate", "obj_n_fix_background","obj_n_fix_inanimate","obj_n_fix_animate","asd"]

# Model Preparation

### Split into train & test sets

- we defined our own Function "pp.split" and Test Set

- This ensures we can do better Error Analysis on specific Pictures later


In [None]:
# prepare features and target
X = df[feature_list]
y = X.pop("asd")

# train-test-split
X_train, X_test, y_train, y_test = pp.split(X, y)

# print info
print(f"train-set has '{len(y_train)}' samples & '{X.shape[1]}' features")
print(f"test-set has '{len(y_test)}' samples - out of '{df.shape[0]}'")
print(f"  ~ {len(y_test) / df.shape[0] * 100:.2f}% of full dataset")

train-set has '6485' samples & '11' features
test-set has '732' samples - out of '7217'
  ~ 10.14% of full dataset


### Set Variables and Columns for Pipeline
- define our scoring metric (make_scorer function)
- Set Variables for our Model
- Behavior for saving Models as pickle File later

In [None]:
# metric
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# defaults
RSEED = 42
cv = 10
n_jobs = -1
verbose = 1


# find numerical and categorical columns
num_cols = X_train.columns[X_train.dtypes != "object"]
cat_cols = X_train.columns[X_train.dtypes == "object"]

### Pipeline & Grid Search CV
- Defining the SVC Pipeline
- Define the Grid Search Grid we use

In [None]:
# The Column Transformer and Operations to do on Columns
# add other transformations at the end if needed
transformer = [("scaler", StandardScaler(), num_cols),
               ("ohe", OneHotEncoder(drop="first"), cat_cols  )]

# Add our transformer to a ColumnTransformer Object               
preprocessing = ColumnTransformer(transformer,
                                  remainder="passthrough")


# The Pipeline for the SVC Model
# Support Vector Classifier: apply scaling / encoding
svc_pipeline = Pipeline([
    ("preprocessor", preprocessing),
    ("classifier",SVC(probability=True))
])


# Support Vector Classifier Parameter Grid
param_grid_svc = {
    'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel types to try
    'classifier__C': [0.1, 1, 10, 100],  # Regularization parameter values
    'classifier__gamma': ['scale', 'auto'],  # Gamma parameter for RBF kernel
    'classifier__degree': [2, 3, 4]  # Degree of the polynomial kernel (only for poly kernel)
}

### Fit the GridSearchCV with our Gridi and Pipeline
- Define GridSearchCV Model parameters
- Fit on Training Data
- Get the best parameters and estimators


In [None]:
# Create GridSearchCV object
grid_search_svc = GridSearchCV(
    svc_pipeline,
    param_grid=param_grid_svc,
    cv=cv,
    scoring=ftwo_scorer,
    n_jobs=n_jobs,
    verbose=verbose,
)

# Fit the Model
grid_search_svc.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_svc = grid_search_svc.best_params_
best_est_svc = grid_search_svc.best_estimator_
print("Best params for SVC are:", best_params_svc)
print("Best est for SVC are:", best_est_svc)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


  pid = os.fork()


  pid = os.fork()


### Predict on Test and Train Set for Evaluation
- 1 Prediction as Classification - 0 or 1
- 1 Predict Proba - gets back the Probability for a Class

In [None]:
# predict & proba
pred_test = grid_search_svc.predict(X_test)
proba_test = grid_search_svc.predict_proba(X_test)

pred_train = grid_search_svc.predict(X_train)
proba_train = grid_search_svc.predict_proba(X_train)

### Evaluate the Model with own Functions
- em.report gives back a full Evaluation Report
- Learning Curves show the Behaviour of the Model (e.g over/ underfitting)

In [None]:
# evaluate model
em.report(
    y_train=y_train,
    y_train_pred=pred_train,
    y_train_proba=proba_train,
    y_test=y_test,
    y_test_pred=pred_test,
    y_test_proba=proba_test,
)

In [None]:
# learning curves - for one model - f2 score
em.learning(best_est_svc, X_train, y_train)

In [None]:
em.model_info(best_est_svc)

### Safe Model as Pickle File

In [None]:
# Save the best estimator to a pickle file
with open('best_svc_resnet.pkl', 'wb') as file:
    pickle.dump(best_est_svc, file)