# *Toxin*
Here, we train each classifier on one feature at a time and calculate performance metrics for every classifier–feature pair.

In [1]:

# Importing Required Libraries
from sklearn.ensemble import (
    ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier,
    BaggingClassifier, AdaBoostClassifier, StackingClassifier, VotingClassifier
)
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, cohen_kappa_score



In [2]:
# Silence every warning raised from here on
import warnings

warnings.filterwarnings('ignore')


# Data Preparation: read the training table and the held-out table
df_train, df_validation = (
    pd.read_csv(csv_path) for csv_path in ("../train_data.csv", "../test_data.csv")
)

# Feature Isolation
# Predictors: columns from position 1 up to (excluding) the last 9170 columns.
# Target: the last column.
feature_columns = slice(1, -9170)
x_train, y_train = df_train.iloc[:, feature_columns], df_train.iloc[:, -1]
x_validation, y_validation = df_validation.iloc[:, feature_columns], df_validation.iloc[:, -1]

# creation performance measure function
def perf_measure(y_actual, y_hat, thr=0.50):
    """Compute binary-classification metrics from true labels and positive-class scores.

    Parameters
    ----------
    y_actual : array-like
        True labels. Entries equal to 1 count as positives, entries equal to 0
        as negatives; any other value is left out of the confusion counts.
    y_hat : array-like
        Scores for the positive class. A score ``>= thr`` is a positive
        prediction, a score ``< thr`` a negative one.
    thr : float, default 0.50
        Decision threshold applied to ``y_hat``.

    Returns
    -------
    tuple
        ``(TP, FP, TN, FN, sens, spec, acc, auc1, auprc, F1, kappa, mcc)``.
        ``sens``, ``spec`` and ``acc`` are percentages (0-100). ``auc1`` and
        ``auprc`` are computed from the raw scores; ``F1`` and ``kappa`` from
        the thresholded predictions. Any ratio whose denominator is zero is
        reported as 0.
    """
    y_actual = np.asarray(y_actual)
    y_hat = np.asarray(y_hat)

    is_pos = y_actual == 1
    is_neg = y_actual == 0
    pred_pos = y_hat >= thr
    pred_neg = y_hat < thr

    # Confusion-matrix counts as plain Python ints (so the MCC product below
    # cannot overflow a fixed-width integer).
    TP = int(np.count_nonzero(is_pos & pred_pos))
    FP = int(np.count_nonzero(is_neg & pred_pos))
    TN = int(np.count_nonzero(is_neg & pred_neg))
    FN = int(np.count_nonzero(is_pos & pred_neg))

    binder = TP + FN
    nonb = TN + FP
    total = TP + TN + FP + FN

    # Hard 0/1 predictions for the threshold-dependent sklearn metrics.
    Pred = pred_pos.astype(int)

    sens = (TP / binder) * 100 if binder != 0 else 0
    spec = TN / nonb * 100 if nonb != 0 else 0
    acc = ((TP + TN) / total) * 100 if total != 0 else 0

    F1 = f1_score(y_actual, Pred, zero_division=0)
    auc1 = roc_auc_score(y_actual, y_hat)
    auprc = average_precision_score(y_actual, y_hat)
    kappa = cohen_kappa_score(Pred, y_actual)

    mcc_denominator = (TP + FN) * (TP + FP) * (TN + FP) * (TN + FN)
    if mcc_denominator != 0:
        mcc = (TP * TN - FP * FN) / mcc_denominator ** 0.5
    else:
        mcc = 0

    return (TP, FP, TN, FN, sens, spec, acc, auc1, auprc, F1, kappa, mcc)


# Classifier Initialization
# Every estimator here must expose predict_proba, which the loop below calls.
# NOTE(review): `loss='log'` (SGDClassifier) and `use_label_encoder`
# (XGBClassifier) are presumably accepted only by some library releases —
# confirm against the installed scikit-learn / xgboost versions before re-running.
classifiers = {

    "RandomForest": RandomForestClassifier(n_jobs=-1,  random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "SGDClassifier": SGDClassifier(loss='log', random_state=42),
    "SVC": SVC(probability=True, random_state=42),
    "GaussianNB": GaussianNB(),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
    "MLPClassifier": MLPClassifier(random_state=42),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=42),
    "BaggingClassifier": BaggingClassifier(random_state=42),
    "LinearSVC": CalibratedClassifierCV(LinearSVC(random_state=42)), # Wrapped with CalibratedClassifierCV
    "NuSVC": NuSVC(probability=True, random_state=42),
    "BernoulliNB": BernoulliNB(),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
#    "GaussianProcessClassifier": GaussianProcessClassifier(random_state=42), #take too much time
#    "LabelPropagation": LabelPropagation(),
    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "ExtraTrees": ExtraTreesClassifier(n_jobs=-1, criterion='entropy', n_estimators=200, random_state=42),
}

# Initialize Metrics Data Dictionary
metrics_data = {}

# Running 5-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Column labels, in the order perf_measure returns its values
metric_columns = ["TP", "FP", "TN", "FN", "Sens", "Spec", "Accuracy", "AUC", "AUPRC", "F1", "Kappa", "MCC"]

# Looping Over Classifiers and Features
for classifier_name, classifier in classifiers.items():
    print(classifier_name)
    test_model = []
    validation_model = []
    for i in range(x_train.shape[1]):
        # Extracting single feature from training data as an (n_samples, 1) array
        X_single_feature = x_train.iloc[:, i].values.reshape(-1, 1)

        test_metrics = []
        # 5 fold-cv
        for train_index, test_index in skf.split(X_single_feature, y_train):
            X_fold_train, X_fold_test = X_single_feature[train_index], X_single_feature[test_index]
            # skf.split yields positions, so select labels positionally (.iloc);
            # plain [] would look the numbers up as index labels.
            y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]
            classifier.fit(X_fold_train, y_fold_train)

            # Predict on the held-out fold; column 1 holds the positive-class probability
            y_pred_test = classifier.predict_proba(X_fold_test)
            metrics_test = perf_measure(y_fold_test.to_numpy(), y_pred_test[:, 1])
            test_metrics.append(metrics_test)

        # Taking mean of 5-fold testing data
        test_df = pd.DataFrame(test_metrics).mean(axis=0)
        test_model.append(np.array(test_df))

        # Refit on the complete training column before scoring the validation
        # set; otherwise the predictions would come from the model left over
        # from the last CV fold, which saw only part of the training data.
        classifier.fit(X_single_feature, y_train)

        # Extracting single feature from validation data
        X_single_feature_validation = x_validation.iloc[:, i].values.reshape(-1, 1)
        y_pred_validation = classifier.predict_proba(X_single_feature_validation)
        metrics_validation = perf_measure(y_validation.to_numpy(), y_pred_validation[:, 1])
        validation_model.append(metrics_validation)

    # Create DataFrames (one row per feature) for the CV-averaged and validation metrics
    test_metrics_df = pd.DataFrame(np.array(test_model),
        columns=metric_columns,
        index=list(x_train.columns))

    validation_metrics_df = pd.DataFrame(np.array(validation_model),
        columns=metric_columns,
        index=list(x_train.columns))

    # Store metrics DataFrames in a dictionary
    metrics_data[classifier_name] = {
        "Testing": test_metrics_df,
        "validation": validation_metrics_df
    }

# Now you can access the metrics for each classifier and feature using metrics_data dictionary.


RandomForest
GradientBoosting
LogisticRegression
SGDClassifier
SVC
GaussianNB
DecisionTreeClassifier
MLPClassifier
AdaBoostClassifier
BaggingClassifier
LinearSVC
NuSVC
BernoulliNB
LinearDiscriminantAnalysis
QuadraticDiscriminantAnalysis
XGBClassifier
ExtraTrees


In [3]:
# Specify the directory and filename where you want to save the metrics data
metrics_data_dir = "../metrics_data/"
metrics_data_filename = "metrics_data_all_feature.csv"  # not referenced by the code in this cell

# Create the directory if it doesn't exist (exist_ok avoids a separate existence check)
import os
os.makedirs(metrics_data_dir, exist_ok=True)

# Loop through the metrics_data dictionary and save each DataFrame to a CSV file
for classifier_name, metrics in metrics_data.items():
    # One sub-directory per classifier, created once rather than once per metric type
    classifier_dir = os.path.join(metrics_data_dir, classifier_name)
    os.makedirs(classifier_dir, exist_ok=True)

    for metric_type, metric_df in metrics.items():
        # The dictionary key becomes the file-name prefix
        csv_file_path = os.path.join(classifier_dir, f"{metric_type}_metrics_all_feature.csv")

        # Save the DataFrame to a CSV file
        metric_df.to_csv(csv_file_path)

print("Metrics data saved successfully!")

Metrics data saved successfully!


In [1]:
## Combined code: metric computation and CSV export in a single cell

In [None]:
# Suppress all warnings for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')

# creation performance measure function
def perf_measure(y_actual, y_hat, thr=0.50):
    """Compute binary-classification metrics from true labels and positive-class scores.

    Parameters
    ----------
    y_actual : array-like
        True labels. Entries equal to 1 count as positives, entries equal to 0
        as negatives; any other value is left out of the confusion counts.
    y_hat : array-like
        Scores for the positive class. A score ``>= thr`` is a positive
        prediction, a score ``< thr`` a negative one.
    thr : float, default 0.50
        Decision threshold applied to ``y_hat``.

    Returns
    -------
    tuple
        ``(TP, FP, TN, FN, sens, spec, acc, auc1, auprc, F1, kappa, mcc)``.
        ``sens``, ``spec`` and ``acc`` are percentages (0-100). ``auc1`` and
        ``auprc`` are computed from the raw scores; ``F1`` and ``kappa`` from
        the thresholded predictions. Any ratio whose denominator is zero is
        reported as 0.
    """
    y_actual = np.asarray(y_actual)
    y_hat = np.asarray(y_hat)

    is_pos = y_actual == 1
    is_neg = y_actual == 0
    pred_pos = y_hat >= thr
    pred_neg = y_hat < thr

    # Confusion-matrix counts as plain Python ints (so the MCC product below
    # cannot overflow a fixed-width integer).
    TP = int(np.count_nonzero(is_pos & pred_pos))
    FP = int(np.count_nonzero(is_neg & pred_pos))
    TN = int(np.count_nonzero(is_neg & pred_neg))
    FN = int(np.count_nonzero(is_pos & pred_neg))

    binder = TP + FN
    nonb = TN + FP
    total = TP + TN + FP + FN

    # Hard 0/1 predictions for the threshold-dependent sklearn metrics.
    Pred = pred_pos.astype(int)

    sens = (TP / binder) * 100 if binder != 0 else 0
    spec = TN / nonb * 100 if nonb != 0 else 0
    acc = ((TP + TN) / total) * 100 if total != 0 else 0

    F1 = f1_score(y_actual, Pred, zero_division=0)
    auc1 = roc_auc_score(y_actual, y_hat)
    auprc = average_precision_score(y_actual, y_hat)
    kappa = cohen_kappa_score(Pred, y_actual)

    mcc_denominator = (TP + FN) * (TP + FP) * (TN + FP) * (TN + FN)
    if mcc_denominator != 0:
        mcc = (TP * TN - FP * FN) / mcc_denominator ** 0.5
    else:
        mcc = 0

    return (TP, FP, TN, FN, sens, spec, acc, auc1, auprc, F1, kappa, mcc)

# Specify the directory and filename where you want to save the metrics data
metrics_data_dir = "../metrics_data/"
metrics_data_filename = "metrics_data_all_feature.csv"  # not referenced by the code in this cell

# Create the directory if it doesn't exist (exist_ok avoids a separate existence check)
import os
os.makedirs(metrics_data_dir, exist_ok=True)


# Classifier Initialization
# Only the two tree ensembles are active in this cell; the remaining entries are
# kept commented out. Every active estimator must expose predict_proba.
classifiers = {
    "RandomForest": RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_split=5,
                       n_estimators=200, n_jobs=-1, random_state=1),
    "ExtraTrees": ExtraTreesClassifier(n_jobs=-1, criterion='entropy', n_estimators=200, random_state=42),
#    "GradientBoosting": GradientBoostingClassifier(random_state=42),
#    "LogisticRegression": LogisticRegression(random_state=42),
#    "SGDClassifier": SGDClassifier(loss='log', random_state=42),
#    "SVC": SVC(probability=True, random_state=42),
#    "GaussianNB": GaussianNB(),
#    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
#    "MLPClassifier": MLPClassifier(random_state=42),
#    "AdaBoostClassifier": AdaBoostClassifier(random_state=42),
#    "BaggingClassifier": BaggingClassifier(random_state=42),
#    "LinearSVC": CalibratedClassifierCV(LinearSVC(random_state=42)), # Wrapped with CalibratedClassifierCV
#    "NuSVC": NuSVC(probability=True, random_state=42),
#    "BernoulliNB": BernoulliNB(),
#    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
#    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
#    "GaussianProcessClassifier": GaussianProcessClassifier(random_state=42), #take too much time
#    "LabelPropagation": LabelPropagation(),
#    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),

}

# Initialize Metrics Data Dictionary
metrics_data = {}

# Running 5-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Column labels, in the order perf_measure returns its values:
# (TP, FP, TN, FN, sens, spec, acc, auc1, auprc, F1, kappa, mcc)
metric_columns = ["TP", "FP", "TN", "FN", "Sens", "Spec", "Accuracy", "AUC", "AUPRC", "F1", "Kappa", "MCC"]

# Looping Over Classifiers and Features
for classifier_name, classifier in classifiers.items():
    print(classifier_name)
    test_model = []
    validation_model = []
    for i in range(x_train.shape[1]):
        # Extracting single feature from training data as an (n_samples, 1) array
        X_single_feature = x_train.iloc[:, i].values.reshape(-1, 1)

        test_metrics = []
        # 5 fold-cv
        for train_index, test_index in skf.split(X_single_feature, y_train):
            X_fold_train, X_fold_test = X_single_feature[train_index], X_single_feature[test_index]
            # skf.split yields positions, so select labels positionally (.iloc);
            # plain [] would look the numbers up as index labels.
            y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]
            classifier.fit(X_fold_train, y_fold_train)

            # Predict on the held-out fold; column 1 holds the positive-class probability
            y_pred_test = classifier.predict_proba(X_fold_test)
            metrics_test = perf_measure(y_fold_test.to_numpy(), y_pred_test[:, 1])
            test_metrics.append(metrics_test)

        # Taking mean of 5-fold testing data
        test_df = pd.DataFrame(test_metrics).mean(axis=0)
        test_model.append(np.array(test_df))

        # Refit on the complete training column before scoring the validation
        # set; otherwise the predictions would come from the model left over
        # from the last CV fold, which saw only part of the training data.
        classifier.fit(X_single_feature, y_train)

        # Extracting single feature from validation data
        X_single_feature_validation = x_validation.iloc[:, i].values.reshape(-1, 1)
        y_pred_validation = classifier.predict_proba(X_single_feature_validation)
        metrics_validation = perf_measure(y_validation.to_numpy(), y_pred_validation[:, 1])
        validation_model.append(metrics_validation)

    # Create DataFrames (one row per feature) for the CV-averaged and validation metrics
    test_metrics_df = pd.DataFrame(np.array(test_model),
        columns=metric_columns,
        index=list(x_train.columns))

    validation_metrics_df = pd.DataFrame(np.array(validation_model),
        columns=metric_columns,
        index=list(x_train.columns))

    # Store metrics DataFrames in a dictionary
    metrics_data[classifier_name] = {
        "Testing": test_metrics_df,
        "Validation": validation_metrics_df
    }

    # Save the metrics to CSV, one sub-directory per classifier
    classifier_dir = os.path.join(metrics_data_dir, classifier_name)
    os.makedirs(classifier_dir, exist_ok=True)

    # Save CV-testing and validation metrics to CSV files
    test_metrics_df.to_csv(os.path.join(classifier_dir, "testing_metrics_all_feature.csv"))
    validation_metrics_df.to_csv(os.path.join(classifier_dir, "validation_metrics_all_feature.csv"))

print("Metrics data saved successfully!")
