In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

report_df = pd.read_csv("./table_for_radiological_report_paper.csv")
adj_df = pd.read_csv("./table_for_radiological_report_adj1.csv")

In [6]:
report_df["QFV_lobe"][0]

'[7.03339248e-02 8.32721038e-03 0.00000000e+00 5.78278218e-05\n 2.92980195e-04 1.54690741e-01 0.00000000e+00 0.00000000e+00\n 0.00000000e+00 0.00000000e+00 1.00000000e-02]'

In [4]:
selected_cases = report_df[(report_df['PCA'] == 1) | 
                           (report_df['cerebellar'] == 1) | 
                           (report_df['basilar'] == 1) | 
                           (report_df['Choroidal & Thalamoperfurating'] == 1) &
                           (report_df['MCA'] == 0) &
                            (report_df['basal ganglia'] == 0) &
                            (report_df['deep white matter'] == 0)]

# selected_cases = selected_cases[(selected_cases['MCA'] == 0)]
# selected_cases = selected_cases[(selected_cases['basal ganglia'] == 0)]
# selected_cases = selected_cases[(selected_cases['deep white matter'] == 0)]

visual_scores = selected_cases[['Subject_ID', 'occipital', 'thalamus', 'cerebellum', 'brainstem']]

adj_df['subject_id'] = adj_df['subject_id'].str.replace('sub-', '')

merged_df = pd.merge(visual_scores, adj_df[['subject_id', 'QFV_occipital', 'QFV_thalamus', 'QFV_cerebellar', 'QFV_brainstem', 'stroke_volume_logml']], left_on='Subject_ID', right_on='subject_id')
merged_df.drop(['subject_id'], axis=1, inplace=True)
merged_df['cerebellum'] = merged_df['cerebellum'].astype(int)

In [5]:
merged_df.head()

Unnamed: 0,Subject_ID,occipital,thalamus,cerebellum,brainstem,QFV_occipital,QFV_thalamus,QFV_cerebellar,QFV_brainstem,stroke_volume_logml
0,003aa3e8,0,1,0,0,0.0,0.036126,0.0,0.0,-0.577267
1,005605fd,0,0,1,1,0.002083,0.003473,0.014228,0.018774,0.935951
2,00a13eb6,0,0,1,0,0.0,0.0,0.010252,0.0,0.511115
3,032c9420,0,0,0,0,0.141,0.0304,0.001442,0.00249,1.18461
4,6dad7c8a,1,0,0,0,0.14,0.0825,0.003986,0.000226,1.189633


In [6]:
# Preprocess the data for each brain region
# Separate the features and target for each model
X_occipital = merged_df[['QFV_occipital', 'stroke_volume_logml']]
y_occipital = merged_df['occipital']

X_thalamus = merged_df[['QFV_thalamus', 'stroke_volume_logml']]
y_thalamus = merged_df['thalamus']

X_cerebellum = merged_df[['QFV_cerebellar', 'stroke_volume_logml']]
y_cerebellum = merged_df['cerebellum']

X_brainstem = merged_df[['QFV_brainstem', 'stroke_volume_logml']]
y_brainstem = merged_df['brainstem']


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Binarizer
import numpy as np

# Function to train and evaluate models
def train_evaluate_models(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Initialize models
    models = {
        "LDA": LinearDiscriminantAnalysis(),
        "QDA": QuadraticDiscriminantAnalysis(),
        "RF": RandomForestClassifier(random_state=42),
        "KNN": KNeighborsClassifier(),
        "SVM": SVC(random_state=42),
        "MLP": MLPClassifier(random_state=42, max_iter=1000)
    }

    # Train and evaluate each model
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred)
        }

    # Binary Threshold (BT) Model
    binarizer = Binarizer(threshold=0.5)
    y_pred_bt = binarizer.fit_transform(X_test.mean(axis=1).values.reshape(-1, 1)).ravel()
    results["BT"] = {
        "Accuracy": accuracy_score(y_test, y_pred_bt),
        "Precision": precision_score(y_test, y_pred_bt),
        "Recall": recall_score(y_test, y_pred_bt),
        "F1": f1_score(y_test, y_pred_bt)
    }

    return results

In [24]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

def train_evaluate_models_cv(X, y):
    n_splits = 5
    n_iterations = 10

    # Initialize models
    models = {
        "LDA": LinearDiscriminantAnalysis(),
        "QDA": QuadraticDiscriminantAnalysis(),
        "RF": RandomForestClassifier(random_state=42),
        "KNN": KNeighborsClassifier(),
        "SVM": SVC(random_state=42),
        "MLP": MLPClassifier(random_state=42, max_iter=1000),
        # Note: BT will be handled separately
    }

    # Dictionary to store results
    avg_results = {name: {"Accuracy": [], "Precision": [], "Recall": [], "F1": []} for name in models}
    avg_results["BT"] = {"Accuracy": [], "Precision": [], "Recall": [], "F1": []}

    for iteration in range(n_iterations):
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=iteration)

        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Train and evaluate each model
            for name, model in models.items():
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                avg_results[name]["Accuracy"].append(accuracy_score(y_test, y_pred))
                avg_results[name]["Precision"].append(precision_score(y_test, y_pred))
                avg_results[name]["Recall"].append(recall_score(y_test, y_pred))
                avg_results[name]["F1"].append(f1_score(y_test, y_pred))

            # Binary Threshold (BT) Model
            binarizer = Binarizer(threshold=0.5)
            y_pred_bt = binarizer.fit_transform(X_test.mean(axis=1).reshape(-1, 1)).ravel()
            
            avg_results["BT"]["Accuracy"].append(accuracy_score(y_test, y_pred_bt))
            avg_results["BT"]["Precision"].append(precision_score(y_test, y_pred_bt))
            avg_results["BT"]["Recall"].append(recall_score(y_test, y_pred_bt))
            avg_results["BT"]["F1"].append(f1_score(y_test, y_pred_bt))

    # Calculating the average over all iterations
    for model in avg_results:
        for metric in avg_results[model]:
            avg_results[model][metric] = np.mean(avg_results[model][metric])

    return avg_results


In [25]:
train_evaluate_models_cv(X_occipital.values, y_occipital.values)

{'LDA': {'Accuracy': 0.80788591983556,
  'Precision': 0.8284867512451348,
  'Recall': 0.5233429951690822,
  'F1': 0.6375301622987792},
 'QDA': {'Accuracy': 0.8426896197327853,
  'Precision': 0.9295611669421971,
  'Recall': 0.5632077294685991,
  'F1': 0.6973132881595334},
 'RF': {'Accuracy': 0.850146968139774,
  'Precision': 0.7702977683059311,
  'Recall': 0.7762898550724637,
  'F1': 0.7714771901707155},
 'KNN': {'Accuracy': 0.8547307297019527,
  'Precision': 0.8608966864178744,
  'Recall': 0.6644830917874397,
  'F1': 0.7475526033232728},
 'SVM': {'Accuracy': 0.8404059609455292,
  'Precision': 0.9358763992106417,
  'Recall': 0.5501449275362319,
  'F1': 0.6893167034951533},
 'MLP': {'Accuracy': 0.8859486125385406,
  'Precision': 0.9140237972757926,
  'Recall': 0.7197004830917874,
  'F1': 0.8031895246349144},
 'BT': {'Accuracy': 0.7865375128468653,
  'Precision': 0.6701493337050181,
  'Recall': 0.6887149758454106,
  'F1': 0.6779454095205348}}