Load Data

In [None]:
import pandas as pd
import warnings
import mrmr
warnings.filterwarnings("ignore")

In [None]:
# Column names for the feature matrix, in the same order as the columns of X.csv
feature_names =  ['age', 'rank', 'height', 'weight', 'bmi', 'bp_s', 'bp_d', 'bp', 'map', 'smoker', 'alcohol', 'faam', 'eilp', 'chronicity', 'ttp', 'ttd', 'ttt', 'wait time', 'time_dg', 'co_morb', 'prior_injuries', 'prior_surgery', 'prior_courses']

# A candidate feature is rejected when its absolute correlation with any
# already-retained feature exceeds this value.
MAX_ABS_CORRELATION = 0.5

# Read the feature data (the file has no header row) and attach the column names
X = pd.read_csv('data//X.csv', header=None)
X.columns = feature_names

# Read the target variable (labels) and name its single column
y = pd.read_csv('data//y.csv', header=None)
y.columns = ['labels']

# Rank the features by minimum redundancy / maximum relevance, asking for as
# many features as there are columns.
selected_features = mrmr.mrmr_classif(X, y.iloc[:, 0].values, K=X.shape[1])

# Walk the ranking in order and keep a feature only if it is not strongly
# correlated with a feature that was kept before it.
# The loop iterates over the returned list itself rather than over
# range(X.shape[1]): indexing by column count raises IndexError whenever fewer
# than K names come back.
# NOTE(review): mrmr appears to drop features it scores as irrelevant, so the
# list can be shorter than K - confirm against the installed mrmr version.
valid_features = []
for candidate in selected_features:
    if valid_features:
        # Last column of the correlation matrix, minus its own diagonal entry:
        # the candidate's correlation with every retained feature.
        corr_with_kept = X[valid_features + [candidate]].corr().iloc[:-1, -1].abs()
        # NaN correlations compare as False, so they never reject a candidate.
        if (corr_with_kept > MAX_ABS_CORRELATION).any():
            continue
    valid_features.append(candidate)

if not valid_features:
    raise ValueError('mRMR returned no features; cannot build the reduced feature matrix')

# Create a new DataFrame (X_) containing only the retained features
X_ = X[valid_features].copy()

Load Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [None]:
# Single seed shared by every stochastic component so that a fresh
# "Restart & Run All" reproduces the same fitted models.
RANDOM_STATE = 42


def _smote_pipeline(classifier):
    """Return a pipeline that runs SMOTE over-sampling and then ``classifier``."""
    return Pipeline([('smote', SMOTE(sampling_strategy='auto', random_state=RANDOM_STATE)),
                     ('clf', classifier)])


# Display name -> estimator. Defining the pairs together (instead of two
# parallel lists that are zipped afterwards) means a name can never drift out
# of step with its model.
model_dict = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', C=0.25, penalty='l2', solver='liblinear'),
    'SVM': SVC(probability=True, max_iter=100, class_weight='balanced', C=0.45, gamma='scale', kernel='linear',
               random_state=RANDOM_STATE),
    'KNN': _smote_pipeline(KNeighborsClassifier(metric='manhattan', weights='uniform', n_neighbors=11, algorithm='brute')),
    # warm_start is deliberately NOT enabled: with it, re-fitting the same
    # instance without raising n_estimators keeps the previously grown trees
    # instead of training on the new data.
    # NOTE(review): this only matters if evaluate_competency re-fits this
    # instance rather than a clone - confirm.
    'Random Forest': RandomForestClassifier(class_weight='balanced', max_features='sqrt', n_estimators=20, max_depth=3,
                                            random_state=RANDOM_STATE),
    'Adaboost': _smote_pipeline(AdaBoostClassifier(random_state=RANDOM_STATE)),
    'Naive Bayes': _smote_pipeline(GaussianNB()),
    'Linear Discriminant Analysis': _smote_pipeline(LinearDiscriminantAnalysis()),
    'Quadratic Discriminant Analysis': _smote_pipeline(QuadraticDiscriminantAnalysis()),
    'Neural Network': _smote_pipeline(MLPClassifier(max_iter=10, random_state=RANDOM_STATE)),
    'XGBoost': _smote_pipeline(XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=20, booster='gblinear')),
}

# Kept for the cells that still refer to the separate lists; both follow the
# insertion order of model_dict.
model_names = list(model_dict)
models = list(model_dict.values())

Conduct Analysis

In [None]:
from interrogation_analysis import CompetencyAnalysis
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Build the analyser from an explicit settings mapping so the tunable values
# are visible in one place.
# NOTE(review): the meaning of n_datasets / pop_size / n_gen is defined by
# CompetencyAnalysis, which is not visible in this notebook.
analysis_settings = {'n_datasets': 200, 'pop_size': 30, 'n_gen': 10}
competency_analyser = CompetencyAnalysis(**analysis_settings)

# Hand the reduced feature matrix and the label column over as plain arrays.
feature_matrix = X_.values
label_vector = y['labels'].values
competency_analyser.prepare_analysis(feature_matrix, label_vector)

In [None]:
def compare_models(grid, x_coords, y_coords, model_labels=None, original_point=None):
    """Plot which model ranks 1st, 2nd and 3rd at every point of a score grid.

    Three side-by-side panels are drawn. In each panel every grid point is
    coloured by the model holding that rank there, where rank is decided by
    sorting the values along the last axis of ``grid`` (largest value = top
    rank). A red square marks the original dataset in every panel.

    Parameters
    ----------
    grid : numpy.ndarray
        Array indexed as ``[row, column, model]`` holding one value per grid
        point and model. At least three models are required.
    x_coords, y_coords : numpy.ndarray
        2-D coordinate arrays with the same row/column layout as ``grid``;
        plotted on the 'F1 Score' and 'N1 Score' axes respectively.
    model_labels : sequence of str, optional
        Legend labels, one per model layer and in layer order. Defaults to the
        notebook-level ``model_names``.
    original_point : tuple, optional
        ``(x, y)`` position of the original-dataset marker. Defaults to
        ``(competency_analyser.f1_score, competency_analyser.n1_score)``.
    """
    # Local import so the function does not depend on an extra import cell.
    from matplotlib.lines import Line2D

    if model_labels is None:
        model_labels = model_names
    if original_point is None:
        original_point = (competency_analyser.f1_score, competency_analyser.n1_score)
    original_x, original_y = original_point

    titles = ['Top Ranked Model', '2nd Ranked Model', '3rd Ranked Model']

    # Set the size of the figure
    fig, axes = plt.subplots(1, 3, figsize=(20, 7))

    # One colour per model; sized from the data so that more than ten models
    # do not index past the end of the palette.
    colors = sns.color_palette("pastel", max(grid.shape[2], len(model_labels)))

    # Sort once (ascending along the model axis); every panel reads a
    # different slice of the same ordering.
    ranked = np.argsort(grid, axis=2)
    x_flattened = x_coords.flatten()
    y_flattened = y_coords.flatten()

    # Loop through the top 3 ranked models
    for rank, title in enumerate(titles):
        # Model index holding this rank at each grid point
        indices_flattened = ranked[:, :, -rank - 1].flatten()
        c = [colors[i] for i in indices_flattened]

        # Grid points coloured by model, plus the original dataset marker
        sns.scatterplot(x=x_flattened, y=y_flattened, c=c, marker='o', ax=axes[rank])
        axes[rank].scatter(original_x, original_y, color='red', edgecolors='k', linewidths=1, marker='s', s=50, label='Original Dataset')

        axes[rank].set_title(title, fontsize=16)
        axes[rank].set_xlabel('F1 Score', fontsize=14)
        axes[rank].set_ylabel('N1 Score', fontsize=14)
        axes[rank].set_ylim(y_coords[0][0], y_coords[-1][-1])
        axes[rank].set_xlim(x_coords[0][0], x_coords[-1][-1])

    # Build the legend from proxy artists that carry no data. Creating the
    # handles with plt.scatter([0], [0], ...) would draw real markers at
    # (0, 0) on the current (last) axes.
    legend_handles = [
        Line2D([], [], linestyle='none', marker='o', color=colors[i], label=label)
        for i, label in enumerate(model_labels)
    ]
    legend_handles.append(
        Line2D([], [], linestyle='none', marker='s', markersize=50 ** 0.5,
               markerfacecolor='red', markeredgecolor='k', markeredgewidth=1,
               label='Original Dataset')
    )
    legend_labels = list(model_labels) + ['Original Dataset']

    # Add the legend to the figure
    fig.legend(handles=legend_handles, labels=legend_labels, loc='upper right', bbox_to_anchor=(1.17, 1), fontsize=12)

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Number of grid points along each axis of the interpolated score surface
GRID_RESOLUTION = 100
# Upper bound on the neighbours used by the KNN interpolation
MAX_NEIGHBOURS = 20

results = {}
for model_name, model in model_dict.items():
    print(f'-- Evaluating {model_name} --')
    competency_analyser.evaluate_competency(model=model)

    # One record per dataset held by the analyser after the evaluation
    records = [{'F1': v['F1'], 'N1': v['N1'], 'score': v['score']}
               for v in competency_analyser.datasets.values()]
    results[model_name] = {'scores': records}

    # Only records with no NaN in any of the three values can be used to fit
    # the interpolator.
    complete = [r for r in records
                if not np.isnan([r['F1'], r['N1'], r['score']]).any()]
    if not complete:
        raise ValueError(f'{model_name}: no dataset has finite F1, N1 and score values')

    # Grid bounds come from the finite F1/N1 values only. Taking min()/max()
    # over lists that still contain NaN can yield NaN bounds, which would turn
    # the whole grid into NaN.
    # NOTE(review): the grids of all models are stacked and plotted on the
    # last mesh below, which assumes every model sees the same F1/N1 values -
    # confirm.
    f1_finite = [r['F1'] for r in records if not np.isnan(r['F1'])]
    n1_finite = [r['N1'] for r in records if not np.isnan(r['N1'])]
    f1_range = np.linspace(min(f1_finite), max(f1_finite), GRID_RESOLUTION)
    n1_range = np.linspace(min(n1_finite), max(n1_finite), GRID_RESOLUTION)
    f1_mesh, n1_mesh = np.meshgrid(f1_range, n1_range)

    # Use k-nearest neighbors regression to predict the score at each grid
    # point; the neighbour count is capped at the number of usable samples so
    # that a small sample does not make predict() fail.
    knn_regressor = KNeighborsRegressor(n_neighbors=min(MAX_NEIGHBOURS, len(complete)), weights='distance')
    knn_regressor.fit(np.column_stack(([r['F1'] for r in complete], [r['N1'] for r in complete])),
                      [r['score'] for r in complete])

    # Predict on the flattened mesh, then restore the mesh shape
    score_grid = knn_regressor.predict(np.column_stack((f1_mesh.ravel(), n1_mesh.ravel())))
    results[model_name]['score_grid'] = score_grid.reshape(f1_mesh.shape)

# Stack one layer per evaluated model, in evaluation order; the depth follows
# the number of models instead of a hard-coded 10.
grid = np.zeros((GRID_RESOLUTION, GRID_RESOLUTION, len(results)))
for i, model_name in enumerate(results):
    grid[:, :, i] = results[model_name]['score_grid']

compare_models(grid, f1_mesh, n1_mesh)