In [None]:
import pandas as pd
import numpy as np
import warnings
import mrmr
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [None]:
# Names of the feature columns, in the order they appear in data//X.csv
feature_names =  ['age', 'rank', 'height', 'weight', 'bmi', 'bp_s', 'bp_d', 'bp', 'map', 'smoker', 'alcohol', 'faam', 'eilp', 'chronicity', 'ttp', 'ttd', 'ttt', 'wait time', 'time_dg', 'co_morb', 'prior_injuries', 'prior_surgery', 'prior_courses']

# A candidate whose absolute correlation with any already-kept feature exceeds
# this value is treated as redundant and dropped.
CORR_THRESHOLD = 0.5

# Load the feature matrix (the CSV has no header row) and label its columns
X = pd.read_csv('data//X.csv', header=None)
X.columns = feature_names

# Load the target labels (also no header row) into a one-column DataFrame
y = pd.read_csv('data//y.csv', header=None)
y.columns = ['labels']

# Rank the features with mRMR, asking for as many as there are columns.
selected_features = mrmr.mrmr_classif(X, y.iloc[:, 0].values, K=X.shape[1])

# mrmr_classif can return fewer than K names, so never assume its length
# equals X.shape[1]; fail loudly if nothing at all was selected.
if not selected_features:
    raise ValueError("mrmr_classif returned no features; cannot build X_")

# Absolute pairwise correlations (pandas default method), computed once for
# the ranked features instead of rebuilding a correlation matrix per candidate.
abs_corr = X[selected_features].corr().abs()

# Walk the ranking in order and keep a feature only if it is not strongly
# correlated with any feature kept so far. The top-ranked feature is always
# kept because there is nothing to compare it against yet.
valid_features = []
for candidate in selected_features:
    if valid_features and (abs_corr.loc[valid_features, candidate] > CORR_THRESHOLD).any():
        # Redundant with an already-kept feature: skip it
        continue
    valid_features.append(candidate)

# Feature matrix restricted to the retained features (copied so later edits
# to X_ do not touch X)
X_ = X[valid_features].copy()

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# --- Hyperparameter search spaces, one per classifier -----------------------
param_grid_logistic = {
    'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

param_grid_svc = {
    'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

param_grid_rf = {
    'n_estimators': [10, 20, 50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [1, 2]
}

# No search space is defined for LinearDiscriminantAnalysis; the search below
# therefore only evaluates its default configuration.
param_grid_lda = {}

param_grid_xgb = {
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
    'max_depth': [1, 2],
    'n_estimators': [10, 20, 50, 100],
    'booster': ['gbtree', 'gblinear', 'dart']
}

# --- Base estimators --------------------------------------------------------
# The first three handle class imbalance through class_weight='balanced';
# the last two are imblearn pipelines that oversample with SMOTE first.
models = [LogisticRegression(class_weight='balanced'),
          SVC(probability=True, max_iter=100,class_weight='balanced'),
          RandomForestClassifier(class_weight='balanced',warm_start=True),
          Pipeline([('smote', SMOTE(sampling_strategy='auto', random_state=42)), ('clf', LinearDiscriminantAnalysis())]),
          Pipeline([('smote', SMOTE(sampling_strategy='auto', random_state=42)), ('clf', XGBClassifier())])]

model_names = ['Logistic Regression',
               'SVM',
               'Random Forest',
               'Linear Discriminant Analysis',
               'XGBoost']

# Search spaces in the same order as `models` / `model_names`
param_grids = [param_grid_logistic, param_grid_svc, param_grid_rf, param_grid_lda, param_grid_xgb]

# zip() silently truncates to the shortest input, so guard the parallel lists.
assert len(models) == len(model_names) == len(param_grids), \
    "models, model_names and param_grids must have the same length"

# Settings shared by every randomized search
search_kwargs = dict(scoring='roc_auc', n_iter=10, random_state=42, cv=4)

# --- Wrap each estimator in a randomized hyperparameter search --------------
tuned_models = []
for model, param_grid in zip(models, param_grids):
    if isinstance(model, Pipeline):
        # Keep the pipeline's first step (expected to be 'smote') and replace
        # its final step with a search over that step's estimator.
        # NOTE(review): SMOTE runs before the search, so the search's internal
        # CV folds are drawn from already-resampled data — confirm this is
        # intended rather than searching over the whole pipeline.
        tuned_model = Pipeline([
            ('smote', model.steps[0][1]),
            ('clf', RandomizedSearchCV(model.steps[-1][1], param_distributions=param_grid, **search_kwargs))
        ])
    else:
        # Plain estimator: wrap it directly
        tuned_model = RandomizedSearchCV(model, param_distributions=param_grid, **search_kwargs)

    tuned_models.append(tuned_model)

# Display name -> tuned (not yet fitted) estimator
model_dict = dict(zip(model_names, tuned_models))

In [None]:
from interrogation_analysis import ConceptDriftAnalysis

In [None]:
# Seed and spread of the Gaussian noise added to the features. A dedicated,
# seeded generator makes the perturbation identical on every Restart & Run All
# (the original drew from the unseeded global NumPy RNG).
NOISE_SEED = 42
NOISE_SCALE = 0.05

noise_rng = np.random.default_rng(NOISE_SEED)
noise = noise_rng.normal(loc=0, scale=NOISE_SCALE, size=X_.values.shape)
X_noisy = X_.values + noise

# Build the analyser from the noisy features and the labels.
# NOTE(review): any randomness inside ConceptDriftAnalysis itself is not
# controlled by NOISE_SEED — check whether it accepts its own seed.
drift_analyser = ConceptDriftAnalysis(X_noisy, y['labels'].values, f1_target=1, n1_target=1, pop_size=40, n_gen=10)

# After construction, point the analyser's X back at the noise-free features
# and round its X_ attribute to zero decimals.
# NOTE(review): presumably X_ holds the generated dataset(s) and rounding
# restores integer-valued features — confirm against ConceptDriftAnalysis.
drift_analyser.X = X_.values
drift_analyser.X_ = np.round(drift_analyser.X_,0)

# Evaluate every tuned model; results are keyed by the model's display name.
drift_results = {
    name: drift_analyser.evaluate_concept_drift(model,n_splits=5,scoring='roc_auc')
    for name, model in model_dict.items()
}

In [None]:
# Build one two-line x-axis label per entry in the Logistic Regression results:
# the first line reports F1, the second N1, both rounded to two decimals.
# The "± std" suffix is appended to both lines only when std_f1 is non-zero.
xticks = []
for stats in drift_results['Logistic Regression'].values():
    f1_line = f"F1 = {np.round(stats['mean_f1'],2)}"
    n1_line = f"N1 = {np.round(stats['mean_n1'],2)}"
    if stats['std_f1'] != 0:
        f1_line += f" ± {np.round(stats['std_f1'],2)}"
        n1_line += f" ± {np.round(stats['std_n1'],2)}"
    xticks.append(f"{f1_line}\n{n1_line}")

In [None]:
# Figure size in inches
fig_width = 12
fig_height = 6

# Explicit figure/axes handles instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(fig_width, fig_height))

# One line style and marker per model; cycled if there are more models than
# styles so an extra model cannot raise IndexError.
line_styles = ['--', ':', '-.', '-', '-']
markers = ['o', 's', '^', 'D', 'v']

# Plot each model's mean score across the datasets, in result order
for i, (clf_name, per_dataset) in enumerate(drift_results.items()):
    mean_scores = [stats['mean_score'] for stats in per_dataset.values()]
    ax.plot(mean_scores,
            linestyle=line_styles[i % len(line_styles)],
            marker=markers[i % len(markers)],
            label=clf_name)

# Captions below the x-axis (axes-fraction coordinates): the first tick is the
# original dataset, the arrow spans the remaining ticks.
ax.text(0.04, -0.15, 'Original\nDataset', transform=ax.transAxes, ha='center', va='center', fontweight='bold')
ax.text(0.23, -0.15, 'Increasingly Complex\nDatasets', transform=ax.transAxes, ha='center', va='center', fontweight='bold')

ax.annotate('', xy=(0.96, -0.15), xycoords='axes fraction', xytext=(0.35, -0.15),
            arrowprops=dict(arrowstyle="->", color='k'))

ax.legend()

# Derive tick positions from the labels rather than hard-coding six of them,
# so the cell keeps working if the number of evaluated datasets changes.
ax.set_xticks(list(range(len(xticks))))
ax.set_xticklabels(xticks)