In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, LeaveOneOut, cross_val_score

# Directory holding the derived feature-selection CSVs read below.
FEATURE_SELECTION_DIR = './data/derived/feature_selection'

# Feature set 1: the "*_with_dual" training / test tables.
train_features = pd.read_csv(f'{FEATURE_SELECTION_DIR}/training_data_with_dual.csv')
test_features = pd.read_csv(f'{FEATURE_SELECTION_DIR}/test_data_with_dual.csv')

# Feature set 2: the "*_adaptive_bspline" training / test tables.
train_features2 = pd.read_csv(f'{FEATURE_SELECTION_DIR}/training_data_adaptive_bspline.csv')
test_features2 = pd.read_csv(f'{FEATURE_SELECTION_DIR}/test_data_adaptive_bspline.csv')

# Row-wise stack of feature set 1 (training rows first, then test rows).
# The original row indices of both frames are kept, so index values may repeat.
combined = pd.concat([train_features, test_features], axis=0)

In [6]:
# Sweep a range of recency thresholds; for each one, label the samples, tune a
# random forest with a cross-validated grid search on the training set, and
# score the tuned model on the test set. The threshold with the highest
# test-set accuracy is reported together with its classification report and
# confusion matrix.
#
# Reads:  train_features, test_features -- both must contain 'TSI_days' and
#         every column listed in features_derived.
# Writes: a 'recency' column on both frames. It is overwritten on every
#         iteration, so after the loop it holds the labels for the LAST
#         threshold tried, not for best_threshold.
#         Also sets best_threshold, best_accuracy, best_model,
#         best_conf_matrix and best_class_report.
#
# NOTE(review): the threshold is selected by comparing accuracy on the test
# set, so best_accuracy is an optimistic estimate for the chosen threshold.
# Each threshold also defines a different labelling (and class balance), so
# accuracies are not strictly comparable across thresholds. Consider selecting
# on the cross-validated score (grid_search.best_score_) instead -- confirm
# with the analysis owner before changing.

# Feature columns used as model inputs.
features_derived = ['genome_lrtt', 'genome_maf12c', 'genome_maf3c', 'genome_tips', 'genome_ambig', 
                    'gag_lrtt', 'gag_maf12c', 'gag_maf3c', 'gag_tips', 'gag_ambig',
                    'pol_lrtt', 'pol_maf12c', 'pol_maf3c', 'pol_ambig',
                    'gp120_lrtt', 'gp120_maf12c', 'gp120_maf3c', 'gp120_tips', 'gp120_ambig',
                    'gp41_maf12c', 'gp41_maf3c', 'gp41_ambig']

# Hyperparameter grid explored by GridSearchCV (3**5 = 243 combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# One GridSearchCV object is reused for every threshold; each .fit() call
# re-runs the full 5-fold search on the current labels.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=StratifiedKFold(n_splits=5),
                           scoring='accuracy',
                           n_jobs=-1)

# Recency thresholds in days: 182, 212, ..., 722 (30-day steps), i.e. from
# roughly 6 months to just under 2 years.
recency_thresholds = np.arange(182, 730, 30)

# The labels produced below are always 0 or 1. Pinning the label set keeps the
# confusion matrix 2x2 (and aligned with the plot's tick labels) even if one
# class happens to be absent from the test labels and predictions.
# 1 = TSI_days <= threshold ("recent"), 0 = otherwise.
CLASS_LABELS = [0, 1]

# Start below any attainable accuracy so that a model is always selected, even
# if every threshold scores 0.0.
best_accuracy = -np.inf
best_threshold = None
best_model = None
best_conf_matrix = None
best_class_report = None

# Iterate through each recency threshold
for threshold in recency_thresholds:
    # Label the data for the current threshold. Rows with a missing TSI_days
    # compare False and are therefore labelled 0.
    train_features['recency'] = np.where(train_features['TSI_days'] <= threshold, 1, 0)
    test_features['recency'] = np.where(test_features['TSI_days'] <= threshold, 1, 0)

    # Features and target variable for training and testing sets
    X_train = train_features[features_derived]
    y_train = train_features['recency']
    X_test = test_features[features_derived]
    y_test = test_features['recency']

    # Tune hyperparameters by cross-validation on the training set only.
    grid_search.fit(X_train, y_train)

    # best_estimator_ is refit on the full training set with the best params.
    model = grid_search.best_estimator_

    # Score the tuned model on the held-out test set.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Strict '>' keeps the smallest threshold when accuracies tie.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold
        best_model = model
        best_conf_matrix = confusion_matrix(y_test, y_pred, labels=CLASS_LABELS)
        best_class_report = classification_report(y_test, y_pred)

# Print the best results
print(f"Best Recency Threshold (in days): {best_threshold}")
print(f"Best Accuracy: {best_accuracy}")
print("\nClassification Report with Best Recency Threshold:")
print(best_class_report)

# Plot the confusion matrix for the best model
disp = ConfusionMatrixDisplay(confusion_matrix=best_conf_matrix, display_labels=CLASS_LABELS)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title(f"Confusion matrix (recency threshold = {best_threshold} days)")
plt.show()