In [None]:
!pip install --quiet pandas matplotlib scikit-optimize xgboost

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut
from skopt import gp_minimize
from skopt.space import Categorical
from skopt.plots import plot_convergence
from xgboost import XGBClassifier
import heapq
import csv

# Read the full table from the CSV file on disk
file_path = 'TestTrainSet.csv'
data = pd.read_csv(file_path)

# Remove the 'Protein' column when it exists (it is not used as a feature);
# errors='ignore' makes this a no-op if the column is absent
data_clean = data.drop(columns='Protein', errors='ignore')

# Classification target: the 'Cyclical' column
y = data_clean.loc[:, 'Cyclical']

# Option lists for the search (plain strings only). Each inner list is one
# group: 'Exclude' plus one or more alternatives. Options containing ' | '
# stand for two columns and are expanded through `string_to_tuple`.
feature_groups = [
    # Single-column groups named after amino acids: include or exclude.
    ['Exclude', amino_acid]
    for amino_acid in (
        'Glycine - G',
        'Alanine - A',
        'Leucine - L',
        'Methionine - M',
        'Phenylalanine - F',
        'Tryptophan - W',
        'Lysine - K',
        'Glutamine - Q',
        'Glutamic Acid - E',
        'Serine - S',
        'Proline - P',
        'Valine - V',
        'Isoleucine - I',
        'Cysteine - C',
        'Tyrosine - Y',
        'Histidine - H',
        'Arginine - R',
        'Asparagine - N',
        'Aspartic Acid - D',
        'Threonine - T',
    )
] + [
    # Groups offering several mutually exclusive alternatives.
    [
        'Exclude',
        'Charged | Uncharged',
        'Charge Score',
        'Positive charge | Negative charge',
        'Pos-neg charge ratio',
    ],
    [
        'Exclude',
        'Polar | Non-polar',
        'Polarity Score',
        'Hydrophobicity Score',
        'Local hydrophobic density Score',
        'Number of apolar alpha sphere',
        'Proportion of apolar alpha sphere',
        'Hydrophobic',
        'Mean alpha-sphere solvent acc',
    ],
    ['Exclude', 'Drug Score'],
    [
        'Exclude',
        'Small',
        'Number of alpha spheres | Mean alpha-sphere radius',
        'Pocket volume (Monte Carlo)',
        'Pocket volume (Convex hull)',
    ],
    ['Exclude', 'Amino Acid based volume Score'],
    ['Exclude', 'Aromatic'],
    ['Exclude', 'Number of residues'],
]

# Maps each combined option ("A | B") to the pair of column names it stands
# for; the pair is exactly the option text split on the ' | ' separator.
string_to_tuple = {
    combined: tuple(combined.split(' | '))
    for combined in (
        'Charged | Uncharged',
        'Positive charge | Negative charge',
        'Polar | Non-polar',
        'Number of alpha spheres | Mean alpha-sphere radius',
    )
}

# Per-sample weights, computed once up front: every row is weighted by the
# inverse of its class's relative frequency in `y`.
class_freq = y.value_counts(normalize=True).to_dict()
weight_map = {label: 1 / class_freq[label] for label in class_freq}
sample_weights = y.map(weight_map)

# Shared leave-one-out splitter used by the cross-validation routine
loo = LeaveOneOut()

# Expand a chosen combination to explicit feature names

def expand_features(feature_combination):
    """Translate one chosen option per group into explicit column names.

    Args:
        feature_combination: iterable of option strings, one per group.

    Returns:
        A new list of column names, in the order of the given options.
        'Exclude' options contribute nothing; options present in
        ``string_to_tuple`` contribute every name of their tuple; any other
        option contributes itself.
    """
    columns = []
    for choice in feature_combination:
        if choice != 'Exclude':
            # Combined options expand to their tuple; plain options are
            # treated as a one-element tuple of themselves.
            columns.extend(string_to_tuple.get(choice, (choice,)))
    return columns

# Build the XGB model

def build_xgb():
    """Return a new, unfitted XGBClassifier with the fixed settings below.

    A fresh instance is created on every call, so callers can fit it
    without affecting any other model.
    """
    settings = dict(
        # Histogram tree construction on a CUDA device
        tree_method='hist',
        device='cuda',
        # Evaluation metric and regularisation strengths
        eval_metric='logloss',
        reg_alpha=0.1,
        reg_lambda=1.0,
        # Fixed seed and silenced library output
        random_state=42,
        verbosity=0,
    )
    return XGBClassifier(**settings)

# LOO-CV accuracy for a feature set

def loo_cv(X, y, sw):
    """Compute leave-one-out cross-validated accuracy.

    For every split produced by the module-level ``loo`` splitter, a fresh
    model from ``build_xgb()`` is fitted on the training rows (with their
    sample weights) and asked to predict the single held-out row.

    Args:
        X: feature table, indexed positionally via ``.iloc``.
        y: target values aligned with ``X``.
        sw: per-sample weights aligned with ``X``.

    Returns:
        Number of correctly predicted held-out rows divided by ``len(y)``.
    """
    hits = 0
    for train_rows, held_out in loo.split(X):
        clf = build_xgb()
        clf.fit(
            X.iloc[train_rows],
            y.iloc[train_rows],
            sample_weight=sw.iloc[train_rows],
            verbose=False,
        )
        # Each split holds out exactly one row, so compare the first (only)
        # prediction with the first (only) true label.
        predicted = clf.predict(X.iloc[held_out])[0]
        actual = y.iloc[held_out].values[0]
        if predicted == actual:
            hits += 1
    return hits / len(y)

# One categorical dimension per feature group, named group_0, group_1, ...
search_space = [
    Categorical(options, name=f'group_{index}')
    for index, options in enumerate(feature_groups)
]

# Mutable state shared with the objective and the early-stopping callback
iteration_counter = 0         # number of objective evaluations so far
all_results = []              # (accuracy, features) for every evaluation
best_so_far = []              # running best accuracy after each evaluation
val_scores = []               # values recorded by the early-stopping callback

# Start a fresh CSV log containing only the header row
with open('feature_combination_log.csv', mode='w', newline='') as log_file:
    header_writer = csv.writer(log_file)
    header_writer.writerow(['Iteration', 'Accuracy', 'Selected Features'])

# Objective for gp_minimize

def objective(choice_vector):
    """Evaluate one feature-group selection for ``gp_minimize``.

    Expands the chosen options into column names, scores them with
    leave-one-out CV (or 0.0 when nothing is selected), records the result
    in ``all_results``, ``best_so_far`` and the CSV log, and prints a
    progress line.

    Args:
        choice_vector: one chosen option per search dimension.

    Returns:
        The negated accuracy, because the optimiser minimises.
    """
    global iteration_counter
    iteration_counter += 1

    selected_features = expand_features(choice_vector)
    if selected_features:
        score = loo_cv(data_clean[selected_features], y, sample_weights)
    else:
        # Nothing to train on: report it and score the selection as zero.
        print(f"Iteration {iteration_counter}: no features selected.")
        score = 0.0

    all_results.append((score, selected_features))
    with open('feature_combination_log.csv', mode='a', newline='') as log_file:
        csv.writer(log_file).writerow([iteration_counter, score, selected_features])

    # The running best starts at the first score and never decreases.
    previous_best = best_so_far[-1] if best_so_far else score
    best_so_far.append(max(previous_best, score))

    print(f"Iteration {iteration_counter}: accuracy={score:.4f}, features={selected_features}")
    return -score

# Early-stopping settings: size of the look-back window (in callback calls)
# and the smallest step between consecutive values that counts as a change
patience = 100
threshold = 1e-4


def early_stop(res):
    """Callback for ``gp_minimize``: stop once progress has stalled.

    Appends ``-res.fun`` to ``val_scores`` on every call. ``res.fun`` is the
    optimiser's best (lowest) objective value so far, i.e. the negated best
    accuracy. Once more than ``patience`` values exist, the last
    ``patience`` consecutive steps are inspected; if every step is smaller
    than ``threshold`` in absolute value, a message is printed and True is
    returned to halt the optimisation.

    Args:
        res: the optimiser's current result object (must expose ``fun``).

    Returns:
        True to stop the optimisation, False to continue.
    """
    val_scores.append(-res.fun)
    if len(val_scores) > patience:
        # patience + 1 values give exactly `patience` consecutive steps.
        window = val_scores[-(patience + 1):]
        largest_step = max(
            abs(later - earlier) for earlier, later in zip(window, window[1:])
        )
        if largest_step < threshold:
            print(f"Early stopping at iteration {len(val_scores)}")
            return True
    return False

# Run the Bayesian optimisation over the categorical search space
gp_result = gp_minimize(
    objective,
    search_space,
    n_calls=500,
    acq_func='LCB',
    kappa=10.0,
    callback=[early_stop],
    random_state=42,
    verbose=True,
)

# Ten highest-accuracy evaluations, best first (ties keep evaluation order)
top10 = sorted(all_results, key=lambda entry: entry[0], reverse=True)[:10]
print("\nTop 10 feature combinations:")
for rank, (score, names) in enumerate(top10, 1):
    print(f"{rank}. accuracy={score:.4f}, features={names}")

# Best point reported by the optimiser, expanded to explicit column names
best_combo = gp_result.x
best_features = expand_features(best_combo)
print("\nBest feature combination:", best_features)
# The objective returns negated accuracy, so flip the sign back
print("Best accuracy:", -gp_result.fun)

# Convergence plot provided by scikit-optimize
plot_convergence(gp_result)
plt.show()

# Running best accuracy recorded by the objective, one point per evaluation
plt.figure(figsize=(8, 5))
plt.plot(best_so_far)
plt.title('Best accuracy over iterations')
plt.xlabel('Iteration')
plt.ylabel('Best accuracy so far')
plt.tight_layout()
plt.show()
