# Gaussian Process Classifier

A simple Gaussian process classifier (RBF kernel) evaluated with 5-fold stratified cross-validation and per-class precision, recall, and specificity.


In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.metrics import specificity_score

# Plot styling for every seaborn/matplotlib figure drawn below.
sns.set_style('darkgrid')

# Seed NumPy's global random generator so repeated runs are reproducible.
np.random.seed(42)


In [None]:
# Metric functions
def _precision_for(label, y_true, y_pred):
    """Return the precision of the single class *label*.

    ``average=None`` makes ``precision_score`` return one value per entry
    in ``labels``; with one label, element 0 is that class's score.
    ``zero_division=0`` is passed through unchanged from the original calls.
    """
    per_class = precision_score(
        y_true, y_pred, labels=[label], average=None, zero_division=0
    )
    return per_class[0]


def prec_DNA(y_true, y_pred):
    """Precision for the "DNA" class."""
    return _precision_for("DNA", y_true, y_pred)


def prec_RNA(y_true, y_pred):
    """Precision for the "RNA" class."""
    return _precision_for("RNA", y_true, y_pred)


def prec_DRNA(y_true, y_pred):
    """Precision for the "DRNA" class."""
    return _precision_for("DRNA", y_true, y_pred)


def prec_nonDRNA(y_true, y_pred):
    """Precision for the "nonDRNA" class."""
    return _precision_for("nonDRNA", y_true, y_pred)

def _recall_for(label, y_true, y_pred):
    """Return the recall of the single class *label*.

    ``average=None`` makes ``recall_score`` return one value per entry in
    ``labels``; with one label, element 0 is that class's score.
    ``zero_division=0`` is passed through unchanged from the original calls.
    """
    per_class = recall_score(
        y_true, y_pred, labels=[label], average=None, zero_division=0
    )
    return per_class[0]


def rec_DNA(y_true, y_pred):
    """Recall for the "DNA" class."""
    return _recall_for("DNA", y_true, y_pred)


def rec_RNA(y_true, y_pred):
    """Recall for the "RNA" class."""
    return _recall_for("RNA", y_true, y_pred)


def rec_DRNA(y_true, y_pred):
    """Recall for the "DRNA" class."""
    return _recall_for("DRNA", y_true, y_pred)


def rec_nonDRNA(y_true, y_pred):
    """Recall for the "nonDRNA" class."""
    return _recall_for("nonDRNA", y_true, y_pred)

def _specificity_for(label, y_true, y_pred):
    """Return the specificity of the single class *label*.

    Uses imbalanced-learn's ``specificity_score`` with ``average=None`` and
    a one-element ``labels`` list, then takes element 0 of the result.
    """
    per_class = specificity_score(y_true, y_pred, average=None, labels=[label])
    return per_class[0]


def spe_DNA(y_true, y_pred):
    """Specificity for the "DNA" class."""
    return _specificity_for('DNA', y_true, y_pred)


def spe_RNA(y_true, y_pred):
    """Specificity for the "RNA" class."""
    return _specificity_for('RNA', y_true, y_pred)


def spe_DRNA(y_true, y_pred):
    """Specificity for the "DRNA" class."""
    return _specificity_for('DRNA', y_true, y_pred)


def spe_nonDRNA(y_true, y_pred):
    """Specificity for the "nonDRNA" class."""
    return _specificity_for('nonDRNA', y_true, y_pred)


In [None]:
# Load the training table: the 'class' column is the target and every other
# column is used as a feature.
data = pd.read_csv('../training_dataset.csv')
y = data['class']
X = data.drop(columns='class')

# Standardise the features (fit, then transform the same matrix).
# NOTE: the scaler is fitted on every row of X, so X_scaled carries
# statistics from all samples, including any held out later.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Initialize Gaussian Process Classifier
from sklearn.pipeline import make_pipeline

kernel = RBF(length_scale=1.0)
gpc = GaussianProcessClassifier(kernel=kernel, random_state=42)

# Put the scaler in a pipeline with the classifier so that, inside
# cross-validation, the scaling statistics are re-estimated from each
# training fold only. Scaling the whole dataset up front would let the
# held-out fold influence the features the model is trained on.
gpc_pipeline = make_pipeline(StandardScaler(), gpc)

# 5-fold stratified cross-validation on the raw features; every sample gets
# a prediction from the model that did not see it during fitting.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(gpc_pipeline, X, y, cv=skf)


In [None]:
# Per-class metric table.
classes = ['DNA', 'RNA', 'DRNA', 'nonDRNA']

# Lookup tables from class label to the matching metric function.
prec_funcs = dict(zip(classes, (prec_DNA, prec_RNA, prec_DRNA, prec_nonDRNA)))
rec_funcs = dict(zip(classes, (rec_DNA, rec_RNA, rec_DRNA, rec_nonDRNA)))
spe_funcs = dict(zip(classes, (spe_DNA, spe_RNA, spe_DRNA, spe_nonDRNA)))

# One entry per class, each scored against the cross-validated predictions.
metrics = {
    label: {
        'Precision': prec_funcs[label](y, y_pred),
        'Recall': rec_funcs[label](y, y_pred),
        'Specificity': spe_funcs[label](y, y_pred),
    }
    for label in classes
}

# Transpose so rows are classes and columns are metrics.
results_df = pd.DataFrame(metrics).transpose()
results_df


In [None]:
# Confusion Matrix: raw counts on the left, row-normalised rates on the right.
cm = confusion_matrix(y, y_pred, labels=classes)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes, ax=ax1)
ax1.set_title('Confusion Matrix')
ax1.set_xlabel('Predicted')
ax1.set_ylabel('True')

# Divide each row by its number of true samples. A class listed in `classes`
# but absent from `y` has a zero row total; the `where` mask skips that
# division and leaves the row at 0 instead of producing NaN and a warning.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
sns.heatmap(cm_norm, annot=True, fmt='.2%', cmap='Blues', xticklabels=classes, yticklabels=classes, ax=ax2)
ax2.set_title('Normalized Confusion Matrix')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('True')

plt.tight_layout()
plt.show()


In [None]:
# Metrics visualization: one bar chart per metric, one bar per class.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
metrics_list = ['Precision', 'Recall', 'Specificity']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

for panel, metric_name, bar_color in zip(axes, metrics_list, colors):
    scores = results_df[metric_name]
    panel.bar(classes, scores, color=bar_color, edgecolor='black', alpha=0.7)
    panel.set_title(metric_name)
    # Leave headroom above 1.0 for the value labels.
    panel.set_ylim([0, 1.1])
    # Dashed red line marks the mean of this metric across classes.
    panel.axhline(y=scores.mean(), color='red', linestyle='--', linewidth=1)
    # Print each bar's value just above it.
    for x_pos, label in enumerate(classes):
        value = results_df.loc[label, metric_name]
        panel.text(x_pos, value + 0.02, f'{value:.3f}',
                   ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()
