In [None]:
%pip install scikit-learn
%pip install torch
%pip install pandas
%pip install matplotlib

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd

In [None]:
def load_data(file_path):
    """
    Read a CSV file and split it into sequence and structure lists.

    Args:
    - file_path (str): Location of the CSV file. It must provide the columns
      'seq' and 'sst3'; any other columns are ignored.

    Returns:
    - sequences (list): Values of the 'seq' column (amino acid sequences),
      in file order.
    - structures (list): Values of the 'sst3' column (secondary structures),
      in file order.
    """
    table = pd.read_csv(file_path)

    # Pull both columns out in a fixed order: sequences first, then structures.
    sequences, structures = (table[column].tolist() for column in ('seq', 'sst3'))

    return sequences, structures

AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'


def one_hot_encode(sequence):
    """
    Turn an amino acid sequence into a one-hot matrix.

    Row i describes sequence[i]; the column of the single 1 in that row is the
    position of the residue letter inside AMINO_ACIDS.

    Args:
    - sequence (str): The amino acid sequence.

    Returns:
    - np.ndarray: Integer matrix of shape (len(sequence), len(AMINO_ACIDS)).

    Raises:
    - ValueError: If the sequence holds a character that is not in AMINO_ACIDS.
    """
    encoded = np.zeros((len(sequence), len(AMINO_ACIDS)), dtype=int)

    for row, residue in enumerate(sequence):
        # str.find returns -1 for letters outside the alphabet.
        column = AMINO_ACIDS.find(residue)
        if column < 0:
            raise ValueError(f"Unknown amino acid {residue} in sequence.")
        encoded[row, column] = 1

    return encoded

def extract_features_and_labels(sequences, structures, window_size=15):
    """
    Build flattened one-hot window features and per-residue labels.

    Every sequence is padded on both sides with window_size // 2 'X'
    characters and one window of window_size characters is taken per residue,
    with that residue sitting at offset window_size // 2 inside the window.
    Windows containing an 'X' are skipped, so residues too close to either end
    of a sequence for a full window produce no sample, and neither does any
    window that covers an 'X' already present in the sequence itself.

    Args:
    - sequences (list of str): List of amino acid sequences.
    - structures (list of str): List of corresponding secondary structures
      (sst3); structures[k][i] is the label of sequences[k][i].
    - window_size (int): Size of the sliding window (default is 15).

    Returns:
    - X (np.ndarray): 2D integer array of shape
      (n_samples, window_size * len(AMINO_ACIDS)), one flattened one-hot
      window per row. The array is 2D even when n_samples is 0.
    - y (np.ndarray): 1D array holding the n_samples labels.

    Raises:
    - ValueError: Propagated from one_hot_encode when a kept window holds a
      character that is not in AMINO_ACIDS.
    """
    X, y = [], []
    half_window = window_size // 2
    padding = 'X' * half_window  # Same for every sequence, so build it once

    for sequence, structure in zip(sequences, structures):
        padded_sequence = padding + sequence + padding
        for i in range(len(sequence)):
            window = padded_sequence[i:i + window_size]
            if 'X' in window:
                # Incomplete window (touches the padding) or ambiguous residue.
                continue
            X.append(one_hot_encode(window).flatten())  # Flatten to 1D
            y.append(structure[i])  # Label of the residue the window is built around

    if not X:
        # np.array([]) would have shape (0,); return an empty matrix with the
        # regular feature width instead so the result is always 2D.
        n_features = max(window_size, 0) * len(AMINO_ACIDS)
        return np.empty((0, n_features), dtype=int), np.array(y)

    return np.array(X), np.array(y)

In [None]:
# --- Data ---------------------------------------------------------------
train_file, test_file = 'training_data.csv', 'test_data.csv'

# Sequences and their secondary-structure strings for both splits
train_sequences, train_structures = load_data(train_file)
test_sequences, test_structures = load_data(test_file)

# Sliding-window one-hot features (default window size) and per-residue labels
X_train, y_train = extract_features_and_labels(train_sequences, train_structures)
X_test, y_test = extract_features_and_labels(test_sequences, test_structures)

# --- Model --------------------------------------------------------------
# Hyperparameters handed to SVC as C and gamma
c, gam = 0.1, 'scale'

# RBF-kernel SVM trained directly on the extracted training windows
svm = SVC(kernel='rbf', C=c, gamma=gam)
svm.fit(X_train, y_train)

# --- Evaluation ---------------------------------------------------------
y_pred = svm.predict(X_test)

# Overall accuracy followed by the per-class precision/recall/F1 report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.805277339976613
              precision    recall  f1-score   support

           C       0.76      0.83      0.79      8064
           E       0.85      0.25      0.38      2108
           H       0.84      0.91      0.87      9497

    accuracy                           0.81     19669
   macro avg       0.82      0.66      0.68     19669
weighted avg       0.81      0.81      0.79     19669

