In [None]:
# Binary Relevance (BR) Approach
# Labels whose training column contains a single class value are skipped,
# because a binary classifier needs both classes to learn from. In this run
# that is equivalent to skipping the labels that have no instances in the
# training dataset.

In [3]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, classification_report, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import numpy as np
from google.colab import files

In [4]:
# Upload the dataset through Colab's file-upload helper. The data-loading cell
# reads "all_data.csv" by relative path, so that is the file to upload here.
# NOTE(review): `f` is not referenced again in the visible cells.
f = files.upload()

Saving all_data.csv to all_data.csv


In [5]:
# Load the dataset and pull out the raw sequences (features) and the GO
# annotations (targets).
data = pd.read_csv("all_data.csv")

# Sequences are kept both as a Series (x) and as a plain Python list.
x = data["sequence"]
seq_list = x.to_list()

# Each "goa" cell is parsed from its string form into a list of label strings:
# surrounding brackets and single quotes are removed, the remainder is split
# on ", ", and empty tokens are dropped.
# NOTE(review): presumably cells look like "['GO:0000001', 'GO:0000002']" -
# confirm against the CSV.
y = data["goa"].apply(
    lambda raw: [token
                 for token in raw.strip("[]").replace("'", "").split(', ')
                 if token]
).values

# Encoding sequences
def compute_kmer_frequencies(sequence, k) -> dict:
    """Return the relative frequency of every length-``k`` window in ``sequence``.

    A window of width ``k`` is slid over ``sequence`` one position at a time.
    Each distinct window is counted and the counts are divided by the total
    number of windows, so the returned values sum to 1 when at least one
    window exists.

    Args:
        sequence: Sliceable sequence (e.g. a string) to scan.
        k: Window width.

    Returns:
        dict mapping each observed window to its share of all windows, in
        order of first appearance. Empty when ``sequence`` has fewer than
        ``k`` items.
    """
    last_start = len(sequence) - k
    occurrences = {}
    for start in range(last_start + 1):
        window = sequence[start:start + k]
        occurrences.setdefault(window, 0)
        occurrences[window] += 1

    n_windows = sum(occurrences.values())
    frequencies = {}
    for window, seen in occurrences.items():
        frequencies[window] = seen / n_windows
    return frequencies

# One frequency dict per sequence, using single-residue windows (k = 1).
kmer_freq = list(map(lambda s: compute_kmer_frequencies(s, 1), seq_list))

# Turn the list of frequency dicts into a dense feature matrix
# (sparse=False requests a dense array rather than a sparse matrix).
vector = DictVectorizer(sparse=False)
X_vectorized = vector.fit_transform(kmer_freq)


#---------------#
# label info
# Count the instances of every label in a single pass. A dict keeps insertion
# order, so iterating it yields labels in order of first appearance - the same
# order the previous list-based scan produced - while membership checks are
# O(1) instead of a linear search through a growing list.
label_dict = {}  # no of instances per label
for lst in y:
    for label in lst:
        label_dict[label] = label_dict.get(label, 0) + 1

label_list = list(label_dict)   # all labels, in order of first appearance
label_count = len(label_list)   # number of distinct labels

# (label, count) pairs ordered from rarest to most frequent
sorted_labels_by_instance_numbers = sorted(label_dict.items(), key=lambda k: k[1])
# print(sorted_labels_by_instance_numbers)

# Labels that occur fewer than 3 times in the whole dataset
label_list_with_less_than_3_instances = [
    key for key in label_dict if label_dict[key] < 3
]

print(len(label_list_with_less_than_3_instances)) # 7777 - sp.db

734


In [6]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=1
)

# Standardise the features. The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

# Convert the label lists to a binary indicator matrix. Column order is
# pinned to label_list through the `classes` argument.
mlb = MultiLabelBinarizer(classes=label_list)
y_train = mlb.fit(y_train).transform(y_train)
y_test = mlb.transform(y_test)

In [10]:
# Which labels occur in the test split but never in the training split?

# Map the indicator matrices back to per-sample collections of label names.
y_train_labels = mlb.inverse_transform(y_train)
y_test_labels = mlb.inverse_transform(y_test)

# Flatten each split into the set of labels that appear at least once.
train_label_set = set().union(*y_train_labels)
test_label_set = set().union(*y_test_labels)

# Labels seen in test that have no training example at all.
missing_labels = test_label_set.difference(train_label_set)

print(len(missing_labels))
print("Labels present in test but missing in train:", missing_labels)

11
Labels present in test but missing in train: {'GO:0099040', 'GO:2001225', 'GO:0047484', 'GO:0099038', 'GO:1905039', 'GO:0098908', 'GO:0090554', 'GO:0140115', 'GO:0046865', 'GO:0046943', 'GO:0140328'}


In [11]:
# Training Individual Classifiers for Each Label (binary relevance):
# one independent RBF-kernel SVC is fitted per label column.
SVC_GAMMA = 0.1  # kernel coefficient shared by every per-label classifier

y_pred = np.zeros_like(y_test)  # to maintain a consistent shape with y_test
# Number of label columns for which no classifier could be trained. When every
# skipped column is all zeros, this equals the number of labels that are absent
# from the training dataset.
count = 0
for i in range(y_train.shape[1]):
    label = y_train[:, i]

    if len(np.unique(label)) > 1:
        clf = SVC(kernel='rbf', gamma=SVC_GAMMA)
        clf.fit(X_train, label)
        y_pred[:, i] = clf.predict(X_test)
    else:
        # The column holds a single class, so there is nothing to learn.
        # Predict that constant value: an all-zero column keeps the zero
        # prediction, and a label present in every training sample is
        # predicted as present rather than silently left at zero.
        if label.size:
            y_pred[:, i] = label[0]
        print(f"Skipping label index {i} due to insufficient class variation.")
        count += 1
print(count)

Skipping label index 5281 due to insufficient class variation.
Skipping label index 5700 due to insufficient class variation.
Skipping label index 5701 due to insufficient class variation.
Skipping label index 5702 due to insufficient class variation.
Skipping label index 5703 due to insufficient class variation.
Skipping label index 5704 due to insufficient class variation.
Skipping label index 5705 due to insufficient class variation.
Skipping label index 5706 due to insufficient class variation.
Skipping label index 5707 due to insufficient class variation.
Skipping label index 5708 due to insufficient class variation.
Skipping label index 5709 due to insufficient class variation.
11


In [9]:
# Score the predictions against the held-out labels.
# NOTE(review): y_test / y_pred are multilabel indicator matrices, for which
# scikit-learn's accuracy_score is documented to report subset (exact-match)
# accuracy - confirm that this is the intended metric.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f'Accuracy: {accuracy}')

# Author's note from another run: gamma = 0.5 -> 0.91

Accuracy: 0.6005291005291006
