# SARS-CoV-2 Variants Classification using Machine Learning

## Setup

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import os 
import utilities as utils
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

## Data Engineering

### Data wrangling and cleaning

In [2]:
path = os.path.join('Asn2DataSet','TrainingDataset')


def _load_variant(variant):
    """Read and clean every sequence file stored under ``path/<variant>``.

    Parameters
    ----------
    variant : str
        Name of the sub-directory of ``path`` holding that variant's files
        (e.g. ``'Alpha'``).

    Returns
    -------
    Whatever ``utils.clean_fasta`` returns for the list of sequences read
    from the directory (presumably a list of cleaned sequences -- confirm
    against ``utilities.clean_fasta``).
    """
    variant_dir = os.path.join(path, variant)
    # os.listdir() yields entries in arbitrary, platform-dependent order.
    # Sorting pins the sample order so that the seeded KFold split further
    # down assigns the same samples to the same folds on every machine.
    file_names = sorted(os.listdir(variant_dir))
    sequences = [utils.read_sequence(os.path.join(variant_dir, file_name))
                 for file_name in file_names]
    return utils.clean_fasta(sequences)


alphas = _load_variant('Alpha')
betas = _load_variant('Beta')
deltas = _load_variant('Delta')
gammas = _load_variant('Gamma')

### Data pre-processing

In [3]:
# Build one feature matrix per sequence together with its variant label.
# utils.cgr is called with the alphabet 'ACGT' and the value 7 (presumably a
# Chaos Game Representation at k-mer size 7 -- confirm against utilities.cgr).
variant_groups = (
    ('Alpha', alphas),
    ('Beta', betas),
    ('Delta', deltas),
    ('Gamma', gammas),
)

cgrs = []
labels = []
for variant_name, variant_seqs in variant_groups:
    for seq in variant_seqs:
        cgrs.append(utils.cgr(seq, 'ACGT', 7))
        labels.append(variant_name)

# Convert to arrays so later cells can index them with fold index arrays.
cgrs = np.array(cgrs)
labels = np.array(labels)

In [4]:
# Unroll every CGR matrix into a one-dimensional feature vector.
cgr_vectors = np.array([cgr_matrix.flatten() for cgr_matrix in cgrs])

# Scale each vector independently by its own largest entry. No statistic is
# shared between samples, so this step cannot leak information across folds.
normalized_rows = []
for feature_vector in cgr_vectors:
    peak = np.max(feature_vector)
    normalized_rows.append(feature_vector / peak)
normalized_cgr_vectors = np.array(normalized_rows)

In [5]:
# np.unique returns the distinct class names in sorted order, so the integer
# codes assigned below are deterministic for a given set of labels.
class_names = np.unique(labels)

# Forward mapping: class name -> integer code.
label_map = {name: code for code, name in enumerate(class_names)}

# Integer-encoded target vector, aligned row-for-row with `labels`.
dummy_labels = np.array([label_map[name] for name in labels])

# Reverse mapping: integer code -> class name, used when reporting predictions.
numerical_to_label = {code: name for name, code in label_map.items()}

## Machine Learning

### Model Selection

In [6]:
# Random forest of depth-3 trees using the Gini impurity criterion.
# random_state is fixed so that the forest's internal randomness (bootstrap
# sampling and per-split feature sub-sampling) is repeatable; without it the
# reported accuracies and test predictions can change on every
# Restart & Run All even though the KFold split below is already seeded.
clf_rf = RandomForestClassifier(criterion='gini', max_depth=3, random_state=99)

# Number of cross-validation folds.
FOLDS = 10

# Shuffled K-fold splitter with a fixed seed for reproducible fold membership.
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=99)

### Model Evaluation

In [7]:
# Per-fold accuracy, in percent, collected in fold order.
accs_rf = []
for train_idx, test_idx in kf.split(normalized_cgr_vectors):
    # Slice features and integer labels with the same index arrays so they
    # stay aligned row-for-row.
    X_train = normalized_cgr_vectors[train_idx]
    y_train = dummy_labels[train_idx]
    X_test = normalized_cgr_vectors[test_idx]
    y_test = dummy_labels[test_idx]

    # The same estimator object is refitted on each fold's training split.
    clf_rf.fit(X_train, y_train)
    y_preds = clf_rf.predict(X_test)

    accs_rf.append(100 * accuracy_score(y_test, y_preds))

In [8]:
# Report the accuracy of each fold, numbering folds from 1.
for fold_number, fold_accuracy in enumerate(accs_rf, start=1):
    print(f'Fold {fold_number}: {fold_accuracy}')

Fold 1: 100.0
Fold 2: 100.0
Fold 3: 100.0
Fold 4: 100.0
Fold 5: 100.0
Fold 6: 100.0
Fold 7: 100.0
Fold 8: 100.0
Fold 9: 100.0
Fold 10: 100.0


In [9]:
# Summarise the cross-validation run as the mean of the per-fold accuracies.
mean_accuracy = np.mean(accs_rf)
print(f'The average accuracy of the model is: {mean_accuracy}%')

The average accuracy of the model is: 100.0%


### Model Training

In [10]:
# Refit the classifier on the complete training set (all folds combined) so
# the model used for the test predictions has seen every labelled sample.
clf_rf.fit(X=normalized_cgr_vectors, y=dummy_labels)

## Testing

In [11]:
def _fasta_order(file_name):
    """Sort key placing numerically named files ('2.fasta') in numeric order.

    Files whose stem is a plain integer sort first, by integer value; any
    other name sorts after them, alphabetically.
    """
    stem = os.path.splitext(file_name)[0]
    if stem.isdecimal():
        return (0, int(stem), file_name)
    return (1, 0, file_name)


paths = os.path.join('Asn2DataSet', 'TestingDataset')

# os.listdir() returns entries in arbitrary order (a plain lexicographic
# order would also put '10.fasta' before '2.fasta'). The reporting cell labels
# prediction i as '{i+1}.fasta', so the files must be read in numeric order
# for each prediction to be attributed to the right file.
testing_path = sorted(os.listdir(paths), key=_fasta_order)

# Apply the same pipeline as the training data: read, clean, CGR, flatten,
# then scale each vector by its own maximum.
test_seqs = [utils.read_sequence(os.path.join(paths, fasta)) for fasta in testing_path]
# NOTE(review): assumes clean_fasta keeps the number and order of sequences;
# if it can drop entries, rows no longer line up with `testing_path` -- confirm.
test_seqs = utils.clean_fasta(test_seqs)
test_cgrs = [utils.cgr(seq, 'ACGT', 7) for seq in test_seqs]
cgr_vectors_test = np.array([mat.flatten() for mat in test_cgrs])
normalized_cgr_vectors_test = np.array([vector/np.max(vector) for vector in cgr_vectors_test])

## Reporting

In [12]:
preds = clf_rf.predict(normalized_cgr_vectors_test)

# Pair every prediction with the file it was actually computed from instead
# of synthesising a name from the loop index: `testing_path` holds the file
# names in the order they were read, which is not guaranteed to be
# 1.fasta, 2.fasta, ...
if len(preds) != len(testing_path):
    # A mismatch means some file produced no feature row (or more than one),
    # so predictions can no longer be attributed to files reliably.
    raise ValueError(
        f'Got {len(preds)} predictions for {len(testing_path)} test files; '
        'cannot match predictions to file names.'
    )

for fasta_name, pred in zip(testing_path, preds):
    print(f'Predicted label for {fasta_name}: {numerical_to_label[pred]}')

Predicted label for 1.fasta: Beta
Predicted label for 2.fasta: Gamma
Predicted label for 3.fasta: Beta
Predicted label for 4.fasta: Gamma
Predicted label for 5.fasta: Gamma
Predicted label for 6.fasta: Delta
Predicted label for 7.fasta: Delta
Predicted label for 8.fasta: Gamma
Predicted label for 9.fasta: Delta
Predicted label for 10.fasta: Delta
