In [1]:
#pip install Bio

In [2]:
from Bio import SeqIO
import numpy as np
from typing import List
import importlib
import fileDistanceProcessing as fdp
import importlib
importlib.reload(fdp) 

<module 'fileDistanceProcessing' from '/home/marko/Desktop/ip2/ip2_projekat/source/fileDistanceProcessing.py'>

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
import pandas as pd
def model_evaluation(y_test,y_pred,labels):
    print(f"Accuracy:  {accuracy_score(y_test, y_pred)}")
    
    cmtx = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=labels ), 
    index=[ "true:"+s for s in labels ],
    columns=["pred:"+s for s in labels]
    )
    print("Confusion matrix:")
    print(cmtx)

In [5]:
X, y, y_protein=fdp.read_dataset_virus()

classes = ['sars1','mers', 'sars2']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)

<h1>KNN with simple difference distance (element-wise subtraction) </h1>

In [7]:
model = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric=fdp.simple_difference_distance)

In [8]:
model.fit(X_train, y_train)

In [9]:
y_pred = model.predict(X_test)

In [10]:
model_evaluation(y_test,y_pred, classes)

Accuracy:  0.993970893970894
Confusion matrix:
            pred:sars1  pred:mers  pred:sars2
true:sars1          17          0          20
true:mers            3       2282           6
true:sars2           0          0        2482


In [11]:
y_pred_train = model.predict(X_train)

In [12]:
model_evaluation(y_train, y_pred_train, classes)

Accuracy:  0.9960061443932412
Confusion matrix:
            pred:sars1  pred:mers  pred:sars2
true:sars1          43          0          31
true:mers            4       4644           4
true:sars2           0          0        5039


<h1>KNN with p-adic distance (p=2,5)  </h1>

In [13]:
model = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric=fdp.five_two_adic_distance)

In [14]:
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)

In [16]:
model_evaluation(y_test, y_pred, labels=classes)

Accuracy:  0.9941787941787942
Confusion matrix:
            pred:sars1  pred:mers  pred:sars2
true:sars1          17          0          20
true:mers            2       2283           6
true:sars2           0          0        2482


In [17]:
y_pred_train = model.predict(X_train)

In [18]:
model_evaluation(y_train, y_pred_train, labels=classes)

Accuracy:  0.9962109575012801
Confusion matrix:
            pred:sars1  pred:mers  pred:sars2
true:sars1          43          0          31
true:mers            2       4646           4
true:sars2           0          0        5039


<h1> KNN with simple codon distance (codon wise comparing) </h1>

Ideja je da vidimo koliko je nas skup podataka "lak". Odnosno koliko nam samo najjednostavnije razlikovanje u kodonima (1 ako je bar 1 nukleotid razlicit, 0 inace) daje dobre informacije za klasifikaciju.

In [19]:
# DEVELOPING

In [20]:
def simple_codon_distance(s1, s2):
    distance = 0
    for i in range(len(s1)):
        if s1[i] != s2[i]:
            distance += 1
    return distance

In [21]:
model = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric=simple_codon_distance)

In [22]:
model.fit(X_train, y_train)

In [23]:
y_pred = model.predict(X_test)

In [24]:
model_evaluation(y_test, y_pred, labels=classes)

Accuracy:  0.9943866943866944
Confusion matrix:
            pred:sars1  pred:mers  pred:sars2
true:sars1          17          0          20
true:mers            1       2284           6
true:sars2           0          0        2482


In [25]:
y_pred_train = model.predict(X_train)

In [26]:
model_evaluation(y_train, y_pred_train,classes)

Accuracy:  0.9962109575012801
Confusion matrix:
            pred:sars1  pred:mers  pred:sars2
true:sars1          43          0          31
true:mers            0       4647           5
true:sars2           0          1        5038


<h1> Klasifikacija po tipu proteina </h1>

Moramo da eliminisemo klase koje imaju samo jednu instancu jer KNN ne moze da radi sa njima. Funkcija eliminate_single_classes prima sortirani niz y i njemu odgovarajuci niz X i vraca odgovarajuce nizove X_new, y_new. y_new nece sadrzati ni jednu klasu koja se ne pojavljuje barem 2 puta. 

Objasnjenje kako radi:
Ide se iterativno kroz niz y i poredi se trenutna vrednost klase sa prethodnom. Ukoliko su iste, dodajemo u niz y_new tu vrednost (i u X_new odgovarajucu vrednost) i uvecavamo count za 1. Ukoliko nije bilo poklapanja, ako je count > 1 treba upisati prethodnu vrednost klase (jer znamo da je njen broj pojavljivanja veci od 1). Count resetujemo na 1 (jer smo naisli na novu klasu).

In [27]:
def eliminate_single_classes(X: List[List[int]], y: List[str]) -> (List[int], List[str]):
    #elementi moraju biti sortirani
    
    count = 1
    X_new = []
    y_new = []
    labels = []
    for i in range(1, len(y)):
        if y[i] == y[i-1]:
            X_new.append(X[i-1])
            y_new.append(y[i-1])
            count += 1
        else:
            if count > 1:
                X_new.append(X[i-1])
                y_new.append(y[i-1]) 
                labels.append(y[i-1])
            count = 1
    
    X = X_new
    y = y_new

    return X_new, y_new

Objasnjenje
Potrebno je da kada sortiramo y_protein, izvrsimo i promenu X i y listi na odgovarajuci nacin. Zato ih zippujemo i sortiramo samo po 3. clanu torke (po tipu proteina). Zip ce pokoordinatno praviti kolekciju torki. itemgetter ce dohvatiti 3. clan torke i proslediti kljucu po kom se sortira. Zatim hocemo da izvucemo X (zvacemo ga X_protein) i y_protein. Operator * nad listom prosledjen zip ce da raspakuje citavu listu torki na "niz" torki oblika (lista kodovi1, virus1, protein1) (lista kodovi2, virus2, protein2)... Nakon toga ce zip da pravi kolekciju torki pokoordinatno koja ce biti oblika (lista kodovi1, lista kodovi2, ...), (virus1, virus2, ...), (protein1, protein2, ...)

In [28]:
from operator import itemgetter

# sortiranje po y_protein
data = list(zip(X, y, y_protein))
data.sort(key=itemgetter(2))

#raspakovanje
X_protein, _, y_protein = zip(*data)
X_protein, y_protein = eliminate_single_classes(X_protein, y_protein)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_protein, y_protein, test_size=0.33, stratify=y_protein)

In [30]:
model = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric=fdp.five_two_adic_distance)

In [31]:
model.fit(X_train, y_train)

In [32]:
y_pred = model.predict(X_test)

In [33]:
accuracy_score(y_test, y_pred)

0.6754897874114214

In [34]:
model = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric=fdp.simple_difference_distance)

In [35]:
model.fit(X_train, y_train)

In [36]:
y_pred = model.predict(X_test)

In [37]:
accuracy_score(y_test, y_pred)

0.6754897874114214