In [1]:
import sklearn
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt
from eli5 import show_weights, show_prediction
from xgboost import plot_importance
import seaborn as sns
from sklearn.manifold import TSNE
from collections import defaultdict

#custom
from py.helper import DataProcessing
from py.ml_metrics import plot_confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Global plotting defaults: every figure is drawn at 15 x 15 (matplotlib figsize units).
sns.set(rc={"figure.figsize": (15, 15)})

Using TensorFlow backend.


In [2]:
# K-mer length setting for the notebook.
# NOTE(review): no later cell visible here reads `kmer` — the data-loading cell passes a
# literal 4 and the sweep cell iterates over its own range. Confirm whether this value
# is meant to drive them.
kmer = 4

In [3]:
# Create the project's DataProcessing helper from the ORF1ab FASTA file and its
# metadata CSV (both paths are relative to the notebook's working directory).
orf1 = DataProcessing('coronavirus_orf1ab.fasta', 'coronavirus_orf1ab_meta.csv')

In [4]:
# Build the amino-acid dataframe (this will take a while).
# Earlier alternative, kept for reference: kmers = orf1.get_amino_kmer_df(4)
amino_df = orf1.get_amino_df(4, 'csv')
print(amino_df.shape)  # shape before de-duplication
# keep=False drops every row whose Accession occurs more than once (no copy is retained);
# inplace=True mutates the frame object returned by get_amino_df.
amino_df.drop_duplicates(subset='Accession', keep=False, inplace=True)
print(amino_df.shape)  # shape after de-duplication
amino_df.head()

(3046, 13)
(2384, 13)


Unnamed: 0,Accession,Release_Date,Species,Length,Geo_Location,Host,Isolation_Source,Collection_Date,GenBank_Title,seq_offset_0,seq_offset_1,seq_offset_2,seq_offset_3
1,YP_009555238,2019-02-21T00:00:00Z,Betacoronavirus 1,7095,USA,,,,Orf1ab [Human coronavirus OC43],MSKINKYGLELHWAPEFPWMFEDAEEKLDNPSSSEVDMICSTTAQK...,SKINKYGLELHWAPEFPWMFEDAEEKLDNPSSSEVDMICSTTAQKL...,KINKYGLELHWAPEFPWMFEDAEEKLDNPSSSEVDMICSTTAQKLE...,INKYGLELHWAPEFPWMFEDAEEKLDNPSSSEVDMICSTTAQKLET...
2,YP_002308478,2018-08-24T00:00:00Z,Bulbul coronavirus HKU11,6264,Hong Kong,Pycnonotus jocosus,,2007-01,orf1ab polyprotein [Bulbul coronavirus HKU11-934],MVKNVSKRSPIVLPQIQPPPLQLFIAVAAAEEGHPKDLKYLGNYNL...,VKNVSKRSPIVLPQIQPPPLQLFIAVAAAEEGHPKDLKYLGNYNLV...,KNVSKRSPIVLPQIQPPPLQLFIAVAAAEEGHPKDLKYLGNYNLVT...,NVSKRSPIVLPQIQPPPLQLFIAVAAAEEGHPKDLKYLGNYNLVTS...
3,YP_009513008,2018-08-24T00:00:00Z,Hedgehog coronavirus 1,7150,Germany,Erinaceus europaeus,feces,2012,orf1ab [Betacoronavirus Erinaceus/VMC/DEU/2012],MSSATGEGSQGARATYRAALNNEKRHDHVALTVPCCGTEAKVTALS...,SSATGEGSQGARATYRAALNNEKRHDHVALTVPCCGTEAKVTALSP...,SATGEGSQGARATYRAALNNEKRHDHVALTVPCCGTEAKVTALSPW...,ATGEGSQGARATYRAALNNEKRHDHVALTVPCCGTEAKVTALSPWF...
4,YP_009513020,2018-08-24T00:00:00Z,Coronavirus HKU15,6267,China: Hong Kong,Sus scrofa,,2010,replicase polyprotein [Porcine coronavirus HKU15],MAKNKSKRDAIALPENVPPPLQLFIHVAAAEEGHPKVTTYLGNYNL...,AKNKSKRDAIALPENVPPPLQLFIHVAAAEEGHPKVTTYLGNYNLY...,KNKSKRDAIALPENVPPPLQLFIHVAAAEEGHPKVTTYLGNYNLYA...,NKSKRDAIALPENVPPPLQLFIHVAAAEEGHPKVTTYLGNYNLYAT...
5,YP_009389424,2017-07-14T00:00:00Z,Wencheng Sm shrew coronavirus,6324,China,Suncus murinus,,2015,ORF1ab polyprotein [Wencheng Sm shrew coronavi...,MSVSKVELFVPISDEVDATHFGTFGDAVEAYASAAPSFEGVYFVAY...,SVSKVELFVPISDEVDATHFGTFGDAVEAYASAAPSFEGVYFVAYG...,VSKVELFVPISDEVDATHFGTFGDAVEAYASAAPSFEGVYFVAYGL...,SKVELFVPISDEVDATHFGTFGDAVEAYASAAPSFEGVYFVAYGLQ...


Split the data

In [5]:
# Random row-wise split of amino_df: each row goes to train with probability 0.8,
# otherwise to test (so the sizes are only approximately 80/20).
# A dedicated, seeded RandomState makes the split reproducible after a kernel restart
# without altering NumPy's global random state used elsewhere in the notebook.
split_rng = np.random.RandomState(42)
mask = split_rng.rand(len(amino_df)) < 0.8
train_df = amino_df[mask]
test_df = amino_df[~mask]
print(f'Size of the test df: {len(test_df)}. Size of the train df: {len(train_df)}.')

Size of the test df: 495. Size of the tain df: 1889.


In [6]:
# Pull the raw sequence strings (offset-0 column) out of each split as NumPy arrays.
xtrain, xtest = train_df['seq_offset_0'].values, test_df['seq_offset_0'].values

In [7]:

def count_vectorize(kmer, xtest, xtrain):
    """Turn sequences into character k-mer count matrices.

    Note the argument order: the test sequences come before the train sequences.

    Parameters
    ----------
    kmer : int
        K-mer length; only character n-grams of exactly this length are counted.
    xtest, xtrain : iterable of str
        Test and train sequences.

    Returns
    -------
    tuple
        ``(xtrain_ctv, xtest_ctv)`` - the transformed train and test matrices,
        in that order.
    """
    vectorizer = CountVectorizer(
        analyzer='char',
        ngram_range=(kmer, kmer),
        lowercase=False,  # keep the sequence letters exactly as given
    )

    # The vocabulary is learned from train and test sequences together.
    vectorizer.fit([*xtrain, *xtest])
    return vectorizer.transform(xtrain), vectorizer.transform(xtest)

In [8]:

def calculate_silhouette(xtrain_ctv, kmax):
    """Return the silhouette score of a KMeans clustering for each k in 2..kmax.

    Parameters
    ----------
    xtrain_ctv : feature matrix accepted by ``KMeans.fit`` and ``silhouette_score``.
    kmax : int
        Largest number of clusters to try (inclusive).

    Returns
    -------
    list of float
        One score per k, ordered k = 2, 3, ..., kmax. Empty when ``kmax < 2``.
    """
    def _score_for(n_clusters):
        # Fit a fresh KMeans model and score its training labels.
        model = KMeans(n_clusters=n_clusters).fit(xtrain_ctv)
        return silhouette_score(xtrain_ctv, model.labels_, metric='euclidean')

    # Dissimilarity is undefined for a single cluster, so the sweep starts at k = 2.
    return [_score_for(n_clusters) for n_clusters in range(2, kmax + 1)]

In [29]:
def calculate_WSS_silhouette(xtrain_ctv, kmax):
    """Return WSS (elbow) and silhouette scores of KMeans for each k in 2..kmax.

    Parameters
    ----------
    xtrain_ctv : feature matrix accepted by ``KMeans.fit`` and ``silhouette_score``.
    kmax : int
        Largest number of clusters to try (inclusive).

    Returns
    -------
    tuple of (list, list)
        ``(sse, sil)``: ``sse[j]`` is the within-cluster sum of squared distances
        and ``sil[j]`` the silhouette score for ``k = j + 2``. Both lists are
        empty when ``kmax < 2``.
    """
    sse = []  # within-cluster sum of squares (WSS), one entry per k
    sil = []  # silhouette score, one entry per k
    # Starts at k = 2 because the silhouette is undefined for a single cluster.
    for k in range(2, kmax + 1):
        kmeans = KMeans(n_clusters=k).fit(xtrain_ctv)

        # inertia_ is the sum of squared distances of every sample to its closest
        # centroid, taken over ALL feature dimensions. A hand-rolled sum over only
        # feature columns 0 and 1 is correct for 2-D data only.
        sse.append(kmeans.inertia_)

        sil.append(silhouette_score(xtrain_ctv, kmeans.labels_, metric='euclidean'))
    return sse, sil



In [None]:
# Sweep over k-mer lengths; for each one, sweep KMeans over k = 2 .. kmax and record
# the elbow (WSS) and silhouette curves.
kmax = 10        # largest cluster count tried for each k-mer length (inclusive)
kmer_max = 100   # exclusive upper bound: k-mer lengths 1 .. kmer_max - 1 are tried
sil_scores, elbow_scores = defaultdict(list), defaultdict(list)
for i in range(1, kmer_max):
    # Only the train matrix is clustered; the test matrix is discarded.
    xtrain_ctv, _ = count_vectorize(i, xtest, xtrain)
    elbow, sil = calculate_WSS_silhouette(xtrain_ctv, kmax)
    # Each dict value is a list holding one list of per-k scores.
    elbow_scores[i].append(elbow)
    sil_scores[i].append(sil)


In [33]:
# Show the silhouette scores gathered so far, keyed by k-mer length.
sil_scores

defaultdict(list,
            {1: [[0.45619357014968104,
               0.5041712880772967,
               0.5827516501116515,
               0.6448596382066593,
               0.7092761470424749,
               0.7210108518868348,
               0.733548422074826,
               0.7472898273027836,
               0.7431119085900866]]})

In [None]:
import pickle

# Persist both score tables to the working directory, one pickle file per table.
for filename, scores in (
    ("sil_scores.pickle", sil_scores),
    ("elbow_scores.pickle", elbow_scores),
):
    with open(filename, "wb") as f_handle:
        pickle.dump(scores, f_handle)
