# Analyse the features of different species.
- we will analyse all the species that has atleast 20 counts



## Download the data

In [1]:
#imports
import os
import csv
import sklearn
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from eli5 import show_weights, show_prediction
import seaborn as sns
from sklearn.manifold import TSNE
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from eli5 import show_weights
from sklearn.linear_model import LogisticRegression
from IPython.display import display, clear_output
import eli5
import joblib
from pathlib import Path

#custom
from py.orf1ab_dash_board import DataProcessing, get_dashboard
from py.ml_metrics import evaluate_model, multiclass_logloss
from py.plotting import plot_tsne

Using TensorFlow backend.


# Use the  following dashboard to get some context

In [2]:
get_dashboard(df_columns=["ALL", "Species", 'Geo_Location', 'Host', 'Isolation_Source'])

shape WITH duplicates: (3046, 10)
shape WITHOUT duplicates: (2384, 10)


Eplore the dataset here


Box(children=(Dropdown(options=('ALL', 'Species', 'Geo_Location', 'Host', 'Isolation_Source'), value='ALL'), O…

Output()

In [3]:


def get_data(orf1):  
    # read for data folder and out put 
    df = orf1.get_amino_df()
    print(f"shape WITH duplicates: {df.shape}")

    # remove duplicates
    df.drop_duplicates(subset='Accession', keep=False, inplace=True)
    print(f"shape WITHOUT duplicates: {df.shape}")
    df['Collection_Date'] = pd.to_datetime(df['Collection_Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['Release_Date'] = pd.to_datetime(df['Release_Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['Length'] = df['Length'].apply(str)
    return df

# Filter and select only those species that are atleast 20


In [4]:
def filter_column(df, column_name, min_count):
    '''
    df: dataframe
    column_name: column to filter
    min_count: minimum count required to be included
    '''
    counts = Counter(df[column_name])
    filtered = [key for key in counts if counts[key] >= min_count]
    print(filtered)
    df = df[df[column_name].isin(filtered)]
    return df[df[column_name].notna()]
    

In [5]:
def map_classes(df, column_name):
    #labels
    lbl_enc = LabelEncoder()
    y = lbl_enc.fit_transform(df[column_name].values)

    # map labels to numercial values
    #map labels to numerical value
    labels = list(lbl_enc.inverse_transform(y))
    return dict(zip(y, labels)), y


# Split the dataframe

In [6]:
def model_predict(df, column_name, kmer, results_path):
    
    class_dict, y = map_classes(df, column_name)
    #train test split
    xtrain, xvalid, ytrain, yvalid = train_test_split(df['seq'].values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)
    #vectorize
    ctv = CountVectorizer(analyzer='char', ngram_range=(kmer, kmer), lowercase=False) # kmer: k-mer length
    # embed
    ctv.fit(list(xtrain)+list(xvalid))
    xtrain_ctv = ctv.transform(xtrain)
    xvalid_ctv = ctv.transform(xvalid)
    
    # Fitting a simple Logistic Regression on Counts
    clf = LogisticRegression(C=1.0, max_iter=4000)
    clf.fit(xtrain_ctv, ytrain)
    predictions = clf.predict(xvalid_ctv)
    
    #make report
    report = classification_report(yvalid, predictions, 
                                   target_names=class_dict.values(), output_dict=True)
    report = pd.DataFrame(report).transpose()
    
    #extract features
    feature_imp = eli5.formatters.as_dataframe.explain_weights_df(clf, 
    feature_names=ctv.get_feature_names())
    feature_imp = feature_imp.replace({"target": class_dict})
    
    model_root = os.path.join(results_path, 'models')
    data_root = os.path.join(results_path, column_name)
    if not os.path.exists(data_root):
        os.mkdir(data_root)
        
    
        
    # save results
    #model 
    joblib.dump(clf, os.path.join(model_root, f"orf1_{column_name}_{k}_mer_lr_model.pkl"), compress=9)
    # predictions
    pred_label = [class_dict[i] for i in predictions]
    pd.DataFrame({column_name:pred_label, "pred":predictions}).to_csv(
        os.path.join(data_root, f"orf1_{column_name}_{k}_mer_lr_pred.csv"), index=False, header=True)

        
    #classification report
    report.to_csv(os.path.join(data_root, f"orf1_{column_name}_{k}_mer_lr_metrics.csv"), 
                  index=True, header=True)
    
    # feature importance
    feature_imp.to_csv(os.path.join(data_root, f"orf1_{column_name}_{k}_mer_lr_feature.csv"),
                       index=False, header=True)
    clear_output()
    display(report)
    display(show_weights(clf, vec=ctv, top=25, feature_filter=lambda x: x != '<BIAS>', 
                         target_names=class_dict))
    print(f"---> Analysis of {k}-mer done! Results in {data_root}")
    
    

In [None]:

BOOK_ROOT = os.path.dirname(os.path.realpath('__file__'))
DATA_PATH = Path(Path(BOOK_ROOT).resolve().parent, "data")
TOOLS_PATH = Path(Path(BOOK_ROOT).resolve().parent, "tools")
PLOTS_PATH = Path(Path(BOOK_ROOT).resolve().parent, "plots")
RESULTS_PATH = Path(Path(BOOK_ROOT).resolve().parent, "results")


orf1 = DataProcessing('coronavirus_orf1ab.fasta', 'coronavirus_orf1ab_meta.csv')
data = get_data(orf1)
for column_name in ['Host', 'Isolation_Source', 'Geo_Location']:
    df = filter_column(data, column_name, 20)
#     print(df.head())
    for k in range(10, 26):
        model_predict(df, column_name, k, RESULTS_PATH)
    

Unnamed: 0,precision,recall,f1-score,support
Sus scrofa,0.916667,1.0,0.956522,11.0
Camelus,0.947368,0.947368,0.947368,19.0
Homo sapiens,1.0,1.0,1.0,3.0
Chiroptera,1.0,1.0,1.0,5.0
Gallus gallus,1.0,1.0,1.0,28.0
Scotophilus kuhlii,1.0,0.98,0.989899,50.0
Camelus dromedarius,1.0,1.0,1.0,5.0
Felis catus,1.0,1.0,1.0,2.0
Rhinolophus sinicus,1.0,1.0,1.0,4.0
Mus musculus,1.0,1.0,1.0,59.0


Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9
+0.170,GDCVTVRVSQ,,,,,,,,
+0.170,CVTVRVSQQV,,,,,,,,
+0.170,SQQVDLVISD,,,,,,,,
+0.170,DCVTVRVSQQ,,,,,,,,
+0.170,TVRVSQQVDL,,,,,,,,
+0.170,FGDCVTVRVS,,,,,,,,
+0.170,VTVRVSQQVD,,,,,,,,
+0.170,VSQQVDLVIS,,,,,,,,
+0.170,RVSQQVDLVI,,,,,,,,
+0.170,VRVSQQVDLV,,,,,,,,

Weight?,Feature
+0.170,GDCVTVRVSQ
+0.170,CVTVRVSQQV
+0.170,SQQVDLVISD
+0.170,DCVTVRVSQQ
+0.170,TVRVSQQVDL
+0.170,FGDCVTVRVS
+0.170,VTVRVSQQVD
+0.170,VSQQVDLVIS
+0.170,RVSQQVDLVI
+0.170,VRVSQQVDLV

Weight?,Feature
+0.168,PVVSDTVEVP
+0.144,VVIADTLQEI
+0.144,VIADTLQEIP
+0.144,ADTLQEIPVV
+0.144,EIPVVSDTVE
+0.144,DTLQEIPVVS
+0.144,QEIPVVSDTV
+0.144,LQEIPVVSDT
+0.144,IADTLQEIPV
+0.144,IPVVSDTVEV

Weight?,Feature
+0.005,RNVLPTITQM
+0.005,TKRNVLPTIT
+0.005,KRNVLPTITQ
+0.004,VLPTITQMNL
+0.004,LPTITQMNLK
+0.004,NVLPTITQMN
+0.004,MTRCLAIHDC
+0.004,TRCLAIHDCF
+0.004,IKITEHSWSA
+0.004,VDEVSMCTNY

Weight?,Feature
+0.003,DVPAYVALVG
+0.003,ISDLYDGSTK
+0.003,SPAELKYMTA
+0.003,VLKTMFLLDD
+0.003,GQPGDAELTL
+0.003,CMTFDAKIVN
+0.003,DLLLELDFGA
+0.003,PEYRRDLVDC
+0.003,ALSTGVTYQT
+0.003,DLLISDLYDG

Weight?,Feature
+0.003,LNVEANSKMH
+0.003,KANCGDSFTI
+0.003,QIYVDLDPPC
+0.003,NFMGAGFYFW
+0.003,RGMVLGAISN
+0.003,YVDLDPPCKF
+0.003,GMVLGAISNV
+0.003,YDTRNLSVFN
+0.003,IYVDLDPPCK
+0.003,YKHLISLLGF

Weight?,Feature
+0.179,CYFGVFSFLN
+0.179,SFLNLKLRAP
+0.179,FLNLKLRAPM
+0.179,FGVFSFLNLK
+0.179,TCYFGVFSFL
+0.179,FSFLNLKLRA
+0.179,VFSFLNLKLR
+0.179,YFGVFSFLNL
+0.179,CTCYFGVFSF
+0.179,GVFSFLNLKL

Weight?,Feature
+0.110,KGYGXSCDQL
+0.110,GYGXSCDQLR
+0.110,WKGYGXSCDQ
+0.110,GMWKGYGXSC
+0.110,GXSCDQLREP
+0.110,VCGMWKGYGX
+0.110,YGXSCDQLRE
+0.110,CGMWKGYGXS
+0.110,MWKGYGXSCD
+0.110,XSCDQLREPL

Weight?,Feature
+0.011,LEVPRRNVAT
+0.011,HSTCCNLSHR
+0.011,VPRRNVATLQ
+0.011,REPMMQSADA
+0.011,TCERSEAGIC
+0.011,SLVLARKHST
+0.011,SLEVPRRNVA
+0.011,KHSTCCNLSH
+0.011,STCCNLSHRF
+0.011,QLREPMMQSA

Weight?,Feature
+0.003,SFSVTLLEDA
+0.003,KARIECYDGF
+0.003,LYKTLNQGVL
+0.003,VIVNNLNKSA
+0.003,VQSHVMRAVL
+0.003,CRQDLTEYTM
+0.003,GLLKNGKLLI
+0.003,ARIECYDGFK
+0.003,AVIVDNDVND
+0.003,QATSANVNRL

Weight?,Feature
+0.005,NKGPHEFCSQ
+0.005,DINKGPHEFC
+0.005,INKGPHEFCS
+0.005,NSSLLTLCAF
+0.005,WGYTGSLSSN
+0.005,GYTGSLSSNH
+0.005,QWGYTGSLSS
+0.004,QQWGYTGSLS
+0.004,DIVVVDEVSM
+0.004,NKTSLPTNVA


---> Analysis of 10-mer done! Results in /home/aneesh/Projects/covid_bh_ml/master/orf1ab/results/Host
