# Analyse the features of different species.
- we will analyse all the species that has atleast 20 counts



## Download the data

In [1]:
#imports
import os
import csv
import sklearn
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from eli5 import show_weights, show_prediction
import seaborn as sns
from sklearn.manifold import TSNE
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from eli5 import show_weights
from sklearn.linear_model import LogisticRegression
from IPython.display import display, clear_output
import eli5
import joblib

#custom
from py.orf1ab_dash_board import DataProcessing, get_dashboard
from py.ml_metrics import evaluate_model, multiclass_logloss
from py.plotting import plot_tsne

Using TensorFlow backend.


# Use the  following dashboard to get some context

In [2]:
get_dashboard(df_columns=["ALL", "Species", 'Geo_Location', 'Host', 'Isolation_Source'])

shape WITH duplicates: (3046, 10)
shape WITHOUT duplicates: (2384, 10)


Eplore the dataset here


Box(children=(Dropdown(options=('ALL', 'Species', 'Geo_Location', 'Host', 'Isolation_Source'), value='ALL'), O…

Output()

In [3]:


def get_data(orf1):  
    # read for data folder and out put 
    df = orf1.get_amino_df()
    print(f"shape WITH duplicates: {df.shape}")

    # remove duplicates
    df.drop_duplicates(subset='Accession', keep=False, inplace=True)
    print(f"shape WITHOUT duplicates: {df.shape}")
    df['Collection_Date'] = pd.to_datetime(df['Collection_Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['Release_Date'] = pd.to_datetime(df['Release_Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['Length'] = df['Length'].apply(str)
    return df

# Filter and select only those species that are atleast 20


In [4]:
def filter_column(df, column_name, min_count):
    '''
    df: dataframe
    column_name: column to filter
    min_count: minimum count required to be included
    '''
    counts = Counter(df[column_name])
    filtered = [key for key in counts if counts[key] >= min_count]
    print(filtered)
    df = df[df[column_name].isin(filtered)]
    return df[df[column_name].notna()]
    

In [5]:
def map_classes(df, column_name):
    #labels
    lbl_enc = LabelEncoder()
    y = lbl_enc.fit_transform(df[column_name].values)

    # map labels to numercial values
    #map labels to numerical value
    labels = list(lbl_enc.inverse_transform(y))
    return dict(zip(y, labels)), y


# Split the dataframe

In [6]:
def get_test_data(df, column_name):
    
    class_dict, y = map_classes(df, column_name)
    #train test split
    xtrain, xvalid, ytrain, yvalid = train_test_split(df['seq'].values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)
    return xvalid, yvalid
   

In [12]:
orf1 = DataProcessing('coronavirus_orf1ab.fasta', 'coronavirus_orf1ab_meta.csv')
data = get_data(orf1)
tasks = ['Species', 'Host', 'Isolation_Source', 'Geo_Location']
df_species = None
df_host = None
df_geo = None
df_source = None
tasks_dict = dict(zip(tasks, [df_source, df_host, df_geo, df_species]))
for column_name in tasks_dict:
    df = filter_column(data, column_name, 20)
    a1, _ = get_test_data(df, column_name)
    print(len(df), len(a1))
    tasks_dict[column_name] = df[df['seq'].isin(a1)]


shape WITH duplicates: (3046, 10)
shape WITHOUT duplicates: (2384, 10)
['Betacoronavirus 1', 'Coronavirus HKU15', 'Human coronavirus 229E', 'Middle East respiratory syndrome-related coronavirus', 'Alphacoronavirus 1', 'Avian coronavirus', 'Human coronavirus HKU1', 'Human coronavirus NL63', 'Severe acute respiratory syndrome-related coronavirus', 'Porcine epidemic diarrhea virus', 'Alphacoronavirus sp.']
2190 219
[nan, 'Sus scrofa', 'Camelus', 'Homo sapiens', 'Chiroptera', 'Gallus gallus', 'Scotophilus kuhlii', 'Camelus dromedarius', 'Felis catus', 'Rhinolophus sinicus', 'Mus musculus']
1855 186
[nan, 'feces', 'abdominal cavity', 'oronasopharynx', 'lung, oronasopharynx']
767 77
['USA', 'Hong Kong', 'China: Hong Kong', 'China', 'Kenya', 'Saudi Arabia', nan, 'Colombia', 'South Korea', 'Viet Nam', 'China: Beijing', 'United Arab Emirates', 'USA: Illinois', 'USA: Minnesota', 'USA: Ohio', 'USA: Nashville, TN', 'USA: Denver, CO', 'USA: Tennessee']
1801 181


In [13]:
tasks_dict['Species'].drop_duplicates(subset="seq")

Unnamed: 0,Accession,Release_Date,Species,Length,Geo_Location,Host,Isolation_Source,Collection_Date,GenBank_Title,seq
13,YP_009194637,2016-01-05,Human coronavirus 229E,6763,Saudi Arabia,Camelus,,2015-03-01,polyprotein ORF1ab [Camel alphacoronavirus],MACNRVTLAVASDTEISATGCSTIALAVRRYSEAASNGFRACRFVS...
44,YP_003766,2004-03-23,Human coronavirus NL63,6729,,,,,replicase polyprotein 1ab [Human coronavirus N...,MFYNQVTLAVASDSEISGFGFAIPSVAVRTYSEAAAQGFQACRFVA...
45,NP_828849,2003-04-14,Severe acute respiratory syndrome-related coro...,7073,Canada: Toronto,,blood,,orf1ab polyprotein (pp1ab) [Severe acute respi...,MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...
358,AWH65941,2020-02-23,Middle East respiratory syndrome-related coron...,7212,China,,,2013-07-20,ORF1ab [Middle East respiratory syndrome-relat...,MSSVAGVVTQGARNMYRAALNNEKRQDHVSLTTPLCGAGDLALRLT...
402,QGQ60275,2020-01-01,Porcine epidemic diarrhea virus,6781,Colombia,Sus scrofa,feces,2015-05-20,polyprotein [Porcine epidemic diarrhea virus],MASNHVTLAFANDAEISAFGFCTASEAVSYYSEAAASGFMQCRFVS...
...,...,...,...,...,...,...,...,...,...,...
2639,ABF65835,2006-05-28,Severe acute respiratory syndrome-related coro...,7073,,Mus musculus,,,nonstructural polyprotein pp1ab [Severe acute ...,MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...
2649,AAV49729,2005-07-08,Severe acute respiratory syndrome-related coro...,7073,,Paradoxurus hermaphroditus,,,replicase 1AB [SARS coronavirus B039],MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...
2686,AAR14802,2003-11-17,Severe acute respiratory syndrome-related coro...,7073,,,,,putative orf1ab polyprotein [SARS coronavirus ...,MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...
2705,AAK83365,2001-08-02,Betacoronavirus 1,7094,,,,,replicase [Bovine coronavirus],MSKINKYGLELHWAPEFPWMFEDAEEKLDNPSSSEVDIVCSTTAQK...
