# Analyse the features of different species
- We will analyse every species that occurs at least 20 times in the dataset.



## Download the data

In [1]:
#imports
import os
import csv
import sklearn
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from eli5 import show_weights, show_prediction
import seaborn as sns
from sklearn.manifold import TSNE
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from eli5 import show_weights
from sklearn.linear_model import LogisticRegression
from IPython.display import display, clear_output
import eli5
import joblib

#custom
from py.orf1ab_dash_board import DataProcessing, get_dashboard
from py.ml_metrics import evaluate_model, multiclass_logloss
from py.plotting import plot_tsne

Using TensorFlow backend.


# Use the following dashboard to get some context

In [2]:
# Render the interactive dashboard; the dropdown offers one entry per column listed here.
get_dashboard(
    df_columns=["ALL", "Species", "Geo_Location", "Host", "Isolation_Source"],
)

shape WITH duplicates: (3046, 10)
shape WITHOUT duplicates: (2384, 10)


Eplore the dataset here


Box(children=(Dropdown(options=('ALL', 'Species', 'Geo_Location', 'Host', 'Isolation_Source'), value='ALL'), O…

Output()

In [3]:


def get_data(orf1):
    """Load the amino-acid dataframe and return a cleaned copy of it.

    The frame returned by ``orf1.get_amino_df()`` is never modified: every
    step below produces a new object, so calling this function repeatedly
    (or re-running the cell) always starts from the same input.

    Parameters
    ----------
    orf1 : object
        Any object exposing ``get_amino_df()`` that returns a DataFrame with
        at least the columns ``Accession``, ``Collection_Date``,
        ``Release_Date`` and ``Length``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * every row whose ``Accession`` occurs more than once is removed
          (``keep=False``: no representative of a duplicated accession is kept),
        * ``Collection_Date`` / ``Release_Date`` are ``YYYY-MM-DD`` strings
          (values that cannot be parsed become missing),
        * ``Length`` is converted to ``str``.
    """
    raw = orf1.get_amino_df()
    print(f"shape WITH duplicates: {raw.shape}")

    # Non-inplace: leaves the frame owned by `orf1` untouched.
    deduped = raw.drop_duplicates(subset='Accession', keep=False)
    print(f"shape WITHOUT duplicates: {deduped.shape}")

    def _iso_date(column):
        # errors='coerce' turns unparseable entries into NaT instead of raising.
        return pd.to_datetime(column, errors='coerce').dt.strftime('%Y-%m-%d')

    # assign() returns a new frame and keeps the original column order.
    return deduped.assign(
        Collection_Date=_iso_date(deduped['Collection_Date']),
        Release_Date=_iso_date(deduped['Release_Date']),
        Length=deduped['Length'].apply(str),
    )

# Filter the data: keep only the species that occur at least 20 times


In [4]:
def filter_column(df, column_name, min_count):
    """Keep only the rows whose value in `column_name` is frequent enough.

    df: dataframe to filter (not modified)
    column_name: column whose values are counted
    min_count: a value must occur at least this many times to be kept

    The list of retained values is printed. Rows with a missing value in
    `column_name` are always dropped. Returns the filtered dataframe.
    """
    frequency = Counter(df[column_name])
    filtered = [value for value, n_rows in frequency.items() if n_rows >= min_count]
    print(filtered)

    column = df[column_name]
    keep_mask = column.isin(filtered) & column.notna()
    return df[keep_mask]
    

In [5]:
def map_classes(df, column_name):
    """Encode the labels in `column_name` as integers.

    df: dataframe holding the label column
    column_name: name of the label column

    Returns a tuple ``(class_dict, y)``:
      class_dict: dict mapping each integer code to its original label,
                  ordered by ascending code so that
                  ``list(class_dict.values())[i]`` is the label of code ``i``
      y: array with the integer code of every row of `df`
    """
    lbl_enc = LabelEncoder()
    y = lbl_enc.fit_transform(df[column_name].values)

    # `classes_[i]` is the label encoded as `i`. Building the mapping from it
    # (instead of zipping `y` with its decoded values) gives a dict ordered by
    # code rather than by first appearance in the data, which is what callers
    # need when they pass `class_dict.values()` as positional class names.
    class_dict = {code: label for code, label in enumerate(lbl_enc.classes_)}
    return class_dict, y


# Split the dataframe, train the model and save the results

In [6]:
def model_predict(df, column_name, kmer, results_path):
    """Fit a logistic regression on k-mer counts and save model, predictions and reports.

    df: dataframe with a 'seq' column (the sequences) and the label column
    column_name: name of the label column to predict
    kmer: k-mer length; the vectorizer counts character n-grams of exactly this length
    results_path: base directory for the outputs
        <results_path>/models/         -> pickled classifier
        <results_path>/<column_name>/  -> predictions, metrics and feature-weight CSVs
        Both directories are created when missing.

    Displays the classification report and the top feature weights in the
    notebook. Returns None.
    """
    class_dict, y = map_classes(df, column_name)
    # classification_report pairs target_names[i] with the i-th *sorted* label,
    # so derive both lists from the sorted codes instead of relying on dict order.
    label_ids = sorted(class_dict)
    label_names = [class_dict[label_id] for label_id in label_ids]

    # train test split
    xtrain, xvalid, ytrain, yvalid = train_test_split(df['seq'].values, y,
                                                      stratify=y,
                                                      random_state=42,
                                                      test_size=0.1, shuffle=True)

    # vectorize: character n-grams of length `kmer`, case preserved
    ctv = CountVectorizer(analyzer='char', ngram_range=(kmer, kmer), lowercase=False)
    # NOTE(review): the vocabulary is built from train AND validation sequences.
    # Kept as in the original; confirm this is intended rather than fitting on
    # the training split only.
    ctv.fit(list(xtrain) + list(xvalid))
    xtrain_ctv = ctv.transform(xtrain)
    xvalid_ctv = ctv.transform(xvalid)

    # Fitting a simple Logistic Regression on Counts
    clf = LogisticRegression(C=1.0, max_iter=4000)
    clf.fit(xtrain_ctv, ytrain)
    predictions = clf.predict(xvalid_ctv)

    # make report; passing `labels` keeps names and rows aligned even if a
    # class happens to be absent from the validation split
    report = classification_report(yvalid, predictions,
                                   labels=label_ids,
                                   target_names=label_names, output_dict=True)
    report = pd.DataFrame(report).transpose()

    # extract features
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the new accessor
    # when it exists and fall back for older installations.
    if hasattr(ctv, 'get_feature_names_out'):
        feature_names = list(ctv.get_feature_names_out())
    else:
        feature_names = ctv.get_feature_names()
    feature_imp = eli5.formatters.as_dataframe.explain_weights_df(
        clf, feature_names=feature_names)
    feature_imp = feature_imp.replace({"target": class_dict})

    # output folders (created on demand, including missing parents)
    model_root = os.path.join(results_path, 'models')
    data_root = os.path.join(results_path, column_name)
    os.makedirs(model_root, exist_ok=True)
    os.makedirs(data_root, exist_ok=True)

    # save results -- file names use the `kmer` argument, not a notebook global
    # model
    joblib.dump(clf, os.path.join(model_root, f"orf1_{column_name}_{kmer}_mer_lr_model.pkl"),
                compress=9)
    # predictions
    pred_label = [class_dict[i] for i in predictions]
    pd.DataFrame({column_name: pred_label, "pred": predictions}).to_csv(
        os.path.join(data_root, f"orf1_{column_name}_{kmer}_mer_lr_pred.csv"),
        index=False, header=True)

    # classification report
    report.to_csv(os.path.join(data_root, f"orf1_{column_name}_{kmer}_mer_lr_metrics.csv"),
                  index=True, header=True)

    # feature importance
    feature_imp.to_csv(os.path.join(data_root, f"orf1_{column_name}_{kmer}_mer_lr_feature.csv"),
                       index=False, header=True)

    # replace the previous iteration's output with the current results
    clear_output()
    display(report)
    display(show_weights(clf, vec=ctv, top=25, feature_filter=lambda x: x != '<BIAS>',
                         target_names=class_dict))
    print(f"---> Analysis of {kmer}-mer done! Results in {data_root}")
    
    

In [9]:
# Build the data source from the FASTA file and its metadata CSV, then clean it.
orf1 = DataProcessing('coronavirus_orf1ab.fasta', 'coronavirus_orf1ab_meta.csv')
data = get_data(orf1)

results_path = ''            # empty: output folders are relative to the working directory
MIN_COUNT = 20               # a label value must occur at least this often to be analysed
KMER_LENGTHS = range(1, 10)  # k = 1 .. 9

for column_name in ['Species']:
    df = filter_column(data, column_name, MIN_COUNT)
    # one model (and one set of result files) per k-mer length
    for k in KMER_LENGTHS:
        model_predict(df, column_name, k, results_path)
    

shape WITH duplicates: (3046, 10)
shape WITHOUT duplicates: (2384, 10)
['Betacoronavirus 1', 'Coronavirus HKU15', 'Human coronavirus 229E', 'Middle East respiratory syndrome-related coronavirus', 'Alphacoronavirus 1', 'Avian coronavirus', 'Human coronavirus HKU1', 'Human coronavirus NL63', 'Severe acute respiratory syndrome-related coronavirus', 'Porcine epidemic diarrhea virus', 'Alphacoronavirus sp.']


NameError: name 'results_path' is not defined