In [26]:
import numpy as np
from IPython.display import display, HTML
from bokeh.models import CustomJS
from bokeh.io import curdoc, show, output_file, save
from bokeh.models import ColumnDataSource, Grid, LinearAxis, Plot, Text
#imports
import os
import csv
import sklearn
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from eli5 import show_weights, show_prediction
import seaborn as sns
from sklearn.manifold import TSNE
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from eli5 import show_weights
from sklearn.linear_model import LogisticRegression
from IPython.display import display, clear_output
import eli5

#custom
from py.orf1ab_dash_board import DataProcessing, get_dashboard



Using TensorFlow backend.


In [126]:
def show_plot(plot, filename="bars.html"):
    """Save a Bokeh plot to an HTML file and display that file in the notebook.

    Parameters
    ----------
    plot
        Bokeh object to render; it is handed unchanged to ``save``.
    filename : str, optional
        Path of the HTML file to write and then display. Defaults to
        ``"bars.html"``, the path this helper has always used.

    Returns
    -------
    None
    """
    # output_file() selects the file that the argument-less save() below writes to.
    output_file(filename)
    save(plot)
    # Pass the path through ``filename=`` so the saved file is read explicitly,
    # rather than relying on IPython deciding whether a bare string is markup
    # or a path on disk.
    display(HTML(filename=filename))
    
    
def get_data(orf1):
    """Return a cleaned copy of the table produced by ``orf1.get_amino_df()``.

    Cleaning steps:

    * every row whose ``Accession`` occurs more than once is dropped
      (``keep=False`` discards *all* copies, not just the extra ones);
    * ``Collection_Date`` and ``Release_Date`` are re-formatted as
      ``YYYY-MM-DD`` strings; values that cannot be parsed become missing;
    * ``Length`` is converted to ``str``.

    The frame returned by ``orf1.get_amino_df()`` is left untouched, so calling
    this function repeatedly gives the same result even if ``orf1`` hands out
    the same frame object each time.

    Parameters
    ----------
    orf1
        Object exposing ``get_amino_df()`` that returns a DataFrame with at
        least the columns ``Accession``, ``Collection_Date``, ``Release_Date``
        and ``Length``.

    Returns
    -------
    pandas.DataFrame
        New frame with the original index labels of the surviving rows.
    """
    # Read the raw table and report its size before de-duplication.
    raw = orf1.get_amino_df()
    print(f"shape WITH duplicates: {raw.shape}")

    # Non-inplace drop: builds a new frame instead of editing ``raw``.
    deduped = raw.drop_duplicates(subset='Accession', keep=False)
    print(f"shape WITHOUT duplicates: {deduped.shape}")

    def _iso_date(column):
        # errors='coerce' turns unparseable entries into NaT instead of raising.
        return pd.to_datetime(column, errors='coerce').dt.strftime('%Y-%m-%d')

    # assign() copies before writing, so neither ``raw`` nor ``deduped`` is
    # modified and no SettingWithCopyWarning is triggered.
    return deduped.assign(
        Collection_Date=_iso_date(deduped['Collection_Date']),
        Release_Date=_iso_date(deduped['Release_Date']),
        Length=deduped['Length'].apply(str),
    )
    
def filter_column(df, column_name, min_count):
    '''
    Keep only the rows whose value in ``column_name`` is frequent enough.

    df: dataframe to filter (not modified)
    column_name: column whose values are counted
    min_count: a value must occur at least this many times for its rows to be kept

    Prints the list of values that passed the threshold and returns the
    matching rows of ``df``.
    '''
    frequency = Counter(df[column_name])

    # Collect the values that reach the threshold, in the order Counter holds them.
    kept_values = []
    for value, n_rows in frequency.items():
        if n_rows >= min_count:
            kept_values.append(value)
    print(kept_values)

    row_mask = df[column_name].isin(kept_values)
    return df[row_mask]

In [127]:
# DataProcessing is given the FASTA file and the metadata CSV.
# NOTE(review): presumably it pairs sequences with their metadata rows —
# confirm in py.orf1ab_dash_board.
orf1 = DataProcessing('coronavirus_orf1ab.fasta', 'coronavirus_orf1ab_meta.csv')
results_path = os.path.join(os.getcwd(), 'results')

# Column to filter on, and the minimum number of rows a value of that column
# needs in order to be kept (named here instead of an inline magic number).
column_name = 'Species'
MIN_ROWS_PER_VALUE = 20
df = filter_column(get_data(orf1), column_name, MIN_ROWS_PER_VALUE)
df.head()

shape WITH duplicates: (3046, 10)
shape WITHOUT duplicates: (2384, 10)
['Betacoronavirus 1', 'Coronavirus HKU15', 'Human coronavirus 229E', 'Middle East respiratory syndrome-related coronavirus', 'Alphacoronavirus 1', 'Avian coronavirus', 'Human coronavirus HKU1', 'Human coronavirus NL63', 'Severe acute respiratory syndrome-related coronavirus', 'Porcine epidemic diarrhea virus', 'Alphacoronavirus sp.']


Unnamed: 0,Accession,Release_Date,Species,Length,Geo_Location,Host,Isolation_Source,Collection_Date,GenBank_Title,seq
1,YP_009555238,2019-02-21,Betacoronavirus 1,7095,USA,,,,Orf1ab [Human coronavirus OC43],MSKINKYGLELHWAPEFPWMFEDAEEKLDNPSSSEVDMICSTTAQK...
4,YP_009513020,2018-08-24,Coronavirus HKU15,6267,China: Hong Kong,Sus scrofa,,2010-01-01,replicase polyprotein [Porcine coronavirus HKU15],MAKNKSKRDAIALPENVPPPLQLFIHVAAAEEGHPKVTTYLGNYNL...
13,YP_009194637,2016-01-05,Human coronavirus 229E,6763,Saudi Arabia,Camelus,,2015-03-01,polyprotein ORF1ab [Camel alphacoronavirus],MACNRVTLAVASDTEISATGCSTIALAVRRYSEAASNGFRACRFVS...
17,YP_009047202,2014-07-23,Middle East respiratory syndrome-related coron...,7078,,Homo sapiens,,2012-06-13,1ab polyprotein [Middle East respiratory syndr...,MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLS...
20,YP_007188577,2012-12-13,Middle East respiratory syndrome-related coron...,7078,United Kingdom,Homo sapiens,,2012-09-11,ORF1b protein [Betacoronavirus England 1],MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLS...


In [132]:
import pandas as pd
from bokeh.plotting import figure
from bokeh.io import output_file, show, output_notebook
from bokeh.models import ColumnDataSource,Range1d, LabelSet #FactorRange
# Sequence string of the first row of the filtered frame.
seq = df['seq'].values[0]

# Per-row height: used for the figure's pixel height (ht * number of rows)
# and, together with a fixed 50, for the vertical spacing of the labels.
ht = 32
output_notebook()
# Intended input (currently disabled): the sequence cut into 65-character rows.
# seqs = [seq[i:i+65] for i in range(0, len(seq), 65)] * 20 #65
# NOTE(review): placeholder rows — `seq` is not plotted while this is active.
seqs = ['a', 'b']
# One label per row: all at x = 0, stacked at y = 0, ht + 50, 2 * (ht + 50), ...
data = {'x':[0] * len(seqs), 'y':[(ht + 50) * i for i in range(len(seqs))], 'seq': seqs}
source = ColumnDataSource(data=data)
# NOTE(review): y_range is fixed at 0..1200 while the figure height and the
# label positions scale with len(seqs) — confirm this is intended.
p = figure(plot_height=ht*len(seqs), plot_width=800,
    tools="pan,xwheel_zoom,reset,save,crosshair,box_zoom",
    active_drag='pan',
    active_scroll='xwheel_zoom',
    x_range=Range1d(0, 100, bounds="auto"),
    y_range=Range1d(0, 1200, bounds="auto")
    )
# p.circle(x='x', y='y', color="blue", size =.01, source = source)

# Text labels: the 'seq' field of `source` drawn at each (x, y), anchored at
# the bottom-left of the text, on a translucent green background.
labels = LabelSet(x='x', y='y', text='seq', level='glyph', 
                  x_offset=0, y_offset=0, source=source,
                  text_color='black', text_alpha=0.9,
                  text_font_size='13pt',
                  text_baseline='bottom', text_align='left',
                  background_fill_color='green', background_fill_alpha=0.2,
                  render_mode='canvas')


p.add_layout(labels)
# Hide both axes and both grids so only the labels are visible.
p.yaxis.visible = False
p.xaxis.visible = False
p.xgrid.visible = False
p.ygrid.visible = False
# show_plot is the helper defined in an earlier cell of this notebook.
show_plot(p)

In [133]:
# Print the complete sequence string held in `seq` (set in the plotting cell).
print(seq)

MSKINKYGLELHWAPEFPWMFEDAEEKLDNPSSSEVDMICSTTAQKLETDGICPENHVMVDCRRLLKQECCVQSSLIREIVMNASPYDLEVLLQDALQSREAVLVTTPLGMSLEACYVRGCNPKGWTMGLFRRRSVCNTGRCTVNKHVAYQLYMIDPAGVCLGAGQFVGWVIPLAFMPVQSRKFIVPWVMYLRKRGEKGAYNKDHGRGGFGHVYDFKVEDAYDQVHDEPKGKFSKKAYALIRGYRGVKPLLYVDQYGCDYTGSLADGLEAYADKTLQEMKALFPTWSQELLFDVIVAWHVVRDPRYVMRLQSAATIRSVAYVANPTEDLCDGSVVIKEPVHVYADDSIILRQYNLVDIMSHFYMEADTVVNAFYGVALKDCGFVMQFGYIDCEQDSCDFKGWIPGNMIDGFACTTCGHVYEVGDLMAQSSGVLPVNPVLHTKSAAGYGGFGCKDSFTLYGQTVVYFGGCVYWSPARNIWIPILKSSVKSYDSLVYTGVLGCKAIVKETNLICKALYLDYVQHKCGNLHQRELLGVSDVWHKQLLLNRGVYKPLLENIDYFNMRRAKFSLETFTVCADGFMPFLLDDLVPRAYYLAVSGQAFCDYADKLCHAVVSKSKELLDVSLDSLGAAIHYLNSKIVDLAQHFSDFGTSFVSKIVHFFKTFTTSTALAFAWVLFHVLHGAYIVVESDIYFVKNIPRYASAVAQAFQSVAKVVLDSLRVTFIDGLSCFKIGRRRICLSGRKIYEVERGLLHSSQLPLDVYDLTMPSQVQKAKQKPIYLKGSGSDFSLADSVVEVVTTSLTPCGYSEPPKVAAKICIVDNVYMAKAGDKYYPVVVDDHVGLLDQAWRVPCAGRRVTFKEQPTVKEIISMPKIIKVFYELDNDFNTILNTACGVFEVDDTVDMEEFYAVVIDAIEEKLSPCKELEGVGAKVSAFLQKLEDNPLFLFDEAGEEVLAPKLYCAFTAPEDDDFLEESDVEEDDVEGEETDLTVTSAGQPCVASE