In [39]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import defaultdict, Counter
import wget
from Bio import SeqIO
from Bio.Alphabet import generic_protein
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split



In [40]:
# filename = wget.download("http://www.drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz")

# !tar xvzf $filename

In [41]:
class FastaMeta:
    '''
    object to hold fasta file and its corresponding meta file paths
    '''
    def __init__(self, fasta, meta):
        '''
        fasta: .fa file name
        meta: .csv file name
        '''
        self.data_path = 'https://raw.githubusercontent.com/covid19-bh-machine-learning/master/master/data/'
        # self.git_data_path = Path(Path(__file__).resolve().parent.parent.parent, "orf1ab-pyCode")
        self.fasta = os.path.join(self.data_path, fasta)
        self.meta = os.path.join(self.data_path, meta)

class DataProcessing(FastaMeta):
    def __init__(self, fasta, meta):
        super().__init__(fasta, meta)

    def get_amino_df(self):
        '''
        k = kmer length
        Generates all possible offsets of amino acid sequence and
        returns a pandas dataframe merged with given metadata
        meta_format ; csv tsv etc
        '''
        meta_df = pd.read_csv(self.meta, header=0)
        seq_seq = defaultdict(list)
        filename = self.fasta.split('/')[-1]
        if not os.path.exists(filename):
            filename = wget.download(self.fasta)
        seq_list = list(SeqIO.parse(filename, 'fasta', alphabet=generic_protein))
        for s in seq_list:
            seq_seq['seq'].append(str(s.seq))
        meta_df['seq'] = seq_seq['seq']
        return meta_df

In [42]:
def get_data(orf1):  
    # read for data folder and out put 
    df = orf1.get_amino_df()
    print(f"shape WITH duplicates: {df.shape}")

    # remove duplicates
    df.drop_duplicates(subset='Accession', keep=False, inplace=True)
    print(f"shape WITHOUT duplicates: {df.shape}")
    df['Collection_Date'] = pd.to_datetime(df['Collection_Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['Release_Date'] = pd.to_datetime(df['Release_Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['Length'] = df['Length'].apply(str)
    return df

def filter_column(df, column_name, min_count):
    '''
    df: dataframe
    column_name: column to filter
    min_count: minimum count required to be included
    '''
    counts = Counter(df[column_name])
    filtered = [key for key in counts if counts[key] >= min_count]
    print(filtered)
    df = df[df[column_name].isin(filtered)]
    return df[df[column_name].notna()]

def map_classes(df, column_name):
    #labels
    lbl_enc = LabelEncoder()
    y = lbl_enc.fit_transform(df[column_name].values)

    # map labels to numercial values
    #map labels to numerical value
    labels = list(lbl_enc.inverse_transform(y))
    return dict(zip(y, labels)), y

def get_test_data(df, column_name):
    
    class_dict, y = map_classes(df, column_name)
    #train test split
    xtrain, xvalid, ytrain, yvalid = train_test_split(df['seq'].values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)
    return xvalid, yvalid
   

In [54]:
orf1 = DataProcessing('coronavirus_orf1ab.fasta', 'coronavirus_orf1ab_meta.csv')
data = get_data(orf1)
tasks = ['Species', 'Host', 'Isolation_Source', 'Geo_Location']
df_species = None
df_host = None
df_geo = None
df_source = None
tasks_dict = dict(zip(tasks, [df_source, df_host, df_geo, df_species]))
for column_name in tasks_dict:
    df = filter_column(data, column_name, 20)
    a1, _ = get_test_data(df, column_name)
    print(len(df), len(a1))
    tasks_dict[column_name] = df[df['seq'].isin(a1)]

shape WITH duplicates: (3046, 10)
shape WITHOUT duplicates: (2384, 10)
['Betacoronavirus 1', 'Coronavirus HKU15', 'Human coronavirus 229E', 'Middle East respiratory syndrome-related coronavirus', 'Alphacoronavirus 1', 'Avian coronavirus', 'Human coronavirus HKU1', 'Human coronavirus NL63', 'Severe acute respiratory syndrome-related coronavirus', 'Porcine epidemic diarrhea virus', 'Alphacoronavirus sp.']
2190 219
[nan, 'Sus scrofa', 'Camelus', 'Homo sapiens', 'Chiroptera', 'Gallus gallus', 'Scotophilus kuhlii', 'Camelus dromedarius', 'Felis catus', 'Rhinolophus sinicus', 'Mus musculus']
1855 186
[nan, 'feces', 'abdominal cavity', 'oronasopharynx', 'lung, oronasopharynx']
767 77
['USA', 'Hong Kong', 'China: Hong Kong', 'China', 'Kenya', 'Saudi Arabia', nan, 'Colombia', 'South Korea', 'Viet Nam', 'China: Beijing', 'United Arab Emirates', 'USA: Illinois', 'USA: Minnesota', 'USA: Ohio', 'USA: Nashville, TN', 'USA: Denver, CO', 'USA: Tennessee']
1801 181


In [55]:
tasks_dict['Host'].head()

Unnamed: 0,Accession,Release_Date,Species,Length,Geo_Location,Host,Isolation_Source,Collection_Date,GenBank_Title,seq
17,YP_009047202,2014-07-23,Middle East respiratory syndrome-related coron...,7078,,Homo sapiens,,2012-06-13,1ab polyprotein [Middle East respiratory syndr...,MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLS...
41,YP_001039970,2007-02-21,Rousettus bat coronavirus HKU9,6930,China: Guangdong province,Chiroptera,,,orf1ab polyprotein [Rousettus bat coronavirus ...,MEGVPDPPKLKSMVVTTLKWCDPFANPNVTGWDIPIEEALEYAKQQ...
333,QBQ34487,2020-03-06,Coronavirus HKU15,6267,China,Sus scrofa,,2016-01-01,polyprotein 1ab [Porcine deltacoronavirus],MAKNKSKRDAIALPENVPPPLQLFIHVAAAEEGHPKVTTYLGNYNL...
398,QHB92363,2020-01-13,Porcine epidemic diarrhea virus,6781,China,Sus scrofa,,2018-03-01,polyprotein [Porcine epidemic diarrhea virus],MASNHVTLAFANDAEISAFGFCTASEAVSYYSEAAASGFMQCRFVS...
445,QGM12376,2019-11-25,Avian coronavirus,6639,Canada,Gallus gallus,,2017-01-01,1ab polyprotein [Infectious bronchitis virus],MASSLKQGVSPKPRDVILVAKDIPEQLCDALFFYTSHDPKDYADAF...


In [56]:
for task in tasks:
    tasks_dict[task].drop_duplicates(subset="seq", inplace=True)
    tasksdf = tasks_dict[task].sample(20, random_state=42)
    with open(f'orf1ab_{task}_test.fasta', 'w') as outFile:
        record_ids = tasksdf['Accession'].values
        for record in SeqIO.parse('coronavirus_orf1ab.fasta', 'fasta'):
            if record.id in record_ids:
                SeqIO.write(record, outFile, 'fasta')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [57]:
# # remove duplicate sequences
# from Bio import SeqIO
# with open('orf1ab.fasta', 'a') as outFile:
#     record_ids = set()
#     for record in SeqIO.parse('/kaggle/input/biohackathon-covid-ml-orf1ab-fasta-and-metadata/coronavirus_orf1ab.fasta', 'fasta'):
#         if record.id not in record_ids:
#             record_ids.add(record.id)
#             SeqIO.write(record, outFile, 'fasta')

In [58]:
#perform multiple sequence alignment
# from Bio.Align.Applications import ClustalwCommandline
# cline = ClustalwCommandline("/kaggle/working/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2", infile="orf1ab_Geo_Location_test.fasta")
# cline()
# from Bio import AlignIO
# align = AlignIO.read("data/opuntia.aln", "clustal")
# print(align)

In [59]:
# pwd

In [60]:
# rm muscle3.8.31_i86linux64.tar.gz 

In [61]:
#MUSCLE
from Bio.Align.Applications import MuscleCommandline
muscle_exe = "/home/aneesh/Projects/covid_bh_ml/master/orf1ab/muscle3.8.31_i86linux64"
in_file = "orf1ab_Geo_Location_test.fasta"
out_file = "orf1ab_Geo_Location_test_aligned.fasta"
muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
muscle_cline()
print ("done")

done


In [None]:
def make_fasta_from_df():

In [None]:
%%bash
ls

In [62]:

# do imports
import os, io, random
import string
import numpy as np

from Bio.Seq import Seq
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO, SeqIO

# import panel as pn
# import panel.widgets as pnw
# pn.extension()

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Plot, Grid, Range1d
from bokeh.models.glyphs import Text, Rect
from bokeh.layouts import gridplot
from IPython.display import display, HTML
from bokeh.io import output_file, show, save, curdoc, output_notebook
from bokeh.io import export_png
import matplotlib.pyplot as plt



In [63]:
def make_seq(length=40):    
    return ''.join([random.choice(['A','C','T','G']) for i in range(length)])

def mutate_seq(seq):
    """mutate a sequence randomly"""
    seq = list(seq)
    pos = np.random.randint(1,len(seq),6)    
    for i in pos:
        seq[i] = random.choice(['A','C','T','G'])
    return ''.join(seq)

def get_colors(seqs):
    """make colors for bases in sequence"""
    text = [i for s in list(seqs) for i in s]
    clrs =  {'G':'orange', 'P': 'orange', 'S':'orange', 'T':'orange', 'A':'orange',
            'H':'red', 'K':'red', 'R':'red', 'E':'red', 'D':'red',
            'F':'blue', 'W':'blue', 'Y':'blue',
            'I':'green', 'L':'green', 'M':'green', 'V':'green', 'C':'green',
            '-':'grey', 'X':'grey',
            'N':'magenta', 'Q':'magenta'}
#     G, P, S, T	Orange
# H, K, R	Red
# F, W, Y	Blue
# I, L, M, V	Green
    colors = [clrs[i] for i in text]
    return colors

def muscle_alignment(seqs):
    """Align 2 sequences with muscle"""
    filename = 'temp.faa'
    SeqIO.write(seqs, filename, "fasta")
    name = os.path.splitext(filename)[0]
    from Bio.Align.Applications import MuscleCommandline
    cline = MuscleCommandline(input=filename, out=name+'.txt')
    stdout, stderr = cline()
    align = AlignIO.read(name+'.txt', 'fasta')
    return align

In [106]:
# aligner ref https://dmnfarrell.github.io/bioinformatics/bokeh-sequence-aligner

def view_alignment(aln, fontsize="9pt", plot_width=800):
    """Bokeh sequence alignment view"""
    output_notebook()
    #make sequence and id lists from the aln object
    seqs = [rec.seq for rec in (aln)]
    ids = [rec.id for rec in aln]    
    text = [i for s in list(seqs) for i in s]
    colors = get_colors(seqs)    
    N = len(seqs[0])
    S = len(seqs)    
    width = .4

    x = np.arange(1,N+1)
    y = np.arange(0,S,1)
    #creates a 2D grid of coords from the 1D arrays
    xx, yy = np.meshgrid(x, y)
    #flattens the arrays
    gx = xx.ravel()
    gy = yy.flatten()
    #use recty for rect coords with an offset
    recty = gy+.5
    h= 1/S
    #now we can create the ColumnDataSource with all the arrays
    source = ColumnDataSource(dict(x=gx, y=gy, recty=recty, text=text, colors=colors))
    plot_height = len(seqs)*15+50
    x_range = Range1d(0,N+1, bounds='auto')
    if N>2000:
        viewlen=2000
    else:
        viewlen=N
    #view_range is for the close up view
    view_range = (0,viewlen)
    tools="xpan, wheel_zoom, reset, save"

    #entire sequence view (no text, with zoom)
    p = figure(title=None, plot_width= plot_width, plot_height=50,
               x_range=view_range, y_range=(0,S), tools=tools,
               min_border=0, toolbar_location='below')
    rects = Rect(x="x", y="recty",  width=1, height=1, fill_color="colors",
                 line_color=None, fill_alpha=0.6)
    p.add_glyph(source, rects)
    p.yaxis.visible = False
    p.grid.visible = False  

    #sequence text view with ability to scroll along x axis
    p1 = figure(title=None, plot_width=plot_width, plot_height=plot_height,
                x_range=view_range, y_range=ids, tools="xpan,reset",
                min_border=0, toolbar_location='below')#, lod_factor=1)          
    glyph = Text(x="x", y="y", text="text", text_align='center',text_color="black",
                text_font="monospace",text_font_size=fontsize)
    rects = Rect(x="x", y="recty",  width=1, height=1, fill_color="colors",
                line_color=None, fill_alpha=0.4)
    p1.add_glyph(source, glyph)
    p1.add_glyph(source, rects)

    p1.grid.visible = False
    p1.xaxis.major_label_text_font_style = "bold"
    p1.yaxis.minor_tick_line_width = 0
    p1.yaxis.major_tick_line_width = 0
    

    p = gridplot([[p],[p1]], toolbar_location='below') #, sizing_mode='stretch_both'
#     p.sizing_mode = 'scale_both'
    export_png(p, filename='plot.png')
#     plt.savefig("plot.png")
#     curdoc().add_root(p)
#     output_file('plot.html', mode='inline')
#     output_file("bars.html")
#     show(p)
#     display(HTML('plot.html'))
#     return p


In [107]:
BOKEH_RESOURCES='inline'
aln = AlignIO.read('orf1ab_Geo_Location_test_aligned.fasta','fasta')
view_alignment(aln, plot_width=15500) #p
# pn.pane.Bokeh(p)