In [1]:
import json
import os

import dateutil
import dateutil.parser
import pandas as pd
import requests

In [2]:
# Root URL of the C3.ai COVID API (version 1); endpoint paths are appended to it.
base_url = "https://api.c3.ai/covid/api/1"

# Every request sends a JSON body and asks for a JSON response.
headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}

In [3]:
def fetch_data(url, headers=None, data=None):
    """POST a fetch request to ``url`` and collect every page of results.

    The request body is ``data`` with ``spec.limit`` forced to -1 and
    ``spec.offset`` advanced by the server-reported ``count`` after each
    page, until the server reports ``hasMore`` as false.
    NOTE(review): limit=-1 presumably means "server default / no explicit
    limit" for this API - confirm against the API documentation.

    Parameters
    ----------
    url : str
        Fetch endpoint to POST to.
    headers : dict, optional
        HTTP headers passed through to ``requests.post``.
    data : dict, optional
        Request body. A missing body or missing ``'spec'`` entry is treated
        as an empty spec. The caller's dict is never modified.

    Returns
    -------
    pandas.DataFrame
        One row per returned object, indexed by its ``'id'`` field. When the
        server returns no objects, an empty frame whose index is named
        ``'id'`` is returned.

    Raises
    ------
    RuntimeError
        If a response status is not 200, or if the server claims more
        results exist but reports a non-positive count (which would
        otherwise loop forever).
    """
    # Copy the body and its spec so paging bookkeeping never leaks into the
    # dict the caller passed in (and so data=None works).
    payload = dict(data) if data else {}
    spec = dict(payload.get('spec') or {})
    spec['limit'] = -1
    spec['offset'] = 0
    payload['spec'] = spec

    results = []
    hasMore = True

    while hasMore:
        r = requests.post(url,
                          headers=headers,
                          data=json.dumps(payload))
        if r.status_code != 200:
            raise RuntimeError("Problem fetching data! {} ({})".format(r.text, r.request.body))

        res_json = r.json()
        page = res_json.get('objs', [])
        results += page

        hasMore = res_json.get('hasMore', False)
        count = res_json.get('count', len(page))
        if hasMore and count <= 0:
            # The offset cannot advance, so the same page would be requested forever.
            raise RuntimeError(
                "Problem fetching data! Server reported more results but "
                "returned none at offset {}".format(spec['offset']))
        spec['offset'] += count

    if not results:
        # No 'id' column exists to index on; return an empty, consistently indexed frame.
        return pd.DataFrame(index=pd.Index([], name='id'))

    result_df = pd.DataFrame(results)
    result_df = result_df.set_index('id')
    return result_df

## Data Exploration

In [13]:
# Request body with an empty spec: no filter or include is given.
data = {
    'spec': {
    }
}

# Join URL segments with '/' explicitly: os.path.join uses the OS path
# separator and would put backslashes into the URL on Windows.
bioasset_df = fetch_data('/'.join([base_url, 'biologicalasset', 'fetch']),
                         headers=headers,
                         data=data)

In [16]:
# Keep only the assets whose assetType is 'nucleotide sequence'.
nucleotideseq_df = bioasset_df.loc[bioasset_df['assetType'] == 'nucleotide sequence']

In [27]:
# Parse each ISO-8601 releaseDate string into a datetime.
# dateutil.parser is a submodule, so it must be imported explicitly; a bare
# `import dateutil` only exposes it if another package happened to load it.
dates = nucleotideseq_df.releaseDate.apply(dateutil.parser.isoparse)

In [32]:
# Display the nucleotide-sequence records (the rendered table elides the middle rows).
nucleotideseq_df

Unnamed: 0_level_0,sequence,assetType,sequenceType,species,genus,family,authors,genBankTitle,releaseDate,meta,version,publications,location,nucleotideCompleteness,host,collectionDate,isolationSource,bioSample
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
7BV2_P,{'id': '7BV2_P'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Yin,W., Mao,C., Luan,X., Shen,D.D., Shen,Q., S...","Chain P, The nsp12-nsp7-nsp8 complex bound to ...",2020-04-22T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32358203,,,,,,
7BV2_T,{'id': '7BV2_T'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Yin,W., Mao,C., Luan,X., Shen,D.D., Shen,Q., S...","Chain T, Templete",2020-04-22T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32358203,,,,,,
LC522350,{'id': 'LC522350'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Nicolasora,A.D., Mercado,E.S., Polotan,F.M., M...",Severe acute respiratory syndrome coronavirus ...,2020-02-08T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'Philippines'},,Homo sapiens,2020-01-26T00:00:00Z,,
LC523807,{'id': 'LC523807'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Mercado,E.S., Manalo,J.I., Nicolasora,A.D., Me...",Severe acute respiratory syndrome coronavirus ...,2020-02-13T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'Philippines'},,Homo sapiens,2020-02-06T00:00:00Z,,
LC523808,{'id': 'LC523808'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Mercado,E.S., Manalo,J.I., Nicolasora,A.D., Me...",Severe acute respiratory syndrome coronavirus ...,2020-02-13T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'Philippines'},,Homo sapiens,2020-01-26T00:00:00Z,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT612330,{'id': 'MT612330'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Caly,L., Seemann,T., Sait,M., Schultz,M.B., Dr...",Severe acute respiratory syndrome coronavirus ...,2020-06-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'Victoria_Australia'},,Homo sapiens,2020-05-26T00:00:00Z,,
MT612331,{'id': 'MT612331'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Caly,L., Seemann,T., Sait,M., Schultz,M.B., Dr...",Severe acute respiratory syndrome coronavirus ...,2020-06-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'Victoria_Australia'},,Homo sapiens,2020-05-26T00:00:00Z,,
MT612332,{'id': 'MT612332'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Caly,L., Seemann,T., Sait,M., Schultz,M.B., Dr...",Severe acute respiratory syndrome coronavirus ...,2020-06-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'Victoria_Australia'},,Homo sapiens,2020-05-26T00:00:00Z,,
MT612333,{'id': 'MT612333'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Caly,L., Seemann,T., Sait,M., Schultz,M.B., Dr...",Severe acute respiratory syndrome coronavirus ...,2020-06-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'Victoria_Australia'},,Homo sapiens,2020-05-26T00:00:00Z,,


In [38]:
def is_whole_genome(n):
    """Return True when ``n`` is a string containing 'complete genome'.

    Non-string values (for example the NaN pandas uses for a missing
    title) yield False. An explicit type check replaces the former bare
    ``except:``, which also swallowed KeyboardInterrupt and SystemExit.
    """
    return isinstance(n, str) and 'complete genome' in n

# Keep the rows whose GenBank title marks them as a complete genome.
whole_genomes_df = nucleotideseq_df.loc[
    nucleotideseq_df['genBankTitle'].map(is_whole_genome)
]

In [40]:
# Drop the genomes that have no location recorded.
whole_genomes_df = whole_genomes_df.dropna(subset=['location'])

In [43]:
# Each location value is a dict; select the rows whose location 'id' is 'China'.
china_whole_genomes_df = whole_genomes_df.loc[
    whole_genomes_df['location'].map(lambda loc: loc['id']).eq('China')
]

In [44]:
# Number of complete genomes located in China.
china_whole_genomes_df.shape[0]

24

In [45]:
# Preview the China complete-genome records.
# (The previous cell body ended in an unfinished `lambda`, which is a SyntaxError.)
china_whole_genomes_df.head(10)

Unnamed: 0_level_0,sequence,assetType,sequenceType,species,genus,family,authors,genBankTitle,releaseDate,meta,version,publications,location,nucleotideCompleteness,host,collectionDate,isolationSource,bioSample
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
MN908947,{'id': 'MN908947'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Son...",Severe acute respiratory syndrome coronavirus ...,2020-01-12T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32015508,{'id': 'China'},,Homo sapiens,2019-12-01T00:00:00Z,,
MN975262,{'id': 'MN975262'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Chan,J.F.-W., Yuan,S., Kok,K.H., To,K.K.-W., C...",Severe acute respiratory syndrome coronavirus ...,2020-01-24T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'China'},,Homo sapiens,2020-01-11T00:00:00Z,"lung, oronasopharynx",
MN988668,{'id': 'MN988668'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Chen,L., Liu,W., Zhang,Q., Xu,K., Ye,G., Wu,W....",Severe acute respiratory syndrome coronavirus ...,2020-01-28T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'China'},,Homo sapiens,2020-01-02T00:00:00Z,,
MN988669,{'id': 'MN988669'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Chen,L., Liu,W., Zhang,Q., Xu,K., Ye,G., Wu,W....",Severe acute respiratory syndrome coronavirus ...,2020-01-28T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,,{'id': 'China'},,Homo sapiens,2020-01-02T00:00:00Z,,
MT039874,{'id': 'MT039874'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Zhang,X.A., Fan,H., Qi,R.Z., Zheng,W., Zheng,K...",Severe acute respiratory syndrome coronavirus ...,2020-04-11T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32222421,{'id': 'China'},,Homo sapiens,2020-01-22T00:00:00Z,,
MT079843,{'id': 'MT079843'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Wang,X., Zhou,Q., He,Y., Liu,L., Ma,X., Wei,X....",Severe acute respiratory syndrome coronavirus ...,2020-05-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32366488,{'id': 'China'},,Homo sapiens,2020-01-22T00:00:00Z,,
MT079844,{'id': 'MT079844'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Wang,X., Zhou,Q., He,Y., Liu,L., Ma,X., Wei,X....",Severe acute respiratory syndrome coronavirus ...,2020-05-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32366488,{'id': 'China'},,Homo sapiens,2020-01-22T00:00:00Z,,
MT079845,{'id': 'MT079845'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Wang,X., Zhou,Q., He,Y., Liu,L., Ma,X., Wei,X....",Severe acute respiratory syndrome coronavirus ...,2020-05-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32366488,{'id': 'China'},,Homo sapiens,2020-01-22T00:00:00Z,,
MT079846,{'id': 'MT079846'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Wang,X., Zhou,Q., He,Y., Liu,L., Ma,X., Wei,X....",Severe acute respiratory syndrome coronavirus ...,2020-05-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32366488,{'id': 'China'},,Homo sapiens,2020-01-22T00:00:00Z,,
MT079847,{'id': 'MT079847'},nucleotide sequence,GenBank,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,"Wang,X., Zhou,Q., He,Y., Liu,L., Ma,X., Wei,X....",Severe acute respiratory syndrome coronavirus ...,2020-05-15T00:00:00Z,"{'tenantTagId': 4, 'tenant': 'covid', 'tag': '...",1,32366488,{'id': 'China'},,Homo sapiens,2020-01-22T00:00:00Z,,


In [4]:
# Fetch sequences directly from the data lake, restricted server-side to
# complete genomes located in China.

data = {
    'spec': {
        'filter': '(location.id == "China") && contains(genBankTitle, "complete genome")',
        'include': 'sequence.sequence'
    }
}

# Join URL segments with '/' explicitly: os.path.join uses the OS path
# separator and would put backslashes into the URL on Windows.
df = fetch_data('/'.join([base_url, 'biologicalasset', 'fetch']),
                headers=headers,
                data=data)
# Each `sequence` value is a dict; copy its 'sequence' entry into its own column.
df['raw_sequence'] = df.sequence.apply(lambda s: s['sequence'])

## Saving sequences to FASTA file

In [6]:
# Write to fasta
def write_sequences_to_fasta(filename, df):
    """Write one FASTA record per row of ``df`` to ``filename``.

    Each record is a ``>`` header line carrying the row's index label,
    followed by one line holding the row's ``raw_sequence`` value. The file
    is created or truncated; an empty frame produces an empty file.

    Parameters
    ----------
    filename : str
        Path of the FASTA file to write.
    df : pandas.DataFrame
        Frame with a ``raw_sequence`` column; the index supplies record ids.

    Raises
    ------
    KeyError
        If ``df`` has no ``raw_sequence`` column.
    """
    with open(filename, 'w') as file:
        # Iterate explicitly instead of DataFrame.apply(axis=1): apply builds a
        # throwaway result and, on an empty frame, may call the callback once
        # with a placeholder row, which would write a bogus record.
        for seq_id, raw_sequence in df['raw_sequence'].items():
            file.write('>{}\n'.format(seq_id))
            file.write('{}\n'.format(raw_sequence))

# Only the first four sequences are written to the FASTA file.
fasta_file = 'china_whole_genomes_short.fa'
write_sequences_to_fasta(fasta_file, df.head(4))

## Run ClustalW multiple sequence alignment

In [10]:
import subprocess

In [14]:
# Build the ClustalW command line: align the FASTA file written above as DNA
# and write the alignment in PHYLIP format to `aligned_file`.
from Bio.Align.Applications import ClustalwCommandline
import subprocess
aligned_file = 'china_whole_genomes_short.phy'
clustalw_cline = ClustalwCommandline("clustalw2", align=True, infile=fasta_file, type='dna', outfile=aligned_file, output='phylip')
# IPython shell escape: the command object is rendered to a string and run in a shell.
# NOTE(review): this needs a `clustalw2` executable on PATH - confirm for your environment.
!{str(clustalw_cline)}




 CLUSTAL 2.1 Multiple Sequence Alignments


Sequence type explicitly set to DNA
Sequence format is Pearson
Sequence 1: MN908947   29903 bp
Sequence 2: MN975262   29891 bp
Sequence 3: MN988668   29881 bp
Sequence 4: MN988669   29881 bp
Start of Pairwise alignments
Aligning...

Sequences (1:2) Aligned. Score:  99
Sequences (1:3) Aligned. Score:  100
Sequences (1:4) Aligned. Score:  100
Sequences (2:3) Aligned. Score:  99
Sequences (2:4) Aligned. Score:  99
Sequences (3:4) Aligned. Score:  100
Guide tree file created:   [china_whole_genomes_short.dnd]

There are 3 groups
Start of Multiple Alignment

Aligning...
Group 1: Sequences:   2      Score:567739
Group 2: Sequences:   3      Score:567739
Group 3: Sequences:   4      Score:567754
Alignment Score 1197212

PHYLIP-Alignment file created   [china_whole_genomes_short.phy]



## Generate the phylogenetic tree

In [15]:
from Bio import AlignIO, SeqIO
from phylogenetics import PhylogeneticsProject
from ete3 import PhyloTree, Tree

In [16]:
# Phylogenetic tree
# --------------------------------------------------------
# Create the project; its working directory is 'project' (change project_dir
# to write somewhere else).
project = PhylogeneticsProject(overwrite=True, project_dir='project')

# Load the aligned sequences. `path` is the alignment file produced above and
# `schema` names its file format.
project.read_data(schema='phylip', path=aligned_file)

# Compute the tree for nucleotide data ('nt') with the HKY85 model.
project.compute_tree(model='HKY85', datatype='nt')

In [17]:
def get_species_name(uid):
    """Return the description stored for ``uid`` in ``project.data``.

    Falls back to ``uid`` itself when the description is missing (NA) or
    when the lookup fails for any reason, so the caller always gets a label.
    """
    try:
        description = project.data.set_index('uid').loc[uid, 'description']
        return uid if pd.isna(description) else description
    except Exception:
        # Best-effort labelling: any lookup problem falls back to the raw uid.
        return uid

# Load the tree we constructed from its Newick file; node species labels are
# derived through get_species_name.
t = PhyloTree(sp_naming_function=get_species_name,
              newick='project/compute_tree.phy_phyml_tree.txt')

In [18]:
# Print the tree as ASCII art, showing each node's `species` attribute.
print(t.get_ascii(attributes=['species']))


   /-MN908947
  |
--|--MN975262
  |
  |   /-MN988669
   \-|
      \-MN988668
