In [1]:
import sys
import pandas as pd
import os
from path import Path
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from urllib.request import urlopen
import json
import statsmodels as sm
from statsmodels.formula.api import ols
from Bio import Seq, SeqIO, AlignIO, Phylo, Align
from jinja2 import Environment, FileSystemLoader  # html template engine
import cv2
import numpy as np
import skimage as sk
import matplotlib.pylab as plt
import datetime as dt

In [2]:
# Make the parent directory (relative to the working directory) importable.
# The guard keeps a re-run of this cell from appending the same entry again.
if '../' not in sys.path:
    sys.path.append('../')

In [3]:
import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br

In [4]:
# subs = pd.read_csv('/home/al/analysis/gisaid/subs_long_2021-01-18.csv.gz', compression='gzip')

In [5]:
# data = subs.loc[(subs['mutation']=='S:L452R')&(subs['location'].str.contains('San Diego'))]

# Data Priming

In [18]:
# Run configuration for the data-priming step.
is_test = False
msa_fp = '/home/al/analysis/gisaid/sequences_2021-01-20_16-44_aligned.fasta.gz'
meta_fp = '/home/al/analysis/gisaid/metadata_2021-01-20_17-49.tsv.gz'
# Take the date stamp from the file name only (sequences_<date>_<time>_aligned...).
# Splitting the full path would pick the wrong field as soon as any parent
# directory contains an underscore.
date = os.path.basename(msa_fp).split('_')[1]

In [19]:
# Echo the resolved date stamp and input paths, one per line.
print(date, msa_fp, meta_fp, sep='\n')

2021-01-20
/home/al/analysis/gisaid/sequences_2021-01-20_16-44_aligned.fasta.gz
/home/al/analysis/gisaid/metadata_2021-01-20_17-49.tsv.gz


In [20]:
# Identify substitutions per sample from the gzipped alignment and the
# metadata file; the second return value is not used.
subs, _ = bm.identify_replacements_per_sample(
    msa_fp,
    meta_fp,
    bm.GENE2POS,
    data_src='gisaid',
    is_gzip=True,
    test=is_test,
)

Loading Alignment file at: /home/al/analysis/gisaid/sequences_2021-01-20_16-44_aligned.fasta.gz
Initial cleaning...
Identifying mutations...
Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Fetching alternative codon...
Mapping amino acids...
Naming substitutions
Fusing with metadata...



Columns (8) have mixed types.Specify dtype option on import or set low_memory=False.



In [21]:
# Write the substitutions table to a gzip-compressed CSV whose name carries `date`.
subs_fp = f'/home/al/analysis/gisaid/subs_long_{date}.csv.gz'
subs.to_csv(
    subs_fp,
    compression='gzip',
    index=False,
)

In [22]:
# Identify deletions per sample (minimum length 3) from the gzipped alignment
# and the metadata file; the second return value is not used.
dels, _ = bm.identify_deletions_per_sample(
    msa_fp,
    meta_fp,
    bm.GENE2POS,
    data_src='gisaid',
    min_del_len=3,
    is_gzip=True,
    test=is_test,
)

Loading Alignment file at: /home/al/analysis/gisaid/sequences_2021-01-20_16-44_aligned.fasta.gz
Initial cleaning...
Identifying deletions...
Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Mapping amino acids...
Naming deletions
Fuse with metadata...



Columns (8) have mixed types.Specify dtype option on import or set low_memory=False.



In [23]:
# Write the deletions table to a gzip-compressed CSV whose name carries `date`.
dels_fp = f'/home/al/analysis/gisaid/dels_long_{date}.csv.gz'
dels.to_csv(
    dels_fp,
    compression='gzip',
    index=False,
)

In [24]:
# Count distinct `idx` values per deletion (reported as num_samples) and
# display the 50 deletions with the highest count.
(
    dels
    .groupby(['mutation', 'absolute_coords', 'del_len', 'del_seq'])
    .agg(num_samples=pd.NamedAgg(column='idx', aggfunc='nunique'))
    .reset_index()
    .nlargest(n=50, columns='num_samples')
)

Unnamed: 0,mutation,absolute_coords,del_len,del_seq,num_samples
2985,S:DEL69/70.0,21765:21770,6,catgtc,29741
1407,ORF1a:DEL3675/3677.0,11288:11296,9,tggttttaa,22862
2817,S:DEL144/144.0,21991:21993,3,att,22602
686,ORF1a:DEL1/25.666666666666668,266:342,77,ggagagccttgtccctggtttcaacgagaaaacacacgtccaactc...,4541
526,ORF10:DEL37/39.0,29666:29674,9,cacatagca,2643
1531,ORF1a:DEL448/448.0,1605:1607,3,gac,2065
962,ORF1a:DEL141/143.0,686:694,9,gtcatttga,1646
279,N:DEL362/467.0,29357:29674,318,attcccaccaacagagcctaaaaaggacaaaaagaagaaggctgat...,967
1091,ORF1a:DEL2081/2081.0,6506:6508,3,taa,935
1625,ORF1a:DEL85/85.0,516:518,3,atg,909


# Analysis Report Generation

In [4]:
# Column to match on and the mutation set for the first report.
# Other candidates noted here previously: S:Q677H, M:A85S, N:D377Y.
feature = 'mutation'
values = [
    'ORF1a:I4205V',
    'ORF1b:D1183Y',
    'S:S13I',
    'S:W152C',
    'S:L452R',
]

# Settings handed to br.generate_voc_data by the report cells.
input_params = {
    'date': '01/21/2021',
    # GISAID-derived inputs
    'gisaid_data_fp': '/home/al/analysis/gisaid/subs_long_2021-01-20.csv.gz',
    'gisaid_meta_fp': '/home/al/analysis/gisaid/metadata_2021-01-20_17-49.tsv.gz',
    'vocs': ['B.1.1.7', 'B.1.1.70'],
    'strain': 'CAVUI1',
    'msa_fp': Path('/home/al/analysis/gisaid/sequences_2021-01-20_16-44_aligned.fasta.gz'),
    # Local metadata, tree and substitutions files
    'meta_fp': Path('/home/al/code/HCoV-19-Genomics/metadata.csv'),
    'tree_fp': Path('/home/al/analysis/alab_mutations_01-01-2021/alab/seqs_aligned.fa.treefile'),
    'subs_fp': '/home/al/analysis/alab_mutations_01-01-2021/alab_substitutions_long_01-01-2021.csv',
    # GeoJSON files
    'countries_fp': '/home/al/data/geojsons/countries.geo.json',
    'states_fp': "/home/al/data/geojsons/us-states.json",
    'counties_fp': '/home/al/data/geojsons/us-counties.json',
    'patient_zero': 'NC_045512.2',
    # B.1.1.7 inputs
    'b117_meta': '/home/al/analysis/b117/nextstrain_groups_neherlab_ncov_S.N501_metadata.tsv',
    'b117_tree': 'test_data/b117_seqs_aligned.fasta.treefile',
    'sample_sz': 300,
    'sampling_img_fp': "/home/al/analysis/b117/figs/sars-cov-2_EM_v3.png",
}

In [7]:
# Build the report data for the mutation set in `values`, using the settings
# in `input_params`.
# NOTE(review): the structure of `results` is defined in reports.py and is not
# visible from this notebook.
results = br.generate_voc_data(feature, values, input_params)

In [8]:
# Generate the report HTML from `results` with the 'mut.html' template, then
# save it as test_data/cavui1_report.html.
html = br.generate_voc_html(feature, values, results, template_name='mut.html')
# Disabled alternative output path:
# br.save_html(html, f'test_data/orf1ab_i4205v_report.html')
br.save_html(html, f'test_data/cavui1_report.html')

In [9]:
# Second report: narrow the mutation set to the three S: substitutions and
# relabel the run. This overwrites input_params['strain'] in place.
feature = 'mutation'
values = [
    'S:S13I',
    'S:W152C',
    'S:L452R',
]
input_params['strain'] = 'CAVUI1S'

In [10]:
# Rebuild the report data for the current `values` and `input_params`
# (this rebinds `results` from any earlier report cell).
results = br.generate_voc_data(feature, values, input_params)

In [11]:
# Generate the report HTML from `results` with the 'mut.html' template, then
# save it as test_data/cavui1s_report.html.
html = br.generate_voc_html(feature, values, results, template_name='mut.html')
# Disabled alternative output path:
# br.save_html(html, f'test_data/orf1ab_i4205v_report.html')
br.save_html(html, f'test_data/cavui1s_report.html')

In [5]:
# One report per individual mutation: output-file stem -> mutation name.
runs = {
    'orf1a_i4205v': 'ORF1a:I4205V',
    'orf1b_d1183y': 'ORF1b:D1183Y',
    's_s13i': 'S:S13I',
    's_w152c': 'S:W152C',
    's_l452r': 'S:L452R',
}
for name, mut in runs.items():
    # Override 'strain' on a per-run copy instead of writing into the shared
    # input_params dict. Mutating it in place left the label of the last run
    # behind, so re-running another report cell silently used the wrong strain.
    run_params = {**input_params, 'strain': mut}
    results = br.generate_voc_data(feature, [mut], run_params)
    html = br.generate_voc_html(feature, [mut], results, template_name='mut.html')
    br.save_html(html, f'test_data/{name}_report.html')

In [17]:
# Load the gzip-compressed table referenced by input_params['gisaid_data_fp'].
gisaid = pd.read_csv(
    input_params['gisaid_data_fp'],
    compression='gzip',
)

In [24]:
# Mutations to screen for in the cells that follow.
mutations = [
    'S:Q677H',
    'M:A85S',
    'N:D377Y',
]

In [30]:
# One row per (country, division, strain), with that group's distinct
# `mutation` values collected in a `mutations` column.
m = (
    gisaid
    .groupby(['country', 'division', 'strain'])
    .agg(mutations=pd.NamedAgg(column='mutation', aggfunc='unique'))
    .reset_index()
)


In [31]:
# Add an `is_vui` column: each row's `mutations` value is passed to bv.is_vui
# together with set(mutations) as the second positional argument.
# NOTE(review): presumably True when the strain carries the mutations of
# interest — confirm the exact rule (any vs. all) in visualize.py.
m['is_vui'] = m['mutations'].apply(bv.is_vui, args=(set(mutations),))

In [32]:
# Count rows per division among those where `is_vui` equals True.
m.loc[m['is_vui'].eq(True), 'division'].value_counts()

Ontario          25
New York         10
Maryland          2
Wisconsin         2
West Virginia     1
Massachusetts     1
Michigan          1
Texas             1
Name: division, dtype: int64

In [16]:
# res = (gisaid.groupby(['date', 'country', 'division', 
#                                         'purpose_of_sequencing',
#                                         'location', 'pangolin_lineage', 'strain'])
#                        .agg(mutations=('mutation', 'unique')).reset_index())
# res['is_vui'] = res['mutations'].apply(bv.is_vui, args=(set(values),))

In [12]:

def get_mutations(data: pd.DataFrame, lineage: str='B.1.1.7'):
    """Return the set of distinct `mutation` values recorded for one lineage.

    Parameters
    ----------
    data : frame with at least `pangolin_lineage` and `mutation` columns.
    lineage : value of `pangolin_lineage` to select rows by.

    Returns
    -------
    set of the unique `mutation` values among the selected rows; empty when
    no row matches `lineage`.
    """
    in_lineage = data['pangolin_lineage'] == lineage
    return set(data.loc[in_lineage, 'mutation'].unique().tolist())