In [1]:
import sys
import pandas as pd
import os
import gc
import gzip
from path import Path
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from urllib.request import urlopen
import json
import statsmodels as sm
from statsmodels.formula.api import ols
from Bio import Seq, SeqIO, AlignIO, Phylo, Align
from jinja2 import Environment, FileSystemLoader  # html template engine
# import cv2
import numpy as np
import skimage as sk
import matplotlib.pylab as plt
import datetime as dt

In [2]:
# Add the parent directory to the module search path.
# Guarded so that re-running this cell does not append duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

In [3]:
import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br
import data as bd

In [4]:
# subs = pd.read_csv('/home/al/analysis/gisaid/subs_long_2021-01-18.csv.gz', compression='gzip')

In [5]:
# data = subs.loc[(subs['mutation']=='S:L452R')&(subs['location'].str.contains('San Diego'))]

# Data Priming

In [6]:
# Flag forwarded as `test=` to the (currently commented-out) bm.identify_* calls.
is_test = False
# Input files for this run: aligned sequences (FASTA) and gzipped metadata (TSV).
msa_fp = '/valhalla/gisaid/sequences_2021-01-30_aligned.fasta'
meta_fp = '/valhalla/gisaid/metadata_2021-01-30_10-36.tsv.gz'
# The snapshot date is the second '_'-separated field of the alignment file name.
# Only the basename is parsed, so an underscore in a parent directory cannot
# shift the field index.
date = os.path.basename(msa_fp).split('_')[1]

In [7]:
# Echo the run configuration, one value per line.
print(date, msa_fp, meta_fp, sep='\n')

2021-01-30
/valhalla/gisaid/sequences_2021-01-30_aligned.fasta
/valhalla/gisaid/metadata_2021-01-30_10-36.tsv.gz


In [8]:
# subs, _ = bm.identify_replacements_per_sample(msa_fp, meta_fp,  
#                                            bd.GENE2POS, data_src='gisaid', 
#                                            is_gzip=True, test=is_test)

In [9]:
# subs_fp = f'/valhalla/gisaid/subs_long_{date}.csv.gz'
# subs.to_csv(subs_fp, index=False, compression='gzip')

In [10]:
# dels, _ = bm.identify_deletions_per_sample(msa_fp, meta_fp,  
#                                            bd.GENE2POS, data_src='gisaid', 
#                                            min_del_len=1, is_gzip=True, 
#                                            test=is_test)

In [11]:
# dels_fp = f'/valhalla/gisaid/dels_long_{date}.csv.gz'
# dels.to_csv(dels_fp, index=False, compression='gzip')

In [12]:
# (dels.groupby(['mutation', 'absolute_coords', 'del_len', 'del_seq'])
#      .agg(num_samples=('idx', 'nunique'))
#      .reset_index()
#      .nlargest(50, 'num_samples'))

In [13]:
# df = pd.read_csv(meta_fp, sep='\t', compression='gzip')

In [14]:
# Metadata columns of interest (used by the commented-out lookup in the next cell).
cols = [
    'strain',
    'pangolin_lineage',
    'date',
    'date_submitted',
    'country',
    'division',
    'location',
]

In [15]:
# df.loc[(df['pangolin_lineage']=='P.1') & (df['country'].str.contains('USA'))][cols]

# Analysis Report Generation

In [16]:
# Gzipped substitutions CSV for the current snapshot.
# Derived from `date` (parsed from `msa_fp`) instead of a hard-coded literal, so
# it matches the path the commented-out export cell writes to and cannot drift
# when the snapshot is changed.
subs_fp = f'/valhalla/gisaid/subs_long_{date}.csv.gz'

In [17]:
# df = pd.read_csv(subs_fp, compression='gzip')

In [18]:
# Feature name and values handed to the `br` report helpers below.
# Alternatives tried earlier:
#   feature = 'mutations' with values = ['S:S13I', 'S:W152C', 'S:L452R']
#   values = ['S:P681R', 'S:S254F', 'S:G769V']
#   S:Q677H, M:A85S, N:D377Y
feature = 'mutation'
values = ['ORF1a:I4205V', 'ORF1b:D1183Y', 'S:S13I', 'S:W152C', 'S:L452R']

# Parameter bundle passed to br.generate_voc_data.
input_params = {
    # Reuse the snapshot date parsed from `msa_fp` rather than repeating the
    # literal, so the report date cannot disagree with the input files.
    'date': date,
    'strain': 'CAVUI1',
    'gisaid_data_fp': subs_fp,
    'gisaid_meta_fp': meta_fp,
    'msa_fp': Path(msa_fp),
    'sampling_type': 'random',
    # NOTE(review): 'B.1.1.70' sits next to 'B.1.1.7' -- confirm it is intended.
    'vocs': ['B.1.1.7', 'B.1.1.70'],
    'meta_fp': Path('/home/al/code/HCoV-19-Genomics/metadata.csv'),
    'tree_fp': Path('/home/al/analysis/alab_mutations_01-01-2021/alab/seqs_aligned.fa.treefile'),
    'subs_fp': '/home/al/analysis/alab_mutations_01-01-2021/alab_substitutions_long_01-01-2021.csv',
    'countries_fp': '/home/al/data/geojsons/countries.geo.json',
    'states_fp': "/home/al/data/geojsons/us-states.json",
    'counties_fp': '/home/al/data/geojsons/us-counties.json',
    'patient_zero': 'NC_045512.2',
    'b117_meta': '/home/al/analysis/b117/nextstrain_groups_neherlab_ncov_S.N501_metadata.tsv',
    'b117_tree': 'test_data/b117_seqs_aligned.fasta.treefile',
    'sample_sz': 300,
    'sampling_img_fp': "/home/al/analysis/b117/figs/sars-cov-2_EM_v3.png",
}

In [19]:
# Build the report data for the multi-mutation set in `values`, using the
# configuration in `input_params`. Progress is printed by the helper itself.
results = br.generate_voc_data(feature, values, input_params)

Loading variant data...
Collecting input parameters...
Fetching strain data...
Generating text-based results
Generating geo-based results
Generating time-based results...
Generating genomic results...
Results generated on ['ORF1a:I4205V', 'ORF1b:D1183Y', 'S:S13I', 'S:W152C', 'S:L452R']...


In [21]:
# Embed the results in the 'mut.html' template and write the report to disk.
html = br.generate_voc_html(feature, values, results, template_name='mut.html')
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
br.save_html(html, 'test_data/cavui1_current_report.html')

Results for ['ORF1a:I4205V', 'ORF1b:D1183Y', 'S:S13I', 'S:W152C', 'S:L452R'] embedded in HTML report
Results saved in test_data/cavui1_current_report.html


0

In [21]:
# data = pd.read_csv(input_params['gisaid_data_fp'], compression='gzip')

In [22]:
# One single-mutation report per entry: output file stem -> mutation.
runs = {'orf1b_d1183y': 'ORF1b:D1183Y',
        's_s13i': 'S:S13I', 's_w152c': 'S:W152C', 's_l452r': 'S:L452R',
        's_n501y': 'S:N501Y', 's_p681h': 'S:P681H', 's_e484k': 'S:E484K'}
for name, mut in runs.items():
    # Use a per-run copy with 'strain' overridden instead of mutating the
    # shared `input_params`; otherwise the last mutation stays in
    # input_params['strain'] and leaks into any cell run afterwards.
    run_params = {**input_params, 'strain': mut}
    results = br.generate_voc_data(feature, [mut], run_params)
    html = br.generate_voc_html(feature, [mut], results, template_name='mut.html')
    br.save_html(html, f'test_data/{name}_current_report.html')

Loading variant data...
Collecting input parameters...
Fetching strain data...
Generating text-based results
Generating geo-based results
Generating time-based results...
Generating genomic results...
Results generated on ['ORF1b:D1183Y']...
Results for ['ORF1b:D1183Y'] embedded in HTML report
Results saved in test_data/orf1b_d1183y_current_report.html
Loading variant data...
Collecting input parameters...
Fetching strain data...
Generating text-based results
Generating geo-based results
Generating time-based results...
Generating genomic results...
Results generated on ['S:S13I']...
Results for ['S:S13I'] embedded in HTML report
Results saved in test_data/s_s13i_current_report.html
Loading variant data...
Collecting input parameters...
Fetching strain data...
Generating text-based results
Generating geo-based results
Generating time-based results...
Generating genomic results...
Results generated on ['S:W152C']...
Results for ['S:W152C'] embedded in HTML report
Results saved in test_d

In [19]:
# Single-mutation report for S:P681R: output file stem -> mutation.
runs = {'s_p681r': 'S:P681R'}
for name, mut in runs.items():
    # Use a per-run copy with 'strain' overridden instead of mutating the
    # shared `input_params`, so the base configuration is unchanged after the loop.
    run_params = {**input_params, 'strain': mut}
    results = br.generate_voc_data(feature, [mut], run_params)
    html = br.generate_voc_html(feature, [mut], results, template_name='mut.html')
    br.save_html(html, f'test_data/{name}_current_report.html')

Loading variant data...
Collecting input parameters...
Fetching strain data...
Generating text-based results
Generating geo-based results
Generating time-based results...
Generating genomic results...
Results generated on ['S:P681R']...
Results for ['S:P681R'] embedded in HTML report
Results saved in test_data/s_p681r_current_report.html


In [21]:
# Switch back to the multi-mutation CAVUI1 configuration.
# Alternatives tried earlier: feature = 'mutations' with
# ['S:S13I', 'S:W152C', 'S:L452R']; S:Q677H, M:A85S, N:D377Y; 'ORF1ab:I4205V'.
feature = 'mutation'
values = [
    'ORF1a:I4205V',
    'ORF1b:D1183Y',
    'S:S13I',
    'S:W152C',
    'S:L452R',
]
input_params.update(strain='CAVUI1')

In [22]:
# Regenerate the report data for the multi-mutation set in `values`, using the
# current `input_params`. Progress is printed by the helper itself.
results = br.generate_voc_data(feature, values, input_params)

Loading variant data...
Collecting input parameters...
Fetching strain data...
Generating text-based results
Generating geo-based results
Generating time-based results...
Generating genomic results...
Results generated on ['ORF1a:I4205V', 'ORF1b:D1183Y', 'S:S13I', 'S:W152C', 'S:L452R']...


In [23]:
# Embed the results in the 'mut.html' template and write the report to disk.
html = br.generate_voc_html(feature, values, results, template_name='mut.html')
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
br.save_html(html, 'test_data/cavui1_current_report.html')

Results for ['ORF1a:I4205V', 'ORF1b:D1183Y', 'S:S13I', 'S:W152C', 'S:L452R'] embedded in HTML report
Results saved in test_data/cavui1_current_report.html


0

In [17]:
# Load the gzipped GISAID substitutions table referenced by the report parameters.
gisaid = pd.read_csv(
    input_params['gisaid_data_fp'],
    compression='gzip',
)

In [24]:
# Mutation set checked against each strain in the cells below.
mutations = [
    'S:Q677H',
    'M:A85S',
    'N:D377Y',
]

In [30]:
# One row per (country, division, strain), carrying the array of distinct
# mutations recorded for that group.
m = (
    gisaid
    .groupby(['country', 'division', 'strain'])
    .agg(mutations=('mutation', 'unique'))
    .reset_index()
)


In [31]:
# Apply bv.is_vui to each strain's mutation array, passing the target mutation
# set as the extra positional argument.
# NOTE(review): presumably flags strains carrying all of `mutations` -- confirm
# against bv.is_vui.
m['is_vui'] = m['mutations'].apply(bv.is_vui, args=(set(mutations),))

In [32]:
# Count flagged rows per division.
m.loc[m['is_vui'].eq(True), 'division'].value_counts()

Ontario          25
New York         10
Maryland          2
Wisconsin         2
West Virginia     1
Massachusetts     1
Michigan          1
Texas             1
Name: division, dtype: int64

In [16]:
# res = (gisaid.groupby(['date', 'country', 'division', 
#                                         'purpose_of_sequencing',
#                                         'location', 'pangolin_lineage', 'strain'])
#                        .agg(mutations=('mutation', 'unique')).reset_index())
# res['is_vui'] = res['mutations'].apply(bv.is_vui, args=(set(values),))

In [12]:

def get_mutations(data: pd.DataFrame, lineage: str='B.1.1.7'):
    """Return the distinct `mutation` values recorded for one lineage.

    Parameters
    ----------
    data : pd.DataFrame
        Table with at least the columns `pangolin_lineage` and `mutation`.
    lineage : str
        Value of `pangolin_lineage` to select (default 'B.1.1.7').

    Returns
    -------
    set
        Unique `mutation` values among the rows whose `pangolin_lineage`
        equals `lineage`; empty if no row matches.
    """
    in_lineage = data['pangolin_lineage'] == lineage
    distinct = data.loc[in_lineage, 'mutation'].unique()
    return set(distinct.tolist())