In [1]:
import json
import os
import sys
import time

import numpy as np
import pandas as pd

import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br
import data as bd

In [2]:
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
# Snapshot date of the GISAID export. It selects the input files below and is
# reused later for the `date_modified` column and the output file names.
date = '2021-01-30'
# GeoJSON files used further down to map country / US-state names to ids.
countries_fp = '/Users/al/Documents/scripps/data/geojsons/countries.geo.json'
states_fp = '/Users/al/Documents/scripps/data/geojsons/us-states.json'
# Build the input file names from `date` so the tables that get loaded can
# never drift from the snapshot date stamped onto the output.
# NOTE(review): these are machine-specific absolute paths — consider making
# the root directory configurable.
subs = pd.read_csv(f'/Users/al/Documents/scripps/analysis/gisaid/subs_long_{date}.csv.gz',
                   compression='gzip')
dels = pd.read_csv(f'/Users/al/Documents/scripps/analysis/gisaid/dels_long_{date}.csv.gz',
                   compression='gzip')

In [4]:
# Column subset: mutation/sample identifiers, geography, collection date and
# deletion coordinates/length. Not referenced by any later cell shown here.
cols = [
    'mutation',
    'strain',
    'country',
    'division',
    'location',
    'date',
    'absolute_coords',
    'del_len',
]

In [5]:
# Add two helper columns to the deletions table:
#   pos       - integer parsed from the part of `absolute_coords` before the first ':'
#   ref_codon - a copy of the deleted sequence stored in `del_seq`
dels = dels.assign(
    pos=dels['absolute_coords'].map(lambda coords: int(coords.split(':')[0])),
    ref_codon=dels['del_seq'].copy(),
)

In [6]:
# Report the size of each table before combining them.
print(subs.shape)
print(dels.shape)
# Tag every substitution row with its mutation type.
# NOTE(review): `dels` presumably already carries its own `type` value — confirm.
subs['type'] = 'substitution'
# Stack both tables into one frame. `ignore_index=True` gives the result a
# fresh 0..n-1 index; without it the row labels of `subs` and `dels` are both
# kept, so labels repeat and later label-based lookups/alignment are ambiguous.
muts = pd.concat([subs, dels], ignore_index=True)
print(muts.shape)

(6847704, 40)
(162247, 44)
(7009951, 49)


In [8]:
# Keep only rows whose collection date has at least two '-'-separated parts
# (e.g. year and month). Missing dates fail the comparison and are dropped too.
# The helper column `tmp` stays on the frame afterwards.
muts['tmp'] = muts['date'].str.split('-')
# `.copy()` makes the filtered frame independent of the original, so the
# column assignment below is unambiguous and raises no SettingWithCopyWarning.
muts = muts[muts['tmp'].str.len()>=2].copy()
# Parse the surviving strings; values that cannot be parsed become NaT
# instead of raising.
muts['date'] = pd.to_datetime(muts['date'], errors='coerce')

In [9]:
# Build name -> id lookups from the two GeoJSON files and attach the matching
# id to every row; names missing from a lookup get the placeholder 'NA'.
with open(countries_fp) as f:
    countries = json.load(f)
country_map = {
    feature['properties']['name']: feature['id']
    for feature in countries['features']
}
muts['country_id'] = muts['country'].map(lambda name: country_map.get(name, 'NA'))

with open(states_fp) as f:
    states = json.load(f)
state_map = {
    feature['properties']['name']: feature['id']
    for feature in states['features']
}
muts['division_id'] = muts['division'].map(lambda name: state_map.get(name, 'NA'))

In [10]:
# Normalise column names: lower-case clade columns, and more explicit names
# for the collection date and the deletion length.
muts = muts.rename(
    columns={
        'date': 'date_collected',
        'GISAID_clade': 'gisaid_clade',
        'Nextstrain_clade': 'nextstrain_clade',
        'del_len': 'change_length_nt',
    }
)

In [11]:
# Display the column index of the combined table (after the renames above).
muts.columns

Index(['idx', 'replacements', 'pos', 'gene', 'gene_start_pos', 'codon_num',
       'codon_start', 'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'mutation',
       'strain', 'virus', 'gisaid_epi_isl', 'genbank_accession',
       'date_collected', 'region', 'country', 'division', 'location',
       'region_exposure', 'country_exposure', 'division_exposure', 'segment',
       'length', 'host', 'age', 'sex', 'nextstrain_clade', 'pangolin_lineage',
       'gisaid_clade', 'originating_lab', 'submitting_lab', 'authors', 'url',
       'title', 'paper_url', 'date_submitted', 'purpose_of_sequencing', 'type',
       'seq_len', 'del_positions', 'change_length_nt', 'relative_coords',
       'absolute_coords', 'del_seq', 'prev_5nts', 'next_5nts', 'tmp',
       'country_id', 'division_id'],
      dtype='object')

In [13]:
# Row-wise call of `bs.compute_acc_nt_pos` on each (gene, pos) pair together
# with the `bd.GENE2NTCOORDS` table.
# NOTE(review): presumably converts a per-gene nucleotide position into a
# genome-wide coordinate — confirm in bjorn_support. The column is dropped
# again before the JSON export further down.
muts['nt_map_coords'] = muts[['gene', 'pos']].apply(
    lambda row: bs.compute_acc_nt_pos(row, bd.GENE2NTCOORDS),
    axis=1,
)

In [14]:
# Row-wise call of `bs.compute_acc_aa_pos` on each (gene, codon_num) pair
# together with the `bd.GENE2AACOORDS` table.
# NOTE(review): presumably converts a per-gene codon number into a
# genome-wide amino-acid coordinate — confirm in bjorn_support. The column is
# dropped again before the JSON export further down.
muts['aa_map_coords'] = muts[['gene', 'codon_num']].apply(
    lambda row: bs.compute_acc_aa_pos(row, bd.GENE2AACOORDS),
    axis=1,
)

In [15]:
# Stamp every row with the snapshot date string held in `date`.
muts.loc[:, 'date_modified'] = date

In [16]:
# A mutation is flagged synonymous when the reference and alternate amino
# acids are equal. Rows where either value is missing compare unequal and are
# therefore flagged False.
muts['is_synonymous'] = (muts['ref_aa'] == muts['alt_aa'])

In [17]:
# Sample-level fields: used as the grouping key of the JSON export, so each
# distinct combination becomes one output record.
meta_info = [
    'strain',
    'date_modified',
    'date_collected',
    'date_submitted',
    'country_id',
    'country',
    'division_id',
    'division',
    'location',
    'submitting_lab',
    'originating_lab',
    'authors',
    'pangolin_lineage',
    'gisaid_clade',
    'nextstrain_clade',
    'gisaid_epi_isl',
    'genbank_accession',
    'purpose_of_sequencing',
]

# Mutation-level fields: emitted once per mutation inside each record.
muts_info = [
    'type',
    'mutation',
    'gene',
    'ref_codon',
    'pos',
    'alt_codon',
    'is_synonymous',
    'ref_aa',
    'codon_num',
    'alt_aa',
    'absolute_coords',
    'change_length_nt',
]

In [18]:
# Replace the source-specific "unknown" markers with the common 'NA' string.
for column, placeholder in (
    ('location', 'unk'),
    ('purpose_of_sequencing', '?'),
    ('genbank_accession', '?'),
):
    muts.loc[muts[column] == placeholder, column] = 'NA'

In [19]:
# Replace every remaining missing value, in all columns, with the string 'NA'.
muts = muts.fillna('NA')

In [20]:
# Pick 10 distinct strains and keep all of their rows as a small frame for
# dry-running the export. The fixed `random_state` makes the subset (and so
# the TEST output file) reproducible across kernel restarts.
sample_ids = (muts['strain']
              .drop_duplicates()
              .sample(10, random_state=42)
              .to_numpy())
test = muts[muts['strain'].isin(sample_ids)]

In [21]:
# Row/column count of the table going into the export.
print(muts.shape)
# De-duplication on (strain, mutation) is currently disabled, so the shape
# printed below is identical to the one printed above.
# muts.drop_duplicates(subset=['strain', 'mutation'], inplace=True)
print(muts.shape)

(6937790, 56)
(6937790, 56)


In [22]:
# Remove the two mapped-coordinate helper columns; they are not part of the
# exported data model.
muts = muts.drop(columns=['nt_map_coords', 'aa_map_coords'])

In [23]:
# GENERATE JSON DATA MODEL
start = time.time()
(muts.groupby(meta_info, as_index=True)
             .apply(lambda x: x[muts_info].to_dict('records'))
             .reset_index()
             .rename(columns={0:'mutations'})
             .to_json(f'test_data/data_model_{date}.json.gz', 
                      orient='records',
                      compression='gzip'))
end = time.time()
print(f'Execution time: {end - start} seconds')

Execution time: 609.5467598438263 seconds


In [20]:
# TEST 
start = time.time()
(test.groupby(meta_info, as_index=True)
             .apply(lambda x: x[muts_info].to_dict('records'))
             .reset_index()
             .rename(columns={0:'mutations'})
             .to_json(f'test_data/TEST_data_model_{date}.json.gz', 
                      orient='records',
#                       compression='gzip'
                     )
)
end = time.time()
print(f'Execution time: {end - start} seconds')

Execution time: 0.05672788619995117 seconds


In [None]:
# (dels.groupby(['mutation', 'absolute_coords', 'del_len', 'del_seq'])
#      .agg(num_samples=('idx', 'nunique'))
#      .reset_index()
#      .nlargest(50, 'num_samples'))