In [1]:
import sys
import time
import json
import numpy as np
import pandas as pd

import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br
import data as bd

In [2]:
# Snapshot date: selects the input CSV files, serves as the exclusive upper
# bound when filtering collection dates, and is written to `date_modified`.
date = '2021-02-17'
# Placeholder stored wherever a value is missing or a lookup finds no match.
unknown_val = 'None'

In [3]:
# JSON lookup files used later to translate normalised place names into ids.
countries_fp = '/home/al/data/geojsons/gadm_countries.json'
states_fp = '/home/al/data/geojsons/gadm_divisions.json'
locations_fp = '/home/al/data/geojsons/gadm_locations.json'

# Gzipped long-format tables for the snapshot identified by `date`:
# one for substitutions, one for deletions.
subs_fp = f'/valhalla/gisaid/subs_long_{date}.csv.gz'
dels_fp = f'/valhalla/gisaid/dels_long_{date}.csv.gz'
subs = pd.read_csv(subs_fp, compression='gzip')
dels = pd.read_csv(dels_fp, compression='gzip')

In [4]:
# cols = ['mutation', 'strain', 'country', 'division', 'location', 'date_collected', 'absolute_coords', 'del_len']

In [5]:
# Start position of each deletion: the integer that precedes the first ':'
# in the `absolute_coords` string.
dels['pos'] = dels['absolute_coords'].apply(
    lambda coords: int(coords.partition(':')[0])
)
# For deletion rows the `ref_codon` field carries a copy of the deleted sequence.
dels['ref_codon'] = dels['del_seq'].copy()

In [6]:
# Per-gene reference coordinate: the gene's 'start' entry in bd.GENE2POS plus 2.
# The next cell subtracts it from `pos` to obtain the offset within a codon.
# NOTE(review): the reason for the +2 offset is not visible here — confirm it
# against the coordinate convention of GENE2POS / absolute_coords.
dels['gene_start_pos'] = dels['gene'].apply(lambda x: bd.GENE2POS[x]['start']+2)

In [7]:
# Offset (0, 1 or 2) of the deletion start relative to the gene's codon grid.
offset_from_gene_start = dels['pos'] - dels['gene_start_pos']
dels['pos_in_codon'] = offset_from_gene_start % 3

In [8]:
# Relative positions in codon counts [internal]
# Tally how many deletions start at each within-codon offset; the column is
# computed modulo 3, so only the values 0, 1 and 2 can occur.
dels['pos_in_codon'].value_counts()

2    25
0    23
1    20
Name: pos_in_codon, dtype: int64

In [9]:
def assign_deletion_codon_coords(x):
    """Build a fractional codon-coordinate label for one deletion record.

    Parameters
    ----------
    x : mapping or pandas row
        Must provide 'gene' (str), 'codon_num', 'pos_in_codon' and 'del_len'.

    Returns
    -------
    str
        '<gene>:DEL<start>' when ``pos_in_codon + del_len <= 3``, otherwise
        '<gene>:DEL<start>/<end>'. ``start`` is ``codon_num + pos_in_codon/3``
        and ``end`` is that value advanced by ``del_len/3``; both are rendered
        with ``str`` so they appear as floats (e.g. '69.0').
    """
    codon_offset = x['pos_in_codon'] / 3
    label = x['gene'] + ':DEL' + str(x['codon_num'] + codon_offset)
    if (x['pos_in_codon'] + x['del_len']) > 3:
        # Same term order as the historical expression so the float text
        # produced by str() is unchanged.
        end_coord = x['codon_num'] + (1 + codon_offset) + (x['del_len'] / 3) - 1
        label += '/' + str(end_coord)
    return label

In [10]:
def assign_deletion(x):
    """Build the codon-level deletion label for one deletion record.

    Parameters
    ----------
    x : mapping or pandas row
        Must provide 'gene' (str), 'codon_num', 'pos_in_codon' and 'del_len'.

    Returns
    -------
    str
        '<gene>:DEL<codon_num>' when the deletion fits inside the starting
        codon (``pos_in_codon + del_len <= 3``); otherwise
        '<gene>:DEL<codon_num>/<end>' with
        ``end = codon_num + del_len // 3 - 1``, never smaller than
        ``codon_num``.
    """
    start_codon = x['codon_num']
    label = x['gene'] + ':DEL' + str(start_codon)
    if (x['pos_in_codon'] + x['del_len']) <= 3:
        return label
    # A deletion shorter than one codon that still crosses a codon boundary
    # has del_len // 3 == 0, which used to yield an end codon *before* the
    # start (e.g. 'DEL85/84'). Count at least one codon so end >= start.
    codons_removed = max(x['del_len'] // 3, 1)
    return label + '/' + str(start_codon + codons_removed - 1)

In [11]:
# Label every deletion row with its codon-level mutation name.
_mutation_inputs = ['pos_in_codon', 'gene', 'codon_num', 'del_len']
dels['mutation'] = dels[_mutation_inputs].apply(assign_deletion, axis=1)
# dels['mutation'].value_counts()

In [12]:
# Attach the fractional codon-coordinate label to every deletion row.
_coord_inputs = ['pos_in_codon', 'gene', 'codon_num', 'del_len']
dels['deletion_codon_coords'] = dels[_coord_inputs].apply(
    assign_deletion_codon_coords, axis=1
)

In [13]:
# dels['mutation'] = dels['mutation'].apply(lambda x: x.split('.')[0])

In [14]:
def is_frameshift(x):
    """Return True when length ``x`` is not a multiple of three, else False."""
    return not (x % 3 == 0)
# Flag every deletion whose length (`del_len`) is not a multiple of three.
dels['is_frameshift'] = dels['del_len'].apply(is_frameshift)

In [15]:
# Report the input sizes, tag substitutions, then stack both tables.
for frame in (subs, dels):
    print(frame.shape)
subs['type'] = 'substitution'
# concat keeps each input's own row labels, so labels repeat in `muts`.
muts = pd.concat([subs, dels])
print(muts.shape)

(1654, 25)
(68, 33)
(1722, 37)


In [16]:
# muts.loc[muts['pangolin_lineage'].isin(['B.1.427'])].sort_values('date', ascending=True)[['idx', 'date', 'date_submitted', 'division', 'originating_lab']]

In [17]:
# List the columns of the combined substitutions + deletions frame.
muts.columns

Index(['idx', 'replacements', 'pos', 'gene', 'gene_start_pos', 'codon_num',
       'codon_start', 'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'mutation',
       'strain', 'location', 'date_collected', 'date_submitted', 'clade',
       'pango_lineage', 'pango_version', 'accession_id', 'country', 'division',
       'country_normed', 'division_normed', 'location_normed', 'type',
       'seq_len', 'del_positions', 'del_len', 'relative_coords',
       'absolute_coords', 'del_seq', 'prev_5nts', 'next_5nts', 'pos_in_codon',
       'deletion_codon_coords', 'is_frameshift'],
      dtype='object')

In [18]:
# Normalise collection dates and drop rows that cannot be dated.
# `tmp` holds the '-'-separated parts of each date string.
muts['tmp'] = muts['date_collected'].str.split('-')
# Require at least year and month. The explicit .copy() makes `muts` an
# independent frame, so the assignments below (and in later cells) do not
# write into a slice of the previous frame and raise SettingWithCopyWarning.
muts = muts[muts['tmp'].str.len() >= 2].copy()
# Year-month only dates get an assumed mid-month day.
muts.loc[muts['tmp'].str.len() == 2, 'date_collected'] += '-15'
# Unparseable strings become NaT and are removed by the comparison below.
muts['date_collected'] = pd.to_datetime(muts['date_collected'], errors='coerce')
# Keep only samples collected strictly before the snapshot date.
muts = muts[muts['date_collected'] < date].copy()

In [19]:
# muts.loc[muts['tmp'].str.len()==2, 'date']

In [20]:
# Convert the parsed timestamps back to plain strings for the later steps
# (the recorded outputs show them rendered as 'YYYY-MM-DD').
muts['date_collected'] = muts['date_collected'].astype(str)

In [21]:
# muts['date'].max()

In [22]:
# Latest collection date left after the cutoff filter (the column holds
# strings at this point, so this is a string maximum).
muts['date_collected'].max()

'2021-02-06'

In [23]:
# Count distinct strains per collection date, then show any date that appears
# more than once in the aggregated table (none are expected after a groupby).
samples_per_day = muts.groupby('date_collected').agg(num_samples=('strain', 'nunique'))
test = samples_per_day.reset_index()
test[test.duplicated(subset=['date_collected'])]

Unnamed: 0,date_collected,num_samples


In [24]:
# Inspect the collection-date column after conversion back to strings.
muts['date_collected']

0     2020-11-01
1     2020-11-01
2     2020-11-01
3     2020-11-01
4     2020-11-01
         ...    
63    2021-01-14
64    2021-01-14
65    2021-01-17
66    2021-01-17
67    2021-01-17
Name: date_collected, Length: 1669, dtype: object

In [25]:
# Load the three name -> id mappings from their JSON files.
with open(countries_fp) as countries_file:
    countries = json.load(countries_file)
with open(states_fp) as states_file:
    states = json.load(states_file)
with open(locations_fp) as locations_file:
    locations = json.load(locations_file)

# Translate each normalised place name into its id; names missing from the
# mapping receive `unknown_val`.
for normed_col, id_col, lookup in (
    ('country_normed', 'country_id', countries),
    ('division_normed', 'division_id', states),
    ('location_normed', 'location_id', locations),
):
    muts[id_col] = muts[normed_col].apply(
        lambda name, table=lookup: table.get(name, unknown_val)
    )

In [26]:
# Expose the deletion length under its output name. Other renames
# ('date', 'GISAID_clade', 'Nextstrain_clade') are currently disabled.
muts = muts.rename(columns={'del_len': 'change_length_nt'})

In [27]:
# Check the column set after the rename and the addition of the *_id columns.
muts.columns

Index(['idx', 'replacements', 'pos', 'gene', 'gene_start_pos', 'codon_num',
       'codon_start', 'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'mutation',
       'strain', 'location', 'date_collected', 'date_submitted', 'clade',
       'pango_lineage', 'pango_version', 'accession_id', 'country', 'division',
       'country_normed', 'division_normed', 'location_normed', 'type',
       'seq_len', 'del_positions', 'change_length_nt', 'relative_coords',
       'absolute_coords', 'del_seq', 'prev_5nts', 'next_5nts', 'pos_in_codon',
       'deletion_codon_coords', 'is_frameshift', 'tmp', 'country_id',
       'division_id', 'location_id'],
      dtype='object')

In [28]:
# muts['nt_map_coords'] = muts[['gene', 'pos']].apply(bs.compute_acc_nt_pos, 
#                             args=(bd.GENE2NTCOORDS,), 
#                             axis=1)

In [29]:
# muts['aa_map_coords'] = muts[['gene', 'codon_num']].apply(bs.compute_acc_aa_pos, 
#                             args=(bd.GENE2AACOORDS,), 
#                             axis=1)

In [30]:
# Stamp every row with the snapshot date.
muts['date_modified'] = date

In [31]:
# A mutation is synonymous when reference and alternate amino acids are equal;
# rows where either side is missing compare unequal and stay False.
muts['is_synonymous'] = (muts['ref_aa'] == muts['alt_aa'])

In [32]:
# Replace the 'unk' marker in `location` with the standard placeholder.
location_is_unk = muts['location'] == 'unk'
muts.loc[location_is_unk, 'location'] = unknown_val
# muts.loc[muts['purpose_of_sequencing']=='?', 'purpose_of_sequencing'] = unknown_val
# muts.loc[muts['genbank_accession']=='?', 'genbank_accession'] = unknown_val

In [33]:
# Replace every remaining missing value, in every column, with the placeholder.
muts = muts.fillna(unknown_val)

In [34]:
# A division that merely repeats the country name carries no information;
# replace it with the placeholder.
division_repeats_country = muts['division'] == muts['country']
muts.loc[division_repeats_country, 'division'] = unknown_val

In [35]:
# Draw a small subset of strains (and all their mutation rows) for quick
# experiments. The fixed random_state makes the selection repeatable across
# kernel restarts, and the sample size is capped so that a frame with fewer
# than 10 distinct strains does not raise.
unique_strains = muts['strain'].drop_duplicates()
sample_ids = unique_strains.sample(
    n=min(10, len(unique_strains)), random_state=42
).to_numpy()
test = muts[muts['strain'].isin(sample_ids)]

In [36]:
# Row/column count before and after the de-duplication step. The drop is
# commented out here, so both prints report the same shape.
print(muts.shape)
# muts.drop_duplicates(subset=['strain', 'mutation'], inplace=True)
print(muts.shape)

(1669, 43)
(1669, 43)


In [37]:
# Re-inspect the collection dates after the fill and placeholder steps.
muts['date_collected']

0     2020-11-01
1     2020-11-01
2     2020-11-01
3     2020-11-01
4     2020-11-01
         ...    
63    2021-01-14
64    2021-01-14
65    2021-01-17
66    2021-01-17
67    2021-01-17
Name: date_collected, Length: 1669, dtype: object

In [38]:
# muts.loc[muts['country']==muts['division']]

In [39]:
# Column set before export; now includes `date_modified` and `is_synonymous`.
muts.columns

Index(['idx', 'replacements', 'pos', 'gene', 'gene_start_pos', 'codon_num',
       'codon_start', 'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'mutation',
       'strain', 'location', 'date_collected', 'date_submitted', 'clade',
       'pango_lineage', 'pango_version', 'accession_id', 'country', 'division',
       'country_normed', 'division_normed', 'location_normed', 'type',
       'seq_len', 'del_positions', 'change_length_nt', 'relative_coords',
       'absolute_coords', 'del_seq', 'prev_5nts', 'next_5nts', 'pos_in_codon',
       'deletion_codon_coords', 'is_frameshift', 'tmp', 'country_id',
       'division_id', 'location_id', 'date_modified', 'is_synonymous'],
      dtype='object')

In [40]:
# Sample-level fields: rows sharing all of these values form one exported record.
# Currently left out: submitting_lab, originating_lab, authors,
# nextstrain_clade, gisaid_epi_isl, genbank_accession, purpose_of_sequencing.
meta_info = (
    ['strain', 'accession_id']
    + ['date_modified', 'date_collected', 'date_submitted']
    + ['country_id', 'country', 'country_normed']
    + ['division_id', 'division', 'division_normed']
    + ['location_id', 'location', 'location_normed']
    + ['pango_lineage', 'pango_version']
    + ['clade']
)

# Mutation-level fields: emitted once per mutation inside each record.
muts_info = [
    'type',
    'mutation',
    'gene',
    'ref_codon',
    'pos',
    'alt_codon',
    'is_synonymous',
    'ref_aa',
    'codon_num',
    'alt_aa',
    'absolute_coords',
    'change_length_nt',
    'is_frameshift',
    'deletion_codon_coords',
]

In [41]:
# # TEST 
# start = time.time()
# (test.groupby(meta_info, as_index=True)
#              .apply(lambda x: x[muts_info].to_dict('records'))
#              .reset_index()
#              .rename(columns={0:'mutations'})
#              .to_json(f'test_data/TEST_api_data_model_{date}.json', 
#                       orient='records',
# #                       compression='gzip'
#                      )
# )
# end = time.time()
# print(f'Execution time: {end - start} seconds')

In [42]:
# test||

In [43]:
# muts.drop(columns=['nt_map_coords', 'aa_map_coords'], inplace=True)

In [44]:
# Latest submission date present in the data.
muts['date_submitted'].max()

'2021-02-16'

In [45]:
# Inspect the submission-date column.
muts['date_submitted']

0     2020-11-25
1     2020-11-25
2     2020-11-25
3     2020-11-25
4     2020-11-25
         ...    
63    2021-02-08
64    2021-02-08
65    2021-02-01
66    2021-02-01
67    2021-02-01
Name: date_submitted, Length: 1669, dtype: object

In [46]:
# Inspect the collection-date column alongside the submission dates shown above.
muts['date_collected']

0     2020-11-01
1     2020-11-01
2     2020-11-01
3     2020-11-01
4     2020-11-01
         ...    
63    2021-01-14
64    2021-01-14
65    2021-01-17
66    2021-01-17
67    2021-01-17
Name: date_collected, Length: 1669, dtype: object

In [47]:
# muts['date_modified']

In [48]:
# Frequency of each deletion label among rows of type 'deletion'.
muts.loc[muts['type'] == 'deletion', 'mutation'].value_counts()

S:DEL144/144          19
ORF1a:DEL3675/3677    19
S:DEL69/70            19
ORF1a:DEL85/85         1
S:DEL241/243           1
ORF10:DEL16/39         1
ORF1a:DEL1/25          1
ORF10:DEL19/39         1
S:DEL706/2704          1
ORF1a:DEL1/7508        1
ORF7a:DEL119           1
Name: mutation, dtype: int64

In [49]:
# Debug probe on deletion rows.
# NOTE(review): `mutation` holds string labels (e.g. 'S:DEL69/70'), so the
# comparison with the integer 1 matches nothing and this always displays an
# empty frame — confirm which value was meant to be checked.
muts.loc[(muts['type']=='deletion')&(muts['mutation']==1)][['pos', 'change_length_nt', 'mutation']]

Unnamed: 0,pos,change_length_nt,mutation


In [50]:
# GENERATE JSON DATA MODEL
# Rows are grouped on the `meta_info` fields; each group becomes one record
# whose 'mutations' entry lists that group's `muts_info` fields, one dict per
# mutation row. The result is written as gzip-compressed JSON records.
start = time.time()
grouped_mutations = muts.groupby(meta_info, as_index=True).apply(
    lambda sample_rows: sample_rows[muts_info].to_dict('records')
)
api_records = grouped_mutations.reset_index().rename(columns={0: 'mutations'})
api_records.to_json('test_data/TEST_api_data.json.gz',
                    orient='records',
                    compression='gzip')
end = time.time()
print(f'Execution time: {end - start} seconds')

Execution time: 0.40732288360595703 seconds


In [42]:
# Keep a single row per (strain, mutation) pair.
# NOTE(review): this cell is placed after the JSON export cell, so on a
# top-to-bottom run it does not affect the exported file; its execution
# count (42) is out of sequence — confirm where it is meant to run.
muts = muts.drop_duplicates(subset=['strain', 'mutation'])

In [65]:
# Discard rows whose gene is '5UTR'.
# NOTE(review): this cell is placed after the JSON export cell, so on a
# top-to-bottom run it does not affect the exported file; its execution
# count (65) is out of sequence — confirm where it is meant to run.
outside_5utr = muts['gene'] != '5UTR'
muts = muts[outside_5utr]