In [2]:
import sys
import time
import json
import numpy as np
import pandas as pd

import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br
import data as bd

In [3]:
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
# Notebook configuration and raw inputs.
# NOTE(review): `date` and the two geojson paths are not used by any cell
# visible here, and `date` differs from the 2021-01-25 stamp in the data file
# names — confirm that is intentional.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory.
date = '2021-01-26'
countries_fp, states_fp = (
    '/home/al/data/geojsons/countries.geo.json',
    '/home/al/data/geojsons/us-states.json',
)
# Read the gzipped substitution and deletion tables, in that order.
subs, dels = (
    pd.read_csv(table_fp, compression='gzip')
    for table_fp in (
        '/home/al/analysis/gisaid/subs_long_2021-01-25.csv.gz',
        '/home/al/analysis/gisaid/dels_long_2021-01-25.csv.gz',
    )
)

In [5]:
# Report the size of each input table before `subs` is modified below.
print(subs.shape, dels.shape, sep='\n')

# Label every substitution row, then stack both tables into one frame.
# pd.concat keeps each input's own row labels by default, so index values
# may repeat in `muts`; positional access (.iloc) is unaffected.
subs['type'] = 'substitution'
muts = pd.concat((subs, dels))
print(muts.shape)

(6328749, 38)
(117950, 44)
(6446699, 47)


In [8]:
# Build `muts_2`, the frame that later cells pass to `get_sample_mutations`.
# The helper receives a copy of `muts`, not the original frame object.
# NOTE(review): `prime_mutation_logic` is neither defined nor imported in the
# visible cells (execution counts jump from In [5] to In [8]) — confirm where
# it comes from so the notebook survives Restart & Run All.
muts_2 = prime_mutation_logic(muts.copy())

In [9]:
# Pick two distinct samples (strain names) at random.
#
# The draw is made from the unique strain names instead of raw row positions:
#   * `muts` appears to hold many rows per strain (one per mutation — TODO
#     confirm), so sampling rows would favour heavily mutated samples and
#     could return the same strain twice, making the comparisons below trivial;
#   * replace=False guarantees s_x != s_y.
# The generator is seeded so that Restart & Run All reproduces the same pair;
# change the seed to compare a different pair.
rng = np.random.default_rng(42)
s_x, s_y = rng.choice(muts['strain'].dropna().unique(), size=2, replace=False)

In [11]:
# debugging & profiling: time one lookup on the pre-processed frame `muts_2`.
# time.perf_counter() is the clock intended for measuring short durations
# (monotonic, highest available resolution); time.time() is a wall clock that
# can jump if the system time is adjusted while the call is running.
start = time.perf_counter()
get_sample_mutations(muts_2, sample_id=s_x)  # result discarded; only the duration matters here
end = time.perf_counter()
print(f"Execution time: {end-start} s")

Execution time: 0.09739899635314941 s


In [12]:
# debugging & profiling: time the same lookup with the older helper, which is
# given the raw `muts` frame rather than `muts_2`.
# time.perf_counter() is the clock intended for measuring short durations
# (monotonic, highest available resolution); time.time() is a wall clock that
# can jump if the system time is adjusted while the call is running.
start = time.perf_counter()
get_sample_mutations_old(muts, sample_id=s_x)  # result discarded; only the duration matters here
end = time.perf_counter()
print(f"Execution time: {end-start} s")

Execution time: 6.325293302536011 s


In [17]:
# Look up the mutations of both samples (s_x first, then s_y), then print
# the ones they have in common.
m_x, m_y = (
    get_sample_mutations(muts_2, sample_id=strain)
    for strain in (s_x, s_y)
)
print(f"Common mutations between samples {s_x} and {s_y}:\n", m_x & m_y, sep="\n")

Common mutations between samples Chile/MA-194114-B/2020 and USA/MI-UM-10036454184/2020:

{'ORF1a:F924F', 'S:D614G', '5UTR:R81C', 'ORF1b:P314L'}


In [18]:
# Union: every mutation carried by at least one of the two samples.
print(f"Mutations found in either samples {s_x} and {s_y}:\n", m_x | m_y, sep="\n")

Mutations found in either samples Chile/MA-194114-B/2020 and USA/MI-UM-10036454184/2020:

{'ORF1b:A302S', 'N:P67S', 'ORF1b:P314L', 'ORF1a:F2602F', 'ORF1b:R2613C', '5UTR:R81C', 'S:A924A', 'N:D377Y', 'ORF1b:Y23Y', 'ORF3a:Q57H', 'ORF1a:L3352F', 'S:T307I', 'S:T723T', 'N:S2F', 'ORF1b:N1653D', 'N:P199L', 'ORF1a:N786N', 'S:D614G', 'ORF1b:Y446Y', 'ORF1a:E2940E', 'N:R203K', 'ORF1a:T1250I', 'ORF1a:T265I', 'ORF1b:D412D', 'ORF1a:D3897D', 'ORF3a:G172V', 'ORF1a:T1246I', 'S:E1182Q', 'ORF1a:F924F', 'ORF8:S24L', 'ORF1a:M2606I', 'ORF1a:P4312P', 'ORF7b:L17L', 'N:G204R', 'ORF1a:G3278S', 'ORF1b:L1531L'}


In [19]:
# Difference: mutations carried by s_x that s_y does not have.
print(f"Mutations found in sample {s_x} but NOT in {s_y}:\n", m_x - m_y, sep="\n")

Mutations found in sample Chile/MA-194114-B/2020 but NOT in USA/MI-UM-10036454184/2020:

{'S:T723T', 'ORF1b:A302S', 'N:S2F', 'ORF1a:F2602F', 'ORF1a:N786N', 'ORF1a:T1246I', 'S:E1182Q', 'S:A924A', 'ORF1a:P4312P', 'N:R203K', 'ORF7b:L17L', 'ORF1a:T1250I', 'N:G204R', 'S:T307I', 'ORF1a:G3278S', 'ORF1b:Y23Y', 'ORF1b:D412D', 'ORF1b:L1531L'}


In [20]:
# Difference: mutations carried by s_y that s_x does not have.
print(f"Mutations found in sample {s_y} but NOT in {s_x}:\n", m_y - m_x, sep="\n")

Mutations found in sample USA/MI-UM-10036454184/2020 but NOT in Chile/MA-194114-B/2020:

{'ORF1a:D3897D', 'ORF3a:G172V', 'ORF1b:N1653D', 'N:P67S', 'N:P199L', 'ORF3a:Q57H', 'ORF1b:R2613C', 'ORF1b:Y446Y', 'ORF1a:L3352F', 'ORF1a:M2606I', 'ORF1a:E2940E', 'ORF8:S24L', 'N:D377Y', 'ORF1a:T265I'}
