In [15]:
import sys
import time
import json
import numpy as np
import pandas as pd
from Bio import Seq, SeqIO, AlignIO, Phylo, Align


import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br
import data as bd

In [20]:
# Scratch check of Python truthiness: bool() does not parse the text, and any
# non-empty string is truthy, so this displays True even though the string
# reads "False".
t = "False"
bool(t)

True

In [2]:
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
# File locations used by the concatenate / align cells below.
# NOTE(review): these are absolute, machine-specific paths; a configurable base
# directory would make the notebook portable.
ref_path = '/home/al/data/hcov19/NC045512.fasta'  # reference FASTA, appended to the input and passed to run_minimap2
in_fasta_path = '/valhalla/gisaid/sequences_2021-01-28_09-27.fasta'  # input sequences handed to concat_fasta_2
out_fasta_path = '/valhalla/gisaid/sequences_2021-01-28.fasta'  # output of concat_fasta_2, input to run_minimap2
out_sam_path = '/valhalla/gisaid/sequences_2021-01-28.sam'  # output path handed to run_minimap2
out_algn_path = '/valhalla/gisaid/sequences_2021-01-28_aligned.fasta'  # not referenced in the visible cells

In [14]:
# TEST: bs.concat_fasta_2() -- precondition
## Before concatenating, no record in the input FASTA may carry the reference id
seqs = bs.load_fasta(in_fasta_path)
test = list(filter(lambda record: record.id == 'NC_045512.2', seqs))
assert not test, f'ERROR: The reference at {ref_path} was already found in {in_fasta_path}. Cannot run test'

In [4]:
# Combine the input FASTA and the reference FASTA into out_fasta_path.
# The value displayed below the cell is the call's return value, which matches
# out_fasta_path. (Exact merge semantics live in bjorn_support -- not visible here.)
bs.concat_fasta_2([in_fasta_path, ref_path], out_fasta_path)

'/valhalla/gisaid/sequences_2021-01-28.fasta'

In [7]:
## Test after concat: the concatenated FASTA must hold exactly one record with the reference id
seqs = bs.load_fasta(out_fasta_path)
test = list(filter(lambda record: record.id == 'NC_045512.2', seqs))
assert len(test) == 1, f'ERROR: The reference at {ref_path} was NOT found after concatenation. Something is off'

In [4]:
# Align the concatenated FASTA against the reference with minimap2 (via the
# bjorn_support wrapper). The recorded output is minimap2's progress log followed
# by the returned value, which matches out_sam_path.
bs.run_minimap2(out_fasta_path, out_sam_path, ref_path)

[M::mm_idx_gen::1.686*1.00] collected minimizers
[M::mm_idx_gen::1.690*1.00] sorted minimizers
[M::main::1.690*1.00] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::1.690*1.00] mid_occ = 100
[M::mm_idx_stat] kmer size: 19; skip: 19; is_hpc: 0; #seq: 1
[M::mm_idx_stat::1.690*1.00] distinct minimizers: 3006 (100.00% are singletons); average occurrences: 1.000; average spacing: 9.948; total length: 29903
[M::worker_pipeline::13.383*19.81] mapped 16779 sequences
[M::worker_pipeline::23.243*22.08] mapped 16780 sequences
[M::worker_pipeline::33.434*23.03] mapped 16781 sequences
[M::worker_pipeline::43.282*23.51] mapped 16780 sequences
[M::worker_pipeline::52.956*23.81] mapped 16782 sequences
[M::worker_pipeline::62.900*24.03] mapped 16781 sequences
[M::worker_pipeline::72.709*24.18] mapped 16793 sequences
[M::worker_pipeline::82.789*24.30] mapped 16799 sequences
[M::worker_pipeline::92.489*24.39] mapped 16780 sequences
[M::worker_pipeline::102.179*24.46] mapped 16793 se

'/valhalla/gisaid/sequences_2021-01-28.sam'

In [17]:
# Load a gzip-compressed CSV into `df`.
# NOTE(review): `df` is not referenced by any other visible cell, and this file is
# dated 2021-01-30 while the next cell loads the 2021-01-25 files -- confirm which
# snapshot the analysis is meant to use.
df = pd.read_csv('/valhalla/gisaid/subs_long_2021-01-30.csv.gz', compression='gzip')

In [4]:
# Analysis configuration and input tables.
# NOTE(review): `date` is 2021-01-26 but both CSVs below are dated 2021-01-25 --
# confirm this is intentional. `date`, `countries_fp` and `states_fp` are not used
# in the visible cells.
date = '2021-01-26'
countries_fp = '/home/al/data/geojsons/countries.geo.json'
states_fp = '/home/al/data/geojsons/us-states.json'
# `subs` and `dels` are the two gzip-compressed CSVs that get concatenated into `muts` below.
subs = pd.read_csv('/home/al/analysis/gisaid/subs_long_2021-01-25.csv.gz', 
                   compression='gzip')
dels = pd.read_csv('/home/al/analysis/gisaid/dels_long_2021-01-25.csv.gz', 
                   compression='gzip')

In [5]:
# Report the shapes of the two inputs, then stack them into a single table.
# The `subs` shape is printed BEFORE the 'type' column is added, so `subs` has one
# more column afterwards than the first printed line shows.
print(subs.shape)
print(dels.shape)
# Mutates `subs` in place: every row is labelled as a substitution.
# NOTE(review): presumably `dels` already carries its own 'type' values -- verify.
subs['type'] = 'substitution'
# pd.concat keeps the union of columns (missing values become NaN) and, with the
# default ignore_index=False, keeps each frame's own row labels, so index labels
# repeat in `muts`; later cells use positional .iloc access.
muts = pd.concat([subs, dels])
print(muts.shape)

(6328749, 38)
(117950, 44)
(6446699, 47)


In [8]:
# Pre-process a copy of `muts` (the original frame is passed as a copy, so `muts`
# itself is left for the old-vs-new comparison below).
# NOTE(review): `prime_mutation_logic` is neither defined nor imported in the
# visible cells -- on a fresh kernel this raises NameError unless it is defined
# elsewhere in the notebook; confirm where it comes from.
muts_2 = prime_mutation_logic(muts.copy())

In [9]:
# Pick two DISTINCT sample (strain) identifiers at random.
#
# Sampling from the de-duplicated 'strain' values, instead of drawing two
# independent row positions, guarantees s_x != s_y. Two row draws could return the
# same row, or two rows that presumably belong to the same strain, which would
# make the set comparisons in the following cells a sample-vs-itself no-op.
# It also keeps s_x / s_y as strain names throughout rather than reusing the
# names first for integer positions and then for strings.
#
# Series.sample draws without replacement by default and raises ValueError when
# fewer than two strains are available. Missing strain values are excluded.
# NOTE(review): no seed is set, so the chosen pair differs on every run.
s_x, s_y = muts['strain'].dropna().drop_duplicates().sample(n=2).tolist()

In [11]:
# debugging & profiling: time a single lookup with the new implementation.
# time.perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock intended for measuring short intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
get_sample_mutations(muts_2, sample_id=s_x)
end = time.perf_counter()
print(f"Execution time: {end-start} s")

Execution time: 0.09739899635314941 s


In [12]:
# debugging & profiling: time the same lookup with the old implementation, which
# takes the un-primed `muts` table, for comparison.
# time.perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock intended for measuring short intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
get_sample_mutations_old(muts, sample_id=s_x)
end = time.perf_counter()
print(f"Execution time: {end-start} s")

Execution time: 6.325293302536011 s


In [17]:
# Collect the mutations of each of the two chosen samples; the recorded outputs
# print as Python sets, so the set operators below apply.
m_x = get_sample_mutations(muts_2, sample_id=s_x)
m_y = get_sample_mutations(muts_2, sample_id=s_y)
print(f"Common mutations between samples {s_x} and {s_y}:\n")
# Intersection: mutations present in both samples.
print(m_x & m_y)

Common mutations between samples Chile/MA-194114-B/2020 and USA/MI-UM-10036454184/2020:

{'ORF1a:F924F', 'S:D614G', '5UTR:R81C', 'ORF1b:P314L'}


In [18]:
# Union: mutations present in at least one of the two samples.
# Relies on m_x / m_y from the previous cell.
print(f"Mutations found in either samples {s_x} and {s_y}:\n")
print(m_x | m_y)

Mutations found in either samples Chile/MA-194114-B/2020 and USA/MI-UM-10036454184/2020:

{'ORF1b:A302S', 'N:P67S', 'ORF1b:P314L', 'ORF1a:F2602F', 'ORF1b:R2613C', '5UTR:R81C', 'S:A924A', 'N:D377Y', 'ORF1b:Y23Y', 'ORF3a:Q57H', 'ORF1a:L3352F', 'S:T307I', 'S:T723T', 'N:S2F', 'ORF1b:N1653D', 'N:P199L', 'ORF1a:N786N', 'S:D614G', 'ORF1b:Y446Y', 'ORF1a:E2940E', 'N:R203K', 'ORF1a:T1250I', 'ORF1a:T265I', 'ORF1b:D412D', 'ORF1a:D3897D', 'ORF3a:G172V', 'ORF1a:T1246I', 'S:E1182Q', 'ORF1a:F924F', 'ORF8:S24L', 'ORF1a:M2606I', 'ORF1a:P4312P', 'ORF7b:L17L', 'N:G204R', 'ORF1a:G3278S', 'ORF1b:L1531L'}


In [19]:
# Difference: mutations seen in sample s_x that sample s_y does not have.
# Relies on m_x / m_y computed in an earlier cell.
print(f"Mutations found in sample {s_x} but NOT in {s_y}:\n")
print(m_x - m_y)

Mutations found in sample Chile/MA-194114-B/2020 but NOT in USA/MI-UM-10036454184/2020:

{'S:T723T', 'ORF1b:A302S', 'N:S2F', 'ORF1a:F2602F', 'ORF1a:N786N', 'ORF1a:T1246I', 'S:E1182Q', 'S:A924A', 'ORF1a:P4312P', 'N:R203K', 'ORF7b:L17L', 'ORF1a:T1250I', 'N:G204R', 'S:T307I', 'ORF1a:G3278S', 'ORF1b:Y23Y', 'ORF1b:D412D', 'ORF1b:L1531L'}


In [20]:
# Difference in the other direction: mutations seen in sample s_y that sample
# s_x does not have. Relies on m_x / m_y computed in an earlier cell.
print(f"Mutations found in sample {s_y} but NOT in {s_x}:\n")
print(m_y - m_x)

Mutations found in sample USA/MI-UM-10036454184/2020 but NOT in Chile/MA-194114-B/2020:

{'ORF1a:D3897D', 'ORF3a:G172V', 'ORF1b:N1653D', 'N:P67S', 'N:P199L', 'ORF3a:Q57H', 'ORF1b:R2613C', 'ORF1b:Y446Y', 'ORF1a:L3352F', 'ORF1a:M2606I', 'ORF1a:E2940E', 'ORF8:S24L', 'N:D377Y', 'ORF1a:T265I'}
