In [1]:
import sys
sys.path.append('../src/')
import shutil
import numpy as np
import pandas as pd
import re
from path import Path
import plotly
import plotly.express as px
import plotly.graph_objects as go
import bjorn_support as bs
import mutations as bm

## Identifying Mutations (Sequence Analysis) 

In [2]:
# Locate the Illumina consensus FASTA files under the analysis root.
# NOTE(review): later cells call `.items()` on the result and read `fp[0]`,
# so this is presumably a mapping of sample id -> list of paths — confirm
# against bjorn_support.get_filepaths.
seqs_fps = bs.get_filepaths(
    '/valhalla/analysis/',
    data_fmt='fa',
    data_type='consensus',
    tech='illumina',
    generalised=True,
)

In [3]:
# Number of entries returned by the consensus FASTA search.
len(seqs_fps)

3034

In [4]:
# Root folder for the files this notebook writes; created if it does not exist yet.
out_dir = Path('/valhalla/ancestral_anomaly/distances')
if not out_dir.isdir():
    out_dir.mkdir()

In [5]:
# Stage a copy of the first FASTA listed for every sample in one folder,
# keeping the original file name.
fa_dir = out_dir / 'fa'
if not fa_dir.isdir():
    fa_dir.mkdir()
for sample_fps in seqs_fps.values():
    src_fp = sample_fps[0]
    fa_name = src_fp.split('/')[-1]
    shutil.copy(src_fp, f'{fa_dir}/{fa_name}')

NameError: name 'fa_dir' is not defined

In [6]:
# Path of the combined (unaligned) multi-FASTA built from the staged files.
out_seqs_fp = out_dir/'analysis_seqs_2021-04-23.fa'

In [19]:
# Concatenate every staged FASTA into one multi-FASTA file.
# The loop appends (`>>`), so the target is removed first: without that,
# re-running this cell would add a second copy of every sequence to the file.
cmd = f'rm -f {out_seqs_fp} ; for a in $(ls {fa_dir}/*.fa) ; do cat $a >> {out_seqs_fp} ; done'
bs.run_command(cmd)

In [7]:
# Path of the reference-aligned multi-FASTA (output of the alignment step).
msa_fp = out_dir/'analysis_aligned_2021-04-23.fa'

In [110]:
# Align the combined sequences against the NC045512 reference, reading
# `out_seqs_fp` and writing `msa_fp`.
# NOTE(review): `num_cpus=25` is hard-coded for the original host — adjust as needed.
bs.align_fasta_reference(out_seqs_fp, 
                         msa_fp, 
                         ref_fp='/home/al/data/hcov19/NC045512.fasta',
                         num_cpus=25)

In [8]:
# Load the aligned FASTA produced by the alignment step.
msa = bs.load_fasta(msa_fp, is_aligned=True)

In [9]:
# Identify substitutions per sample from the alignment; the second return
# value is not used in this notebook.
subs, _ = bm.identify_replacements_per_sample(msa)
# subs

Initial cleaning...
Identifying mutations...
Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Fetching alternative codon...
Mapping amino acids...
Naming substitutions
Fusing with metadata...


In [10]:
# Identify deletions per sample from the alignment; the second return value
# is not used in this notebook.
dels, _ = bm.identify_deletions_per_sample(msa)
# dels

Initial cleaning...
Identifying deletions...
Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Mapping amino acids...
Naming deletions
Fuse with metadata...


In [11]:
# Columns available in the substitutions table.
subs.columns

Index(['idx', 'seq_len', 'replacements', 'pos', 'gene', 'gene_start_pos',
       'codon_num', 'codon_start', 'ref_codon', 'alt_codon', 'ref_aa',
       'alt_aa', 'mutation', 'type'],
      dtype='object')

In [13]:
# Per-sample mutation counts: distinct amino-acid level mutations and
# distinct nucleotide replacements, one row per FASTA header.
subs_info = (
    subs.groupby('idx')
        .agg(num_aa_muts=('mutation', 'nunique'),
             num_nt_muts=('replacements', 'nunique'))
        .reset_index()
        .rename(columns={'idx': 'fasta_hdr'})
)
# The sample id is the second underscore-separated field of the FASTA header.
subs_info['SAMPLE_ID'] = [hdr.split('_')[1] for hdr in subs_info['fasta_hdr']]
# subs_info.loc[subs_info['num_aa_muts']!=subs_info['num_nt_muts']]

## Fetch Mapped-Unmapped Statistics (QC Analysis) 

In [14]:
# Gather every mapped/unmapped report into a single frame with one row per sample.
map_fps = bs.get_filepaths(
    '/valhalla/analysis/',
    data_fmt='tsv',
    data_type='merged_aligned_bams',
    tech='illumina/reports',
    generalised=True,
    return_type='list',
)
map_reports = [pd.read_csv(report_fp, sep='\t') for report_fp in map_fps]
map_df = pd.concat(map_reports).rename(columns={'SAMPLE': 'PATH_mapped'})
# The sample id is the file-name prefix before the first underscore.
map_df['SAMPLE_ID'] = [bam_fp.split('/')[-1].split('_')[0] for bam_fp in map_df['PATH_mapped']]
# Ascending sort + keep='last' retains, per sample, the row sorted last by 'mapped'
# (i.e. the report with the most mapped reads).
map_df = map_df.sort_values('mapped').drop_duplicates('SAMPLE_ID', keep='last')
map_df.head()

Unnamed: 0,PATH_mapped,mapped,unmapped,SAMPLE_ID
53,/valhalla/analysis/2021.02.15_new_primers/merg...,12,54,SEARCH-7036
22,/valhalla/analysis/2021.03.15_new_primers/merg...,890,116,SEARCH-7578
17,/valhalla/analysis/2021.03.15_new_primers/merg...,909,109,SEARCH-7573
10,/valhalla/analysis/2021.03.19_county_samples/m...,1093,1053,NTC-SCV2-031521-AM-1-V2
63,/valhalla/analysis/2021.04.07_old_primers/merg...,1318,82,SEARCH-8350


In [29]:
# Attach mapped/unmapped read counts to the per-sample mutation counts.
# Shapes are printed before and after because the default (inner) join
# silently drops samples that are missing from either side.
print(subs_info.shape)
data_df = subs_info.merge(map_df, on='SAMPLE_ID')
print(data_df.shape)

(3030, 4)
(3030, 7)


## Fetch Depth Statistics (QC Analysis) 

In [30]:
# Gather every coverage/depth report into a single frame with one row per sample.
cov_cols = ['SAMPLE', 'COVERAGE', 'AVG_DEPTH']
cov_fps = bs.get_filepaths(
    '/valhalla/analysis/',
    data_fmt='tsv',
    data_type='trimmed_bams',
    tech='illumina/reports',
    generalised=True,
    return_type='list',
)
cov_reports = [pd.read_csv(report_fp, sep='\t') for report_fp in cov_fps]
cov_df = pd.concat(cov_reports)[cov_cols]
# Force string type so the split below works on every row.
cov_df['SAMPLE'] = cov_df['SAMPLE'].astype(str)
cov_df = cov_df.rename(columns={'SAMPLE': 'PATH_depth'})
# The sample id is the BAM file-name prefix before the first underscore.
cov_df['SAMPLE_ID'] = [bam_name.split('_')[0] for bam_name in cov_df['PATH_depth']]
# Ascending sort + keep='last' retains, per sample, the row sorted last by 'COVERAGE'.
cov_df = cov_df.sort_values('COVERAGE').drop_duplicates('SAMPLE_ID', keep='last')
# cov_df.loc[~cov_df['SAMPLE_ID'].str.contains('SEARCH')]
print(cov_df.shape)
cov_df.head()

(3036, 4)


Unnamed: 0,PATH_depth,COVERAGE,AVG_DEPTH,SAMPLE_ID
0,SEARCH-7036_L001_L002.trimmed.sorted.bam,0.0,0.0457,SEARCH-7036
7,NTC-SCV2-031521-AM-1-V2_L001.trimmed.sorted.bam,0.0,0.0,NTC-SCV2-031521-AM-1-V2
1,SEARCH-7006_L001_L002.trimmed.sorted.bam,1.18331,9.54065,SEARCH-7006
0,SEARCH-6800_L001_L002.trimmed.sorted.bam,2.05039,26.0206,SEARCH-6800
1,SEARCH-6799_L001_L002.trimmed.sorted.bam,2.07079,22.576,SEARCH-6799


In [31]:
# Attach coverage/depth statistics to the frame that already holds the
# mutation counts and mapped/unmapped statistics (inner join on SAMPLE_ID).
print(data_df.shape)
data_df = data_df.merge(cov_df, on='SAMPLE_ID')
print(data_df.shape)

(3030, 7)
(3030, 10)


## Fetch Trim Statistics (QC Analysis) 

In [32]:
# Collect the trimming log files found under the analysis root.
analysis_fp = '/valhalla/analysis/'
trim_fps = bs.get_filepaths(
    analysis_fp,
    data_type='logs/trimmed',
    data_fmt='log',
    generalised=True,
    return_type='list',
)
# bs.copy_files(fastq_fps, destination_dir)
len(trim_fps)

3070

In [35]:
# Parse every trimming log into one row of primer/quality trimming statistics.
#
# Rows are collected in a list and turned into a DataFrame once: growing a
# frame with `DataFrame.append` inside the loop is quadratic and the method
# was removed in pandas 2.0.
trim_cols = ['SAMPLE_ID', 'trimmed_pct', 'quality_pct',
             'trimmed_count', 'quality_count', 'PATH_trim']
trim_records = []
# Sample ids for which at least one log lacked the expected summary lines.
missing_trims = []
for fp in trim_fps:
    # The sample id is the log file-name prefix before the first underscore.
    s_id = fp.split('/')[-1].split('_')[0]
    sample_data = {'SAMPLE_ID': s_id, 'PATH_trim': fp}
    with open(fp, 'r') as fh:
        data = fh.readlines()
    try:
        # Each summary line carries a percentage followed by a read count,
        # e.g. 'Trimmed primers from 17.61% (108729) of reads.'
        trim_line = [l for l in data if 'Trimmed primers' in l][0]
        sample_data['trimmed_pct'], sample_data['trimmed_count'] = re.findall(r'\d+(?:\.\d+)?', trim_line)
        quality_line = [l for l in data if 'quality trimmed' in l][0]
        sample_data['quality_pct'], sample_data['quality_count'] = re.findall(r'\d+(?:\.\d+)?', quality_line)[:2]
    except (IndexError, ValueError):
        # IndexError: no matching line in the log; ValueError: the line did not
        # contain exactly the expected numbers. Anything else is a real error
        # and is allowed to propagate instead of being swallowed.
        missing_trims.append(s_id)
    # The row is kept even when parsing failed; its missing fields become NaN.
    trim_records.append(sample_data)
trim_df = pd.DataFrame(trim_records, columns=trim_cols)
# Missing counts are encoded as -1 so the columns can be integer-typed.
trim_df[['trimmed_count', 'quality_count']] = trim_df[['trimmed_count', 'quality_count']].fillna(-1).astype(int)
# Ascending sort + keep='last' retains, per sample, the log with the highest
# trimmed count, so a successfully parsed log wins over a failed (-1) one.
trim_df = trim_df.sort_values('trimmed_count').drop_duplicates('SAMPLE_ID', keep='last')
print(trim_df.shape)
trim_df.head()

(3035, 6)


Unnamed: 0,SAMPLE_ID,trimmed_pct,quality_pct,trimmed_count,quality_count,PATH_trim
1550,NTC-SCV2-031521-AM-1-V2,0.0,0.0,0,0,/valhalla/analysis/2021.03.19_county_samples/l...
42,SEARCH-7036,100.0,0.0,12,0,/valhalla/analysis/2021.02.15_new_primers/logs...
106,SEARCH-8350,16.43,2.88,217,38,/valhalla/analysis/2021.04.07_old_primers/logs...
863,SEARCH-8175,16.29,4.38,309,83,/valhalla/analysis/2021.03.29_old_primers/logs...
796,SEARCH-8151,15.91,4.53,471,134,/valhalla/analysis/2021.03.29_old_primers/logs...


In [36]:
# Samples with at least one trim log that lacked the expected
# 'Trimmed primers' / 'quality trimmed' summary lines (the log file itself
# exists; it could be opened and read).
missing_trims

['SEARCH-6132',
 'SEARCH-6127',
 '21-306985-SCV2-033021-AM-1-TB-v2-20pM',
 '21-308908-SCV2-033021-AM-1-TB-v2-20pM',
 '21-305648-SCV2-033021-AM-1-TB-v2-20pM',
 '21-298438-SCV2-031521-AM-1-V2',
 '21-170487-SCV2-031521-AM-1-V2',
 '21-295987-SCV2-031521-AM-1-V2',
 '21-295989-SCV2-031521-AM-1-V2',
 '21-227289-SCV2-031521-AM-1-V2']

In [37]:
# Attach the trimming statistics to the frame that already holds mutation
# counts, mapped/unmapped and coverage/depth statistics (inner join on SAMPLE_ID).
print(data_df.shape)
data_df = data_df.merge(trim_df, on='SAMPLE_ID')
print(data_df.shape)
data_df.head()

(3030, 10)
(3030, 15)


Unnamed: 0,fasta_hdr,num_aa_muts,num_nt_muts,SAMPLE_ID,PATH_mapped,mapped,unmapped,PATH_depth,COVERAGE,AVG_DEPTH,trimmed_pct,quality_pct,trimmed_count,quality_count,PATH_trim
0,Consensus_21-170487-SCV2-031521-AM-1-V2_L001_t...,22,22,21-170487-SCV2-031521-AM-1-V2,/valhalla/analysis/2021.03.22_county_samples/m...,1575848,3682,21-170487-SCV2-031521-AM-1-V2_L001.trimmed.sor...,100.0,7435.68,19.74,1.23,313901,19546,/valhalla/analysis/2021.03.22_county_samples/l...
1,Consensus_21-227289-SCV2-031521-AM-1-V2_L001_t...,27,27,21-227289-SCV2-031521-AM-1-V2,/valhalla/analysis/2021.03.22_county_samples/m...,1690368,4826,21-227289-SCV2-031521-AM-1-V2_L001.trimmed.sor...,100.0,7836.76,25.53,1.16,434093,19794,/valhalla/analysis/2021.03.22_county_samples/l...
2,Consensus_21-256229-SCV-022621-AM_L001_thresho...,29,29,21-256229-SCV-022621-AM,/valhalla/analysis/2021.03.12/merged_aligned_b...,1923028,17100,21-256229-SCV-022621-AM_L001.trimmed.sorted.bam,100.0,7487.77,25.51,2.62,524768,53963,/valhalla/analysis/2021.03.19_county_samples/l...
3,Consensus_21-256232-SCV-022621-AM_L001_thresho...,29,29,21-256232-SCV-022621-AM,/valhalla/analysis/2021.03.12/merged_aligned_b...,2679823,40793,21-256232-SCV-022621-AM_L001.trimmed.sorted.bam,100.0,10253.3,23.81,2.57,658311,71068,/valhalla/analysis/2021.03.19_county_samples/l...
4,Consensus_21-256234-SCV-022621-AM_L001_thresho...,29,29,21-256234-SCV-022621-AM,/valhalla/analysis/2021.03.12/merged_aligned_b...,2304368,46566,21-256234-SCV-022621-AM_L001.trimmed.sorted.bam,100.0,8523.87,28.67,2.65,666276,61627,/valhalla/analysis/2021.03.19_county_samples/l...


In [38]:
# Columns of the merged QC table.
data_df.columns

Index(['fasta_hdr', 'num_aa_muts', 'num_nt_muts', 'SAMPLE_ID', 'PATH_mapped',
       'mapped', 'unmapped', 'PATH_depth', 'COVERAGE', 'AVG_DEPTH',
       'trimmed_pct', 'quality_pct', 'trimmed_count', 'quality_count',
       'PATH_trim'],
      dtype='object')

In [80]:
# Column order for the exported QC table: identifiers and mutation counts
# first, then QC metrics, then provenance paths.
cols = [
    'SAMPLE_ID', 'num_aa_muts', 'num_nt_muts',
    'COVERAGE', 'AVG_DEPTH', 'mapped', 'unmapped',
    'trimmed_pct', 'quality_pct', 'trimmed_count', 'quality_count',
    'PATH_depth', 'PATH_mapped', 'PATH_trim', 'fasta_hdr',
]
data_df = data_df[cols]
# data_df now holds exactly `cols`, so it is written out as is.
data_df.to_csv('/valhalla/ancestral_anomaly/qc_analysis_data.tsv', sep='\t', index=False)
data_df.to_excel('/valhalla/ancestral_anomaly/qc_analysis_data.xlsx', index=False)

In [40]:
# Spot-check the QC row of a single sample.
data_df[data_df['SAMPLE_ID'].eq('SEARCH-8770')]

Unnamed: 0,SAMPLE_ID,num_aa_muts,num_nt_muts,COVERAGE,AVG_DEPTH,mapped,unmapped,trimmed_pct,quality_pct,trimmed_count,quality_count,PATH_depth,PATH_mapped,PATH_trim,fasta_hdr
2847,SEARCH-8770,4,4,98.9255,920.28,637215,26471,85.98,61.21,548783,390716,SEARCH-8770_L001_L002.trimmed.sorted.bam,/valhalla/analysis/2021.04.12_itru_primers/mer...,/valhalla/analysis/2021.04.12_itru_primers/log...,Consensus_SEARCH-8770_L001_L002_threshold_0.5_...


In [41]:
# Preview of the merged QC table after column reordering.
data_df.head()

Unnamed: 0,SAMPLE_ID,num_aa_muts,num_nt_muts,COVERAGE,AVG_DEPTH,mapped,unmapped,trimmed_pct,quality_pct,trimmed_count,quality_count,PATH_depth,PATH_mapped,PATH_trim,fasta_hdr
0,21-170487-SCV2-031521-AM-1-V2,22,22,100.0,7435.68,1575848,3682,19.74,1.23,313901,19546,21-170487-SCV2-031521-AM-1-V2_L001.trimmed.sor...,/valhalla/analysis/2021.03.22_county_samples/m...,/valhalla/analysis/2021.03.22_county_samples/l...,Consensus_21-170487-SCV2-031521-AM-1-V2_L001_t...
1,21-227289-SCV2-031521-AM-1-V2,27,27,100.0,7836.76,1690368,4826,25.53,1.16,434093,19794,21-227289-SCV2-031521-AM-1-V2_L001.trimmed.sor...,/valhalla/analysis/2021.03.22_county_samples/m...,/valhalla/analysis/2021.03.22_county_samples/l...,Consensus_21-227289-SCV2-031521-AM-1-V2_L001_t...
2,21-256229-SCV-022621-AM,29,29,100.0,7487.77,1923028,17100,25.51,2.62,524768,53963,21-256229-SCV-022621-AM_L001.trimmed.sorted.bam,/valhalla/analysis/2021.03.12/merged_aligned_b...,/valhalla/analysis/2021.03.19_county_samples/l...,Consensus_21-256229-SCV-022621-AM_L001_thresho...
3,21-256232-SCV-022621-AM,29,29,100.0,10253.3,2679823,40793,23.81,2.57,658311,71068,21-256232-SCV-022621-AM_L001.trimmed.sorted.bam,/valhalla/analysis/2021.03.12/merged_aligned_b...,/valhalla/analysis/2021.03.19_county_samples/l...,Consensus_21-256232-SCV-022621-AM_L001_thresho...
4,21-256234-SCV-022621-AM,29,29,100.0,8523.87,2304368,46566,28.67,2.65,666276,61627,21-256234-SCV-022621-AM_L001.trimmed.sorted.bam,/valhalla/analysis/2021.03.12/merged_aligned_b...,/valhalla/analysis/2021.03.19_county_samples/l...,Consensus_21-256234-SCV-022621-AM_L001_thresho...


In [56]:
# Samples of interest to look up after QC filtering.
search_ids = [
    'SEARCH-8770',
    'SEARCH-8722',
    'SEARCH-8766',
    'SEARCH-8714',
    'SEARCH-8652',
    'SEARCH-8751',
    'SEARCH-8755',
    'SEARCH-8666',
]
# search_ids = ['SEARCH-8708','SEARCH-8702','SEARCH-8722','SEARCH-8770','SEARCH-8749',
#              'SEARCH-8686','SEARCH-8720','SEARCH-8732']

In [57]:
# Shape of the subset with at least 95% coverage.
data_df[data_df['COVERAGE']>=95].shape

(2515, 15)

In [58]:
# Combined QC filter: coverage, average depth and trimmed-read count must
# all reach their minimum.
cov_ok = data_df['COVERAGE'] >= 95
depth_ok = data_df['AVG_DEPTH'] >= 1000
trimmed_ok = data_df['trimmed_count'] >= 150000
qc_filter = cov_ok & depth_ok & trimmed_ok
ans = data_df.loc[qc_filter]
# Which of the samples of interest survive the filter?
ans.loc[ans['SAMPLE_ID'].isin(search_ids)]

Unnamed: 0,SAMPLE_ID,num_aa_muts,num_nt_muts,COVERAGE,AVG_DEPTH,mapped,unmapped,trimmed_pct,quality_pct,trimmed_count,quality_count,PATH_depth,PATH_mapped,PATH_trim,fasta_hdr


In [59]:
# Shape of the QC-passing subset.
ans.shape

(2343, 15)

In [51]:
# Cast both percentage columns to float.
for pct_col in ('quality_pct', 'trimmed_pct'):
    data_df[pct_col] = data_df[pct_col].astype(float)

In [72]:
# Shape of the full table, then of the subset strictly above 95% coverage.
print(data_df.shape)
data_df[data_df['COVERAGE']>95].shape

(3030, 15)


(2515, 15)

In [78]:
# Coverage vs. number of nucleotide mutations, split into samples that pass
# and samples that fail the coverage threshold.
cov_filter = 95
hover_cols = ['SAMPLE_ID', 'AVG_DEPTH', 'COVERAGE',
              'num_nt_muts', 'mapped', 'trimmed_count']
hover_template = ("<b>Sample: %{text[0]}</b><br>" +
                  "<b>Avg depth: %{text[1]}</b><br>" +
                  "<b>Coverage: %{text[2]}</b><br>" +
                  "<b>Number of nt mutations: %{text[3]}</b><br>" +
                  "<b>Number of mapped reads: %{text[4]}</b><br>" +
                  "<b>Number of trimmed reads: %{text[5]}</b><br>")
# Pass is `>=` and fail is `<`, so a sample sitting exactly on the threshold
# is counted and drawn as passing (strict `>` / `<` put it in neither group).
# This also matches the `>=` comparison used by `qc_filter` in this notebook.
cov_pass = data_df.loc[data_df['COVERAGE'] >= cov_filter]
cov_fail = data_df.loc[data_df['COVERAGE'] < cov_filter]
num_passed = cov_pass.shape[0]
num_failed = cov_fail.shape[0]
fig = go.Figure(
        data=go.Scatter(y=cov_pass['COVERAGE'],
                        x=cov_pass['num_nt_muts'],
                        name=f'Coverage Filter Pass ({num_passed} samples)',
                        mode='markers',
                        text=cov_pass[hover_cols],
                        hovertemplate=hover_template,
                        marker_color='rgba(144,238,144,.6)'))
fig.add_trace(
        go.Scatter(y=cov_fail['COVERAGE'],
                   x=cov_fail['num_nt_muts'],
                   name=f'Coverage Filter Fail ({num_failed} samples)',
                   mode='markers',
                   text=cov_fail[hover_cols],
                   hovertemplate=hover_template,
                   marker_color='rgba(220,20,60,.6)'))
fig.update_layout(title=f'Coverage-based QC filter',
                  legend_title_text=f'Coverage threshold: {cov_filter}%',
                  yaxis_title=f'% Coverage', 
                  xaxis_title='Number of nucleotide mutations',
                  template='plotly_white', autosize=True)
fig.write_html('/valhalla/ancestral_anomaly/coverage_plot.html')
fig.show()

In [79]:
# Average depth vs. number of nucleotide mutations for coverage-passing
# samples, split by whether they reach the depth threshold.
depth_filter = 1000
cov_filter = 95
max_depth = 5000
hover_cols = ['SAMPLE_ID', 'AVG_DEPTH', 'COVERAGE',
              'num_nt_muts', 'mapped', 'trimmed_count']
hover_template = ("<b>Sample: %{text[0]}</b><br>" +
                  "<b>Avg depth: %{text[1]}</b><br>" +
                  "<b>Coverage: %{text[2]}</b><br>" +
                  "<b>Number of nt mutations: %{text[3]}</b><br>" +
                  "<b>Number of mapped reads: %{text[4]}</b><br>" +
                  "<b>Number of trimmed reads: %{text[5]}</b><br>")
# Pass is `>=` and fail is `<`, so a sample exactly on a threshold is counted
# and drawn as passing (strict `>` / `<` put it in neither group). This also
# matches the `>=` comparisons used by `qc_filter` in this notebook.
cov_ok = data_df['COVERAGE'] >= cov_filter
depth_ok = data_df['AVG_DEPTH'] >= depth_filter
depth_low = data_df['AVG_DEPTH'] < depth_filter
# Legend counts cover every coverage-passing sample, including those at or
# above `max_depth` that are left out of the drawing below.
num_passed = data_df[cov_ok & depth_ok].shape[0]
num_failed = data_df[cov_ok & depth_low].shape[0]
# Only samples below `max_depth` are drawn, to keep the y-axis readable.
filt = cov_ok & (data_df['AVG_DEPTH'] < max_depth)
depth_pass = data_df.loc[filt & depth_ok]
depth_fail = data_df.loc[filt & depth_low]
fig = go.Figure(
        data=go.Scatter(y=depth_pass['AVG_DEPTH'],
                        x=depth_pass['num_nt_muts'],
                        name=f'Depth Filter Pass ({num_passed} samples)',
                        mode='markers',
                        text=depth_pass[hover_cols],
                        hovertemplate=hover_template,
                        marker_color='rgba(144,238,144,.6)'))
fig.add_trace(
        go.Scatter(y=depth_fail['AVG_DEPTH'],
                   x=depth_fail['num_nt_muts'],
                   name=f'Depth Filter Fail ({num_failed} samples)',
                   mode='markers',
                   text=depth_fail[hover_cols],
                   hovertemplate=hover_template,
                   marker_color='rgba(220,20,60,.6)'))
fig.update_layout(title=f'Depth-based QC filter (sample with depth > {max_depth} and coverage < {cov_filter}% are ignored)',
                  legend_title_text=f'Depth threshold: {depth_filter}',
                  yaxis_title=f'Average Depth', 
                  xaxis_title='Number of nucleotide mutations',
                  template='plotly_white', autosize=True)
fig.write_html('/valhalla/ancestral_anomaly/depth_plot.html')
fig.show()

In [93]:
# Log-scaled unmapped read count vs. number of nucleotide mutations for
# samples above 95% coverage.
filt = (data_df['COVERAGE']>95)
plot_df = data_df.loc[filt]
# `data_df` has no 'unmapped_log' column (nothing in this notebook creates it
# and the column selection before export would drop it), so the value is
# derived here from 'unmapped' instead of raising a KeyError.
# NOTE(review): natural log is assumed because it reproduces the values in the
# recorded outputs (unmapped=62454 -> unmapped_log=11.042186) — confirm.
# Samples with zero unmapped reads yield -inf.
unmapped_log = np.log(plot_df['unmapped'])
fig = go.Figure(
        data=go.Scatter(y=unmapped_log,
                        x=plot_df['num_nt_muts'],
                        mode='markers',
                        text=plot_df[['SAMPLE_ID', 'AVG_DEPTH', 'COVERAGE', 'num_nt_muts', 'mapped']],
                        hovertemplate="<b>Sample: %{text[0]}</b><br>" +
                                      "<b>Avg depth: %{text[1]}</b><br>" +
                                      "<b>Coverage: %{text[2]}</b><br>" +
                                      "<b>Number of nt mutations: %{text[3]}</b><br>" +
                                      "<b>Number of mapped reads: %{text[4]}</b><br>",
                        marker_color='rgba(220,20,60,.6)'))
fig.update_layout(yaxis_title=f'UnMapped Read Count (logarithmic)', 
                  xaxis_title='Number of nucleotide mutations',
                  template='plotly_white', autosize=True)
fig.show()

In [85]:
# Mapped read count vs. number of nucleotide mutations, restricted to samples
# above 95% coverage with fewer than 300000 mapped reads.
filt = (data_df['COVERAGE'] > 95) & (data_df['mapped'] < 300000)
low_mapped = data_df.loc[filt]
mapped_hover = ("<b>Sample: %{text[0]}</b><br>"
                "<b>Avg depth: %{text[1]}</b><br>"
                "<b>Coverage: %{text[2]}</b><br>"
                "<b>Number of nt mutations: %{text[3]}</b><br>"
                "<b>Number of mapped reads: %{text[4]}</b><br>")
mapped_trace = go.Scatter(
    y=low_mapped['mapped'],
    x=low_mapped['num_nt_muts'],
    mode='markers',
    text=low_mapped[['SAMPLE_ID', 'AVG_DEPTH', 'COVERAGE', 'num_nt_muts', 'mapped']],
    hovertemplate=mapped_hover,
    marker_color='rgba(220,20,60,.6)',
)
fig = go.Figure(data=mapped_trace)
fig.update_layout(
    yaxis_title=f'Mapped Read Count',
    xaxis_title='Number of nucleotide mutations',
    template='plotly_white',
    autosize=True,
)
fig.show()

In [71]:
# Average depth vs. number of nucleotide mutations, restricted to samples
# above 95% coverage with average depth below 5000.
filt = (data_df['COVERAGE'] > 95) & (data_df['AVG_DEPTH'] < 5000)
depth_subset = data_df.loc[filt]
depth_hover = ("<b>Sample: %{text[0]}</b><br>"
               "<b>Avg depth: %{text[1]}</b><br>"
               "<b>Coverage: %{text[2]}</b><br>"
               "<b>Number of nt mutations: %{text[3]}</b><br>")
depth_trace = go.Scatter(
    y=depth_subset['AVG_DEPTH'],
    x=depth_subset['num_nt_muts'],
    mode='markers',
    text=depth_subset[['SAMPLE_ID', 'AVG_DEPTH', 'COVERAGE', 'num_nt_muts']],
    hovertemplate=depth_hover,
    marker_color='rgba(220,20,60,.6)',
)
fig = go.Figure(data=depth_trace)
fig.update_layout(
    yaxis_title=f'Average Depth',
    xaxis_title='Number of nucleotide mutations',
    template='plotly_white',
    autosize=True,
)
fig.show()

In [101]:
# Samples of interest to look up after QC filtering.
search_ids = [
    'SEARCH-8770',
    'SEARCH-8722',
    'SEARCH-8766',
    'SEARCH-8714',
    'SEARCH-8652',
    'SEARCH-8751',
    'SEARCH-8755',
    'SEARCH-8666',
]
# search_ids = ['SEARCH-8708','SEARCH-8702','SEARCH-8722','SEARCH-8770','SEARCH-8749',
#              'SEARCH-8686','SEARCH-8720','SEARCH-8732']

In [106]:
# Shape of the subset with at least 95% coverage.
# NOTE(review): the recorded output has 12 columns, i.e. it was produced from a
# different `data_df` than the 15-column table built above — stale output.
data_df[data_df['COVERAGE']>=95].shape

(2515, 12)

In [104]:
# Shape of the QC-passing subset.
# NOTE(review): `ans` is (re)assigned in the next cell; in a top-to-bottom run
# this shows the value left by the earlier trimmed-count-based filter.
ans.shape

(2397, 12)

In [103]:
# Variant of the combined QC filter that thresholds on mapped reads:
# coverage, average depth and mapped-read count must all reach their minimum.
cov_ok = data_df['COVERAGE'] >= 95
depth_ok = data_df['AVG_DEPTH'] >= 1000
mapped_ok = data_df['mapped'] >= 150000
qc_filter = cov_ok & depth_ok & mapped_ok
ans = data_df.loc[qc_filter]
# Which of the samples of interest survive the filter?
ans.loc[ans['SAMPLE_ID'].isin(search_ids)]

Unnamed: 0,idx,num_aa_muts,num_nt_muts,SAMPLE_ID,SAMPLE_x,COVERAGE,AVG_DEPTH,SAMPLE_y,mapped,unmapped,unmapped_log,mapped_unmapped


In [107]:
# QC-passing samples with fewer than 10 nucleotide mutations.
ans.loc[ans['num_nt_muts']<10]

Unnamed: 0,idx,num_aa_muts,num_nt_muts,SAMPLE_ID,SAMPLE_x,COVERAGE,AVG_DEPTH,SAMPLE_y,mapped,unmapped,unmapped_log,mapped_unmapped
86,Consensus_SEARCH-5990_L001_L002_threshold_0.5_...,9,9,SEARCH-5990,SEARCH-5990_L001_L002.trimmed.sorted.bam,100.0,5631.64,/valhalla/analysis/2021.01.23.hCoV19_2/merged_...,5619312,62454,11.042186,89.975214
241,Consensus_SEARCH-6145_L001_L002_threshold_0.5_...,8,8,SEARCH-6145,SEARCH-6145_L001_L002.trimmed.sorted.bam,100.0,3219.28,/valhalla/analysis/2021.01.23.hCoV19_2/merged_...,3923709,2826823,14.854664,1.388028
365,Consensus_SEARCH-6269_L001_L002_threshold_0.5_...,8,8,SEARCH-6269,SEARCH-6269_L001_L002.trimmed.sorted.bam,98.4121,2468.1,/valhalla/analysis/2021.02.01/merged_aligned_b...,4042879,3145551,14.9615,1.285269
762,Consensus_SEARCH-6666_L001_L002_threshold_0.5_...,4,4,SEARCH-6666,SEARCH-6666_L001_L002.trimmed.sorted.bam,99.0819,1638.1,/valhalla/analysis/2021.02.08/merged_aligned_b...,521606,6243620,15.647071,0.083542
1219,Consensus_SEARCH-7130_L001_L002_threshold_0.5_...,9,9,SEARCH-7130,SEARCH-7130_L001_L002.trimmed.sorted.bam,98.8507,2130.19,/valhalla/analysis/2021.03.04/merged_aligned_b...,1492486,22156212,16.913628,0.067362
2420,Consensus_SEARCH-8343_L001_L002_threshold_0.5_...,9,9,SEARCH-8343,SEARCH-8343_L001_L002.trimmed.sorted.bam,100.0,10096.3,/valhalla/analysis/2021.04.07_old_primers/merg...,2234712,3802,8.243283,587.772751
2422,Consensus_SEARCH-8345_L001_L002_threshold_0.5_...,9,9,SEARCH-8345,SEARCH-8345_L001_L002.trimmed.sorted.bam,100.0,12049.1,/valhalla/analysis/2021.04.07_old_primers/merg...,2570330,17018,9.742027,151.035962
2423,Consensus_SEARCH-8346_L001_L002_threshold_0.5_...,8,8,SEARCH-8346,SEARCH-8346_L001_L002.trimmed.sorted.bam,100.0,13802.8,/valhalla/analysis/2021.04.07_old_primers/merg...,3069671,4491,8.409831,683.516143
2424,Consensus_SEARCH-8347_L001_L002_threshold_0.5_...,8,8,SEARCH-8347,SEARCH-8347_L001_L002.trimmed.sorted.bam,100.0,10983.1,/valhalla/analysis/2021.04.07_old_primers/merg...,2362771,8909,9.094817,265.211696
2425,Consensus_SEARCH-8348_L001_L002_threshold_0.5_...,7,7,SEARCH-8348,SEARCH-8348_L001_L002.trimmed.sorted.bam,99.8436,29413.0,/valhalla/analysis/2021.04.07_old_primers/merg...,6269143,46139,10.739414,135.875138


# DEV

In [34]:
# Substitution rows labelled N:D3L (scratch check of how a multi-nucleotide
# change is represented in `subs`).
subs[subs['mutation']=='N:D3L']

Unnamed: 0,idx,seq_len,replacements,pos,gene,gene_start_pos,codon_num,codon_start,ref_codon,alt_codon,ref_aa,alt_aa,mutation,type
13,Consensus_21-295987-SCV2-031521-AM-1-V2_L001_t...,29674,28279:c,28279,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution
13,Consensus_21-295987-SCV2-031521-AM-1-V2_L001_t...,29674,28280:t,28280,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution
13,Consensus_21-295987-SCV2-031521-AM-1-V2_L001_t...,29674,28281:a,28281,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution
14,Consensus_21-295989-SCV2-031521-AM-1-V2_L001_t...,29674,28279:c,28279,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution
14,Consensus_21-295989-SCV2-031521-AM-1-V2_L001_t...,29674,28280:t,28280,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3028,Consensus_SV0049328_L001_L002_threshold_0.5_qu...,29674,28280:t,28280,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution
3028,Consensus_SV0049328_L001_L002_threshold_0.5_qu...,29674,28281:a,28281,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution
3029,Consensus_SV0049543_L001_L002_threshold_0.5_qu...,29674,28279:c,28279,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution
3029,Consensus_SV0049543_L001_L002_threshold_0.5_qu...,29674,28280:t,28280,N,28273,3,28279,GAT,CTA,D,L,N:D3L,substitution


In [33]:
# Number of distinct nucleotide replacements behind each (sample, mutation)
# pair; show the pairs backed by more than one replacement.
nt_per_mutation = subs.groupby(['idx', 'mutation'])['replacements'].nunique()
t = nt_per_mutation.rename('num_nt_muts').reset_index()
t[t['num_nt_muts']>1]

Unnamed: 0,idx,mutation,num_nt_muts
338,Consensus_21-295987-SCV2-031521-AM-1-V2_L001_t...,N:D3L,3
341,Consensus_21-295987-SCV2-031521-AM-1-V2_L001_t...,N:R203K,2
370,Consensus_21-295989-SCV2-031521-AM-1-V2_L001_t...,N:D3L,3
372,Consensus_21-295989-SCV2-031521-AM-1-V2_L001_t...,N:R203K,2
402,Consensus_21-298438-SCV2-031521-AM-1-V2_L001_t...,N:R203K,2
...,...,...,...
65517,Consensus_SV0048180__threshold_0.5_quality_20,N:D3L,3
65520,Consensus_SV0048180__threshold_0.5_quality_20,N:R203K,2
65554,Consensus_SV0049328_L001_L002_threshold_0.5_qu...,N:D3L,3
65563,Consensus_SV0049543_L001_L002_threshold_0.5_qu...,N:D3L,3


In [130]:
# Scratch check: first two numbers found in `quality_line`.
# NOTE(review): relies on `quality_line` left in the kernel by another cell.
re.findall(r'\d+(?:\.\d+)?', quality_line)[:2]

['10.99', '67824']

In [128]:
# Scratch check: the 'Trimmed primers' line most recently read.
# NOTE(review): relies on `trim_line` left in the kernel by another cell.
trim_line

'Trimmed primers from 17.61% (108729) of reads.\n'

In [132]:
# Scratch cell: run the trim-log parsing on a single file to reproduce a failure.
# NOTE(review): when the log contains no matching line, `[...][0]` raises
# IndexError — the recorded output shows this happening for this file. `data`
# is still assigned before the error and is inspected in the next cell.
fp = '/valhalla/analysis/2021.01.23.hCoV19/logs/trimmed/SEARCH-6132_L001_L002.log'
# fp = trim_fps[3]
with open(fp, 'r') as fh:
    sample_data = {}
    # Sample id is the log file-name prefix before the first underscore.
    sample_data['SAMPLE_ID'] = fp.split('/')[-1].split('_')[0]
    data = fh.readlines()
    # Each summary line is expected to carry a percentage and a read count.
    trim_line = [l for l in data if 'Trimmed primers' in l][0]
    sample_data['trimmed_pct'], sample_data['trimmed_count'] = re.findall(r'\d+(?:\.\d+)?', trim_line)
    quality_line = [l for l in data if 'quality trimmed' in l][0]
    sample_data['quality_pct'], sample_data['quality_count'] = re.findall(r'\d+(?:\.\d+)?', quality_line)[:2]

IndexError: list index out of range

In [134]:
# Inspect the final line of the log read by the previous cell.
# NOTE(review): the subscript had been left empty, which is a SyntaxError.
# -1 is a guess consistent with the recorded output (a progress line, which
# looks like a log that stops mid-run) — confirm the intended index.
data[-1]

'Processed 40% reads ... \n'

In [122]:

# Scratch check: unpack the two numbers found in `trim_line` and show the first.
# NOTE(review): relies on `trim_line` left in the kernel by another cell.
x,y = re.findall(r'\d+(?:\.\d+)?', trim_line)
x

'92.39'

In [None]:
# Scratch check: path of the fourth trim log.
trim_fps[3]

In [64]:
# Scratch check of the merged table's columns.
# NOTE(review): the recorded output lists a 7-column frame, so it predates the
# `data_df` built earlier in this notebook — stale output.
data_df.columns

Index(['idx', 'num_aa_muts', 'num_nt_muts', 'SAMPLE_ID', 'SAMPLE', 'COVERAGE',
       'AVG_DEPTH'],
      dtype='object')