# 1. Parameters

In [1]:
# Input/output locations for this notebook, given as plain strings.
# Directory of split sequence files; presumably per-chunk FASTA (TODO confirm).
# It is only converted to a Path below and not otherwise used in the cells shown here.
sequences_split_dir = 'input/split'
# Gzipped FASTA handed to pangolin as its input sequences.
sequences_subset_all = 'input/all.fasta.gz'
# Directory passed to pangolin via `-o`; `lineage_report.csv` is read back from it.
pangolin_out_dir = 'input/pangolin'
# Tab-separated metadata table that the pangolin results are joined onto.
subset_metadata_path = 'input/metadata-subsample.tsv'
# Destination of the joined metadata + pangolin table (tab-separated).
output_metadata_pangolin_path = 'input/metadata-subsample-pangolin.tsv'

In [2]:
from pathlib import Path

# Promote every configured location from a plain string to a pathlib.Path in
# one step, so later cells can compose paths with the `/` operator.
(
    sequences_split_dir,
    sequences_subset_all,
    pangolin_out_dir,
    subset_metadata_path,
    output_metadata_pangolin_path,
) = map(
    Path,
    (
        sequences_split_dir,
        sequences_subset_all,
        pangolin_out_dir,
        subset_metadata_path,
        output_metadata_pangolin_path,
    ),
)

# 2. Run pangolin

In [3]:
# Record the versions of pangolin and its companion tools/data as installed in
# the `pangolin` conda environment, for provenance of the lineage calls below.
!conda run --name pangolin pangolin --all-versions

pangolin: 4.1.2
pangolin-data: 1.14
constellations: v0.1.10
scorpio: 0.3.17
usher 0.5.6
gofasta 1.1.0
minimap2 2.24-r1122
faToVcf: 426



In [6]:
# Running pangolin over the full sequence set is slow, so this cell only
# prints the command; it is meant to be run on the command line rather than
# from inside the notebook.
pangolin_cmd = f"conda run --name pangolin pangolin -t 52 -o {pangolin_out_dir} {sequences_subset_all}"
print(pangolin_cmd)

conda run --name pangolin pangolin -t 52 -o input/pangolin input/all.fasta.gz


# 3. Load pangolin results and join them to the metadata

## 3.1. Load pangolin results

In [7]:
import pandas as pd

# The lineage report is expected inside the directory that was given to
# pangolin as its `-o` output location.
pangolin_report = pangolin_out_dir / 'lineage_report.csv'

# Comma-separated report; its `taxon` column is the key used later to join
# these lineage calls onto the metadata.
p_df = pd.read_csv(pangolin_report)
p_df

Unnamed: 0,taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,scorpio_notes,version,pangolin_version,scorpio_version,constellation_version,is_designated,qc_status,qc_notes,note
0,Switzerland/120005/2020,B.1.1,0.596296,,,,,,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.26,Usher placements: B(1/270) B.1(29/270) B.1.1(1...
1,Switzerland/120040/2020,B.1.1,0.000000,,,,,,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.02,Usher placements: B.1.1(1/1)
2,Switzerland/100128/2020,B.1.93,0.000000,,,,,,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.02,Usher placements: B.1.93(1/1)
3,Switzerland/100814/2020,B.1,0.000000,,,,,,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.02,Usher placements: B.1(2/2)
4,Switzerland/101130/2020,B.1.1,0.000000,,,,,,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.02,Usher placements: B.1.1(1/1)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,LR794648,B.1.273,0.000000,,,,,,PANGO-v1.14,4.1.2,0.3.17,v0.1.10,True,pass,Ambiguous_content:0.02,Assigned from designation hash.
99996,LR794654,B.1,0.000000,,,,,,PANGO-v1.14,4.1.2,0.3.17,v0.1.10,True,pass,Ambiguous_content:0.02,Assigned from designation hash.
99997,LR881662,B.1.160,0.000000,,,,,,PANGO-v1.14,4.1.2,0.3.17,v0.1.10,True,pass,Ambiguous_content:0.02,Assigned from designation hash.
99998,LR881721,B.1.416.1,0.000000,,,,,,PANGO-v1.14,4.1.2,0.3.17,v0.1.10,True,pass,Ambiguous_content:0.02,Assigned from designation hash.


## 3.2. Load other metadata

In [8]:
import pandas as pd

# The subsampled metadata file is tab-separated, hence the explicit sep='\t'.
metadata_df = pd.read_csv(subset_metadata_path, sep='\t')
metadata_df

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,QC_stop_codons,QC_overall_score,QC_overall_status,frame_shifts,deletions,insertions,substitutions,aaSubstitutions,clock_deviation,missing_data_percent
0,Scotland/QEUH-3DF3D03/2022,ncov,EPI_ISL_12630193,OW855487,ERR9720965,2022-05-03,Europe,United Kingdom,Scotland,,...,good,0.000000,good,,"11288-11296,21633-21641,28362-28370,29734-29759",,"C241T,T670G,G1857T,C2790T,C3037T,G4184A,C4321T...","E:T9I,M:Q19E,M:A63T,N:P13L,N:R203K,N:G204R,N:S...",2.0,0.421362
1,Denmark/DCGC-384569/2022,ncov,EPI_ISL_10258944,OX022835,ERS12093125,2022-02-15,Europe,Denmark,Syddanmark,,...,good,0.000000,good,,"509-523,11288-11296,21633-21641,28362-28370",,"C241T,T670G,C2790T,C3037T,G4184A,C4321T,C9344T...","E:T9I,M:Q19E,M:A63T,N:P13L,N:R203K,N:G204R,N:S...",5.0,0.404642
2,OW500387,ncov,?,OW500387,ERS11571025,2022-01-17,Europe,United Kingdom,United Kingdom,,...,good,3.666516,good,,"6513-6515,11285-11293,21765-21770,21987-21995,...",22204:GAGCCAGAA,"C241T,A2832G,C3037T,C3241T,T5386G,G5924A,G8393...","E:T9I,M:D3G,M:Q19E,M:A63T,N:P13L,N:R203K,N:G20...",-3.0,2.732167
3,USA/TG1138574/2022,ncov,EPI_ISL_8874412,ON541191,SRR19475243,2022-01-03,North America,USA,Arizona,Pima County,...,good,1.623265,good,,"6513-6515,11285-11293,21765-21770,21987-21995,...",22204:GAGCCAGAA,"C241T,A1992G,A2832G,C3037T,T5386G,G8393A,C1002...","E:T9I,M:D3G,M:A63T,N:P13L,N:R203K,N:G204R,N:D3...",0.0,2.153630
4,USA/TX-CDC-ASC210194138/2021,ncov,EPI_ISL_5230152,OK406723,,2021-08-20,North America,USA,Texas,,...,good,6.250000,good,,"22029-22034,28248-28253,28271",,"G210T,T224C,C241T,C1218T,C3037T,G4181T,C6402T,...","M:I82T,N:D63G,N:R203M,N:G215C,N:D377Y,ORF1a:S3...",3.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Switzerland/SO-ETHZ-561547/2020,ncov,EPI_ISL_1598444,OU190135,ERS6825234,2021-03-29,Europe,Switzerland,Solothurn,,...,good,0.000000,good,,"11288-11296,21765-21770,21992-21994,28271",,"C241T,C913T,C3037T,C3267T,A4964G,C5388A,C5986T...","N:D3L,N:R203K,N:G204R,N:S235F,ORF1a:T1001I,ORF...",0.0,0.030097
99996,Denmark/DCGC-569733/2022,ncov,?,OX329283,ERS13409574,2022-08-18,Europe,Denmark,Midtjylland,,...,good,26.361111,good,,"11288-11296,21633-21641,21765-21770,28362-28370",,"C241T,T670G,C1627T,C2790T,C3037T,C3294T,T3324C...","E:T9I,M:A63T,N:P13L,N:R95C,N:R203K,N:G204R,N:S...",0.0,0.458148
99997,England/MILK-3A04FDF/2022,ncov,EPI_ISL_11078341,OW210645,ERR9333689,2022-03-09,Europe,United Kingdom,England,,...,good,0.000000,good,,"11288-11296,21633-21641,28362-28370,29734-29759",,"C241T,T670G,C2790T,C3037T,G4184A,C4321T,C9344T...","E:T9I,M:Q19E,M:A63T,N:P13L,N:R203K,N:G204R,N:S...",0.0,0.421362
99998,England/ALDP-B5CC7F/2020,ncov,EPI_ISL_655902,OB997493,ERR4973707,2020-11-08,Europe,United Kingdom,England,,...,good,0.000000,good,,,,"C66T,C241T,T445C,A1987G,C3037T,G4187A,C6286T,C...","N:A220V,ORF1a:G1308S,ORF1a:K3839R,ORF1b:P314L,...",3.0,0.404642


In [9]:
# Show the full column list, since the frame preview elides the middle columns.
metadata_df.columns

Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession',
       'sra_accession', 'date', 'region', 'country', 'division', 'location',
       'region_exposure', 'country_exposure', 'division_exposure', 'segment',
       'length', 'host', 'age', 'sex', 'Nextstrain_clade', 'pango_lineage',
       'GISAID_clade', 'originating_lab', 'submitting_lab', 'authors', 'url',
       'title', 'paper_url', 'date_submitted', 'sampling_strategy',
       'Nextclade_pango', 'missing_data', 'divergence', 'nonACGTN', 'coverage',
       'rare_mutations', 'reversion_mutations', 'potential_contaminants',
       'QC_missing_data', 'QC_mixed_sites', 'QC_rare_mutations',
       'QC_snp_clusters', 'QC_frame_shifts', 'QC_stop_codons',
       'QC_overall_score', 'QC_overall_status', 'frame_shifts', 'deletions',
       'insertions', 'substitutions', 'aaSubstitutions', 'clock_deviation',
       'missing_data_percent'],
      dtype='object')

## 3.3. Join to pangolin results

In [10]:
# Attach the pangolin lineage calls to the metadata, matching the metadata
# `strain` name against pangolin's `taxon` name.
#
# The join stays an inner join (pandas' default, now spelled out), but it is
# guarded against the two silent failure modes of a plain merge:
#   * validate='one_to_one' raises pandas.errors.MergeError if either key
#     column contains duplicates, which would otherwise multiply rows;
#   * the assertion below fails if any metadata row found no pangolin result
#     and was therefore dropped by the inner join.
metadata_pangolin_df = metadata_df.merge(
    p_df,
    how='inner',
    left_on='strain',
    right_on='taxon',
    validate='one_to_one',
)

# With unique keys on both sides, any shortfall in row count is exactly the
# number of metadata strains that had no matching pangolin taxon.
unmatched_count = len(metadata_df) - len(metadata_pangolin_df)
assert unmatched_count == 0, (
    f'{unmatched_count} metadata rows have no matching pangolin result'
)

metadata_pangolin_df

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,scorpio_conflict,scorpio_notes,version,pangolin_version,scorpio_version,constellation_version,is_designated,qc_status,qc_notes,note
0,Scotland/QEUH-3DF3D03/2022,ncov,EPI_ISL_12630193,OW855487,ERR9720965,2022-05-03,Europe,United Kingdom,Scotland,,...,0.00,scorpio call: Alt alleles 64; Ref alleles 0; A...,PANGO-v1.14,4.1.2,0.3.17,v0.1.10,True,pass,Ambiguous_content:0.02,Assigned from designation hash.
1,Denmark/DCGC-384569/2022,ncov,EPI_ISL_10258944,OX022835,ERS12093125,2022-02-15,Europe,Denmark,Syddanmark,,...,0.00,scorpio call: Alt alleles 64; Ref alleles 0; A...,PANGO-v1.14,4.1.2,0.3.17,v0.1.10,True,pass,Ambiguous_content:0.02,Assigned from designation hash.
2,OW500387,ncov,?,OW500387,ERS11571025,2022-01-17,Europe,United Kingdom,United Kingdom,,...,0.00,scorpio call: Alt alleles 51; Ref alleles 0; A...,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.04,Usher placements: BA.1.17.2(3/3)
3,USA/TG1138574/2022,ncov,EPI_ISL_8874412,ON541191,SRR19475243,2022-01-03,North America,USA,Arizona,Pima County,...,0.00,scorpio call: Alt alleles 50; Ref alleles 0; A...,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.04,Usher placements: BA.1.1(2/2)
4,USA/TX-CDC-ASC210194138/2021,ncov,EPI_ISL_5230152,OK406723,,2021-08-20,North America,USA,Texas,,...,0.08,scorpio call: Alt alleles 12; Ref alleles 1; A...,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.02,Usher placements: AY.39(1/1)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Switzerland/SO-ETHZ-561547/2020,ncov,EPI_ISL_1598444,OU190135,ERS6825234,2021-03-29,Europe,Switzerland,Solothurn,,...,0.04,scorpio call: Alt alleles 22; Ref alleles 1; A...,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.02,Usher placements: Q.4(1/1)
99996,Denmark/DCGC-569733/2022,ncov,?,OX329283,ERS13409574,2022-08-18,Europe,Denmark,Midtjylland,,...,0.02,scorpio call: Alt alleles 58; Ref alleles 1; A...,PUSHER-v1.14,4.1.2,0.3.17,v0.1.10,False,pass,Ambiguous_content:0.02,Usher placements: BA.5.2(1/1)
99997,England/MILK-3A04FDF/2022,ncov,EPI_ISL_11078341,OW210645,ERR9333689,2022-03-09,Europe,United Kingdom,England,,...,0.00,scorpio call: Alt alleles 64; Ref alleles 0; A...,PANGO-v1.14,4.1.2,0.3.17,v0.1.10,True,pass,Ambiguous_content:0.02,Assigned from designation hash.
99998,England/ALDP-B5CC7F/2020,ncov,EPI_ISL_655902,OB997493,ERR4973707,2020-11-08,Europe,United Kingdom,England,,...,,,PANGO-v1.14,4.1.2,0.3.17,v0.1.10,True,pass,Ambiguous_content:0.02,Assigned from designation hash.


# 4. Save joined metadata to file

In [11]:
# The output location is a notebook parameter and may point at a directory
# that does not exist yet; create it so the write below cannot fail for that
# reason. This is a no-op when the directory is already present.
output_metadata_pangolin_path.parent.mkdir(parents=True, exist_ok=True)

# Written tab-separated, like the input metadata; index=False leaves the
# merged frame's row index out of the file.
metadata_pangolin_df.to_csv(output_metadata_pangolin_path, sep='\t', index=False)

print(f'Saved metadata and pangolin to {output_metadata_pangolin_path}')

Saved metadata and pangolin to input/metadata-subsample-pangolin.tsv
