# 1. Parameters

In [1]:
# Notebook inputs/outputs as plain strings; the next cell wraps each one in pathlib.Path.
# NOTE(review): presumably this is the cell overridden by a parameterised run — confirm.
# Presumably a directory of per-sample split sequences; no other cell visible here reads it.
sequences_split_dir = 'input/split'
# Gzipped FASTA handed to pangolin as its query file.
sequences_subset_all = 'input/all.fasta.gz'
# Directory given to pangolin via `-o`; `lineage_report.csv` is read back from it.
pangolin_out_dir = 'input/pangolin'
# Tab-separated metadata table that is joined to the pangolin results.
subset_metadata_path = 'input/metadata-subsample.tsv'
# Destination of the joined metadata + pangolin table (written tab-separated).
output_metadata_pangolin_path = 'input/metadata-subsample-pangolin.tsv'

In [2]:
from pathlib import Path

# Wrap every configured location in a pathlib.Path in one pass so that later
# cells can build sub-paths with `/` and pass the objects straight to pandas.
(
    sequences_split_dir,
    sequences_subset_all,
    pangolin_out_dir,
    subset_metadata_path,
    output_metadata_pangolin_path,
) = map(
    Path,
    (
        sequences_split_dir,
        sequences_subset_all,
        pangolin_out_dir,
        subset_metadata_path,
        output_metadata_pangolin_path,
    ),
)

# 2. Run pangolin

In [3]:
# Ask the pangolin installed in the `pangolin` conda environment to report all of its
# component versions, so the versions behind the lineage calls are recorded in the notebook.
!conda run --name pangolin pangolin --all-versions

pangolin: 3.1.11
pangolearn: 2021-08-24
constellations: v0.0.15
scorpio: 0.3.12
pango-designation: 1.2.76



In [4]:
# Run pangolin on the combined FASTA and write its results into `pangolin_out_dir`
# (a later cell reads `lineage_report.csv` from that directory).
# The `{...}` placeholders are IPython interpolation of the Python Path variables.
# NOTE(review): `-t 52` hard-codes the thread count, presumably sized for the original
# machine — confirm or lower it before re-running elsewhere.
!conda run --name pangolin pangolin -t 52 -o {pangolin_out_dir} {sequences_subset_all}

[32mAll dependencies satisfied.[0m
[32mThe query file is:[0m/home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/sars-cov-2/input/all.fasta.gz
[32m** Sequence QC **[0m
Sequence name                 	                   Reason	Value     

OB998205                      	       N content too high	0.42      

OU395174                      	       N content too high	0.45      

OU419825                      	       N content too high	0.49      

OU421540                      	       N content too high	0.47      

OU487653                      	       N content too high	0.37      

OU489002                      	       N content too high	0.45      

Japan/DP0700/2020             	            Seq too short	22839     

USA/20x1094/2020              	       N content too high	0.37      

USA/UT-01685/2020             	       N content too high	0.43      

KEN/C800/2020                 	       N content too high	0.46      

HG996918                      	       N

# 3. Load pangolin results and join them to the metadata

## 3.1. Load pangolin results

In [5]:
import pandas as pd

# The report is read from `lineage_report.csv` inside pangolin's output directory.
pangolin_report = pangolin_out_dir.joinpath('lineage_report.csv')

# Parse it with pandas' default comma separator and show the frame as the cell output.
p_df = pd.read_csv(filepath_or_buffer=pangolin_report)
p_df

Unnamed: 0,taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,version,pangolin_version,pangoLEARN_version,pango_version,status,note
0,Wuhan/IPBCAMS-WH-04/2019,B,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,Assigned from designation hash.
1,CHN/Wuhan_IME-WH05/2019,B,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,Assigned from designation hash.
2,Switzerland/100108/2020,B.1,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,Assigned from designation hash.
3,Switzerland/100816/2020,B.1,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,Assigned from designation hash.
4,LR862442,C.35,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,Assigned from designation hash.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,OU488207,,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,fail,N_content:0.41
99996,OU490562,,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,fail,N_content:0.47
99997,OU490569,,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,fail,N_content:0.45
99998,OU490836,,,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,fail,N_content:0.31


## 3.2. Load other metadata

In [6]:
import pandas as pd

# The metadata file is tab-separated; `read_table` is `read_csv` with a tab default separator.
# NOTE(review): '?' placeholders (seen in e.g. `gisaid_epi_isl`) are kept as literal strings.
metadata_df = pd.read_table(subset_metadata_path)
metadata_df

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,divergence,nonACGTN,rare_mutations,snp_clusters,QC_missing_data,QC_mixed_sites,QC_rare_mutations,QC_snp_clusters,clock_deviation,missing_data_percent
0,OU420663,ncov,?,OU420663,ERR5939958,2020,Europe,United Kingdom,England,,...,40.0,0.0,15.0,0.0,good,good,good,good,?,0.351135
1,USA/NY-CUIMC-NP-3606/2020,ncov,?,MZ702266,,2020-12-02,North America,USA,New York,New York City,...,21.0,0.0,7.0,0.0,mediocre,good,good,good,0.0,4.417617
2,Switzerland/BL-UHB-42491849/2020,ncov,EPI_ISL_930932,OU281100,,2020-10-18,Europe,Switzerland,Basel-Land,,...,19.0,0.0,6.0,0.0,good,good,good,good,0.0,0.297629
3,Scotland/QEUH-1618558/2021,ncov,EPI_ISL_2487678,OU314699,ERR6069165,2021-06-02,Europe,United Kingdom,Scotland,,...,33.0,0.0,7.0,0.0,mediocre,good,good,good,-5.0,3.885898
4,Scotland/QEUH-16829C6/2021,ncov,EPI_ISL_2554507,OU329920,ERR6085250,2021-06-10,Europe,United Kingdom,Scotland,,...,37.0,0.0,5.0,0.0,good,good,good,good,1.0,0.404642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,USA/OH-CDC-QDX23654955/2021,ncov,EPI_ISL_1669162,MW989219,,2021-04-05,North America,USA,Ohio,,...,35.0,0.0,9.0,0.0,good,good,good,good,0.0,0.000000
99996,USA/KS-CDC-LC0014845/2021,ncov,EPI_ISL_1255089,MW704675,,2021-02-09,North America,USA,Kansas,,...,21.0,0.0,6.0,0.0,good,good,good,good,-2.0,3.394308
99997,USA/WA-S5065/2020,ncov,EPI_ISL_1114222,MW689132,,2020-11-06,North America,USA,Washington,Grays Harbor County,...,14.0,0.0,0.0,0.0,good,good,good,good,-2.0,0.006688
99998,England/MILK-10AE881/2021,ncov,EPI_ISL_993691,OD978562,ERR5303908,2021-01-17,Europe,United Kingdom,England,,...,32.0,0.0,7.0,0.0,good,good,good,good,0.0,1.595158


In [7]:
# List every metadata column name, since the frame display above elides the middle columns.
# `strain` is the key used for the join below; note the metadata already carries its own
# `pango_lineage` column, distinct from the `lineage` column in the pangolin report.
metadata_df.columns

Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession',
       'sra_accession', 'date', 'region', 'country', 'division', 'location',
       'region_exposure', 'country_exposure', 'division_exposure', 'segment',
       'length', 'host', 'age', 'sex', 'Nextstrain_clade', 'pango_lineage',
       'GISAID_clade', 'originating_lab', 'submitting_lab', 'authors', 'url',
       'title', 'paper_url', 'date_submitted', 'sampling_strategy',
       'missing_data', 'divergence', 'nonACGTN', 'rare_mutations',
       'snp_clusters', 'QC_missing_data', 'QC_mixed_sites',
       'QC_rare_mutations', 'QC_snp_clusters', 'clock_deviation',
       'missing_data_percent'],
      dtype='object')

## 3.3. Join to pangolin results

In [8]:
import warnings

# Join each metadata row to its pangolin result: the metadata `strain` must equal the
# pangolin `taxon`. `how='inner'` is pandas' default, spelled out here because it means
# strains without a pangolin row are dropped and duplicated keys multiply rows.
metadata_pangolin_df = metadata_df.merge(
    p_df,
    how='inner',
    left_on='strain',
    right_on='taxon',
)

# Surface (without failing) any change in row count, so that silently lost or duplicated
# samples are noticed before the joined table is written out.
if len(metadata_pangolin_df) != len(metadata_df):
    warnings.warn(
        f'Join changed the number of rows: {len(metadata_df)} metadata rows -> '
        f'{len(metadata_pangolin_df)} joined rows '
        '(unmatched or duplicated strain/taxon names)'
    )

metadata_pangolin_df

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,version,pangolin_version,pangoLEARN_version,pango_version,status,note
0,OU420663,ncov,?,OU420663,ERR5939958,2020,Europe,United Kingdom,England,,...,1.000000,Alpha (B.1.1.7-like),1.0000,0.0000,PLEARN-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,scorpio call: Alt alleles 23; Ref alleles 0; A...
1,USA/NY-CUIMC-NP-3606/2020,ncov,?,MZ702266,,2020-12-02,North America,USA,New York,New York City,...,0.950970,,,,PLEARN-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,
2,Switzerland/BL-UHB-42491849/2020,ncov,EPI_ISL_930932,OU281100,,2020-10-18,Europe,Switzerland,Basel-Land,,...,,,,,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,Assigned from designation hash.
3,Scotland/QEUH-1618558/2021,ncov,EPI_ISL_2487678,OU314699,ERR6069165,2021-06-02,Europe,United Kingdom,Scotland,,...,0.967377,Alpha (B.1.1.7-like),0.8261,0.0435,PLEARN-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,scorpio call: Alt alleles 19; Ref alleles 1; A...
4,Scotland/QEUH-16829C6/2021,ncov,EPI_ISL_2554507,OU329920,ERR6085250,2021-06-10,Europe,United Kingdom,Scotland,,...,,Delta (B.1.617.2-like),1.0000,0.0000,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,scorpio call: Alt alleles 13; Ref alleles 0; A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,USA/OH-CDC-QDX23654955/2021,ncov,EPI_ISL_1669162,MW989219,,2021-04-05,North America,USA,Ohio,,...,1.000000,Alpha (B.1.1.7-like),0.9565,0.0435,PLEARN-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,scorpio call: Alt alleles 22; Ref alleles 1; A...
99996,USA/KS-CDC-LC0014845/2021,ncov,EPI_ISL_1255089,MW704675,,2021-02-09,North America,USA,Kansas,,...,0.969668,,,,PLEARN-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,
99997,USA/WA-S5065/2020,ncov,EPI_ISL_1114222,MW689132,,2020-11-06,North America,USA,Washington,Grays Harbor County,...,0.999926,,,,PLEARN-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,
99998,England/MILK-10AE881/2021,ncov,EPI_ISL_993691,OD978562,ERR5303908,2021-01-17,Europe,United Kingdom,England,,...,,Alpha (B.1.1.7-like),0.9565,0.0435,PANGO-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,scorpio call: Alt alleles 22; Ref alleles 1; A...


# 4. Save joined metadata to file

In [9]:
# Write the joined table as tab-separated text, leaving out the pandas row index.
metadata_pangolin_df.to_csv(
    path_or_buf=output_metadata_pangolin_path,
    index=False,
    sep='\t',
)

# Confirm where the file was written.
print(f'Saved metadata and pangolin to {output_metadata_pangolin_path}')

Saved metadata and pangolin to input/metadata-subsample-pangolin.tsv
