# 1. Parameters

In [1]:
# Inputs: tab-separated metadata table and the xz-compressed FASTA of sequences.
metadata_file = "input/metadata.tsv.gz"
input_sequences = "input/sequences.fasta.xz"

# Outputs: directory for per-sample FASTA files plus the two TSV tables written below.
sequences_split_dir = "input/split"
subset_input_files_path = "input/input-files.tsv"
subset_metadata_path = "input/metadata-subsample.tsv"

# Subsampling: number of metadata rows to draw and the seed that makes the draw repeatable.
subsample_number = 10_000
sample_seed = 21561

In [2]:
from pathlib import Path

# Turn every path-like parameter into a pathlib.Path in one pass so later
# cells can rely on Path methods (exists(), the `/` operator, ...).
(
    metadata_file,
    input_sequences,
    sequences_split_dir,
    subset_input_files_path,
    subset_metadata_path,
) = map(
    Path,
    (
        metadata_file,
        input_sequences,
        sequences_split_dir,
        subset_input_files_path,
        subset_metadata_path,
    ),
)

# 2. Load metadata

In [3]:
import pandas as pd

# Read the complete tab-separated metadata table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the recorded output of this cell looks like the tail of a
# pandas DtypeWarning -- confirm it disappears with this option.
metadata_all_df = pd.read_csv(metadata_file, sep='\t', low_memory=False)
metadata_all_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,missing_data,divergence,nonACGTN,rare_mutations,snp_clusters,QC_missing_data,QC_mixed_sites,QC_rare_mutations,QC_snp_clusters,clock_deviation
0,ARG/Cordoba-1006-155/2020,ncov,?,MW553298,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
1,ARG/Cordoba-1083-6/2020,ncov,?,MW553296,,2020-06-04,South America,Argentina,Argentina,,...,0.0,14.0,0.0,9.0,0.0,good,good,good,good,6.0
2,ARG/Cordoba-11300-61/2020,ncov,?,MW553301,,2020-06-04,South America,Argentina,Argentina,,...,392.0,11.0,0.0,6.0,0.0,good,good,good,good,3.0
3,ARG/Cordoba-11301-61/2020,ncov,?,MW553314,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
4,ARG/Cordoba-11419-61/2020,ncov,?,MW553297,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095212,mink/NED/NB02_index/2020,ncov,EPI_ISL_447631,MT457398,,2020-04-25,Europe,Netherlands,Netherlands,,...,0.0,12.0,0.0,11.0,0.0,good,good,good,good,7.0
1095213,mink/NED/NB03_index/2020,ncov,EPI_ISL_447633,MT457400,,2020-05-06,Europe,Netherlands,Netherlands,,...,0.0,13.0,0.0,9.0,0.0,good,good,good,good,6.0
1095214,mink/NED/NB04_index/2020,ncov,EPI_ISL_447634,MT457401,,2020-05-06,Europe,Netherlands,Netherlands,,...,0.0,15.0,0.0,12.0,0.0,good,good,good,good,8.0
1095215,mink/NLD/1/2020,ncov,EPI_ISL_431778,MT396266,,2020-04-24,Europe,Netherlands,North Brabant,Milheeze,...,0.0,9.0,0.0,5.0,0.0,good,good,good,good,3.0


## 2.1. Look for duplicates in table

In [4]:
# Compare the total number of rows with the number of *distinct* strain names.
# nunique() is required here: 'count' only counts non-null entries, so it would
# equal the row count even when strain names repeat. dropna=False keeps missing
# names in the tally so that several rows without a strain also show up as
# duplicates.
len_all = len(metadata_all_df)
len_no_duplicates = metadata_all_df['strain'].nunique(dropna=False)
if len_all == len_no_duplicates:
    print(f'No duplicates: len_all={len_all} == len_no_duplicates={len_no_duplicates}')
else:
    print(f'Some duplicates found: len_all={len_all} != len_no_duplicates={len_no_duplicates}')

No duplicates: len_all=1095217 == len_no_duplicates=1095217


## 2.2. Subsample dataframe

In [5]:
# Draw a reproducible random subset of metadata rows.
# The request is capped at the table size: sampling without replacement raises
# ValueError when asked for more rows than exist, which would break the
# notebook on inputs smaller than subsample_number.
sample_size = min(subsample_number, len(metadata_all_df))
metadata_df = metadata_all_df.sample(n=sample_size, random_state=sample_seed)
metadata_df

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,missing_data,divergence,nonACGTN,rare_mutations,snp_clusters,QC_missing_data,QC_mixed_sites,QC_rare_mutations,QC_snp_clusters,clock_deviation
366798,England/QEUH-9F48A4/2020,ncov,EPI_ISL_588307,OA992707,ERR4688590,2020-09-25,Europe,United Kingdom,England,,...,121.0,15.0,1.0,1.0,0.0,good,good,good,good,0.0
1028608,USA/UT-CDC-QDX26118077/2021,ncov,EPI_ISL_2873924,MZ534972,,2021-06-22,North America,USA,Utah,,...,0.0,37.0,0.0,5.0,0.0,good,good,good,good,1.0
805271,USA/IA-CDC-2-4503024/2021,ncov,EPI_ISL_2709554,MZ453424,,2021-05-25,North America,USA,Iowa,,...,1.0,37.0,0.0,11.0,0.0,good,good,good,good,-1.0
498317,OU163868,ncov,?,OU163868,,2021-05-04,Europe,Germany,Rheinland-Pfalz,,...,0.0,38.0,0.0,9.0,0.0,good,good,good,good,0.0
368548,England/QEUH-A5E063/2020,ncov,EPI_ISL_597431,OA995643,ERR4763980,2020-10-12,Europe,United Kingdom,England,,...,121.0,16.0,0.0,1.0,0.0,good,good,good,good,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936441,USA/NJ-CDC-QDX24758678/2021,ncov,EPI_ISL_2369287,MZ319327,,2021-05-07,North America,USA,New Jersey,,...,0.0,41.0,0.0,14.0,0.0,good,good,good,good,3.0
1053482,USA/VT-MASPHL-02207/2021,ncov,?,MW893851,,2021-02-10,North America,USA,USA,,...,220.0,21.0,0.0,8.0,0.0,good,good,good,good,-2.0
90975,England/ALDP-16967D4/2021,ncov,EPI_ISL_2605675,OU337161,ERR6105992,2021-06-08,Europe,United Kingdom,England,,...,121.0,40.0,0.0,16.0,0.0,good,good,mediocre,good,0.0
1020255,USA/TX-DSHS-2441/2020,ncov,?,MW724764,SRR13660100,2020-06-24,North America,USA,Texas,,...,3695.0,11.0,0.0,0.0,0.0,bad,good,good,good,0.0


# 3. Extract sequences to separate files and prepare dataframe

In [6]:
from Bio import SeqIO
from pathlib import Path
from os import mkdir
import lzma
import gzip
import shutil
import time

# Row count of the full metadata table; used only as the denominator of the
# progress percentage printed while scanning the FASTA file.
total = len(metadata_all_df)

# Start from an empty output directory so files from an earlier run cannot linger.
if sequences_split_dir.exists():
    shutil.rmtree(sequences_split_dir)

# parents=True: the directory is a notebook parameter and may sit below
# folders that do not exist yet (os.mkdir would fail in that case).
sequences_split_dir.mkdir(parents=True, exist_ok=True)

# Strain names of the subsample, as a set for fast membership tests in the loop.
included_samples = set(metadata_df['strain'].tolist())

# Print a progress line every `print_on` records.
print_on = 20000

time_before = time.time()
count = 0
input_file_data = []
with lzma.open(input_sequences, 'tr') as ih:
    for record in SeqIO.parse(ih, 'fasta'):
        if count % print_on == 0:
            percent = (count/total) * 100
            print(f'{percent:0.1f}% ({count}/{total})')

        name = record.id
        if name in included_samples:
            # '/' is a path separator, so it is replaced before the strain
            # name is used as a file name.
            cleaned_name = name.replace('/', '__')
            out_file_path = (sequences_split_dir / f'{cleaned_name}.fasta.gz').absolute()
            # Row layout matches the columns given to the DataFrame below;
            # the two read columns are left empty (pd.NA).
            input_file_data.append([name, str(out_file_path), pd.NA, pd.NA])
            with gzip.open(out_file_path, "wt") as oh:
                SeqIO.write(record, oh, "fasta")

        count += 1

input_file_df = pd.DataFrame(input_file_data, columns=['Sample', 'Assemblies', 'Reads1', 'Reads2'])
time_after = time.time()
print(f'Finished writing files to {sequences_split_dir}.')
print(f'Took {(time_after - time_before)/60:0.1f} minutes')

# Surface sampled strains that never appeared in the FASTA file; otherwise the
# input-file table would silently contain fewer samples than the metadata subset.
missing_samples = included_samples - {row[0] for row in input_file_data}
if missing_samples:
    print(f'Warning: {len(missing_samples)} sampled strains were not found in {input_sequences}')

input_file_df.head(5)

0.0% (0/1095217)
1.8% (20000/1095217)
3.7% (40000/1095217)
5.5% (60000/1095217)
7.3% (80000/1095217)
9.1% (100000/1095217)
11.0% (120000/1095217)
12.8% (140000/1095217)
14.6% (160000/1095217)
16.4% (180000/1095217)
18.3% (200000/1095217)
20.1% (220000/1095217)
21.9% (240000/1095217)
23.7% (260000/1095217)
25.6% (280000/1095217)
27.4% (300000/1095217)
29.2% (320000/1095217)
31.0% (340000/1095217)
32.9% (360000/1095217)
34.7% (380000/1095217)
36.5% (400000/1095217)
38.3% (420000/1095217)
40.2% (440000/1095217)
42.0% (460000/1095217)
43.8% (480000/1095217)
45.7% (500000/1095217)
47.5% (520000/1095217)
49.3% (540000/1095217)
51.1% (560000/1095217)
53.0% (580000/1095217)
54.8% (600000/1095217)
56.6% (620000/1095217)
58.4% (640000/1095217)
60.3% (660000/1095217)
62.1% (680000/1095217)
63.9% (700000/1095217)
65.7% (720000/1095217)
67.6% (740000/1095217)
69.4% (760000/1095217)
71.2% (780000/1095217)
73.0% (800000/1095217)
74.9% (820000/1095217)
76.7% (840000/1095217)
78.5% (860000/1095217)
80.

Unnamed: 0,Sample,Assemblies,Reads1,Reads2
0,Switzerland/100835/2020,/home/CSCScience.ca/apetkau/workspace/genomics...,,
1,Switzerland/180011/2020,/home/CSCScience.ca/apetkau/workspace/genomics...,,
2,LR862464,/home/CSCScience.ca/apetkau/workspace/genomics...,,
3,LR882095,/home/CSCScience.ca/apetkau/workspace/genomics...,,
4,LR882100,/home/CSCScience.ca/apetkau/workspace/genomics...,,


## 3.1. Save input dataframe to file

In [7]:
import numpy as np
import math
from os import remove

# Delete any previous copy of the table, then write the sample -> file mapping
# as a tab-separated file without the DataFrame index.
if subset_input_files_path.exists():
    subset_input_files_path.unlink()

input_file_df.to_csv(subset_input_files_path, index=False, sep='\t')

print(f'Wrote input files to {subset_input_files_path}')

Wrote input files to input/input-files.tsv


## 3.2. Save subset metadata to file

In [8]:
# Write the subsampled metadata rows as a tab-separated file (index omitted).
metadata_df.to_csv(subset_metadata_path, index=False, sep='\t')

print(f'Wrote subset metadata to {subset_metadata_path}')

Wrote subset metadata to input/metadata-subsample.tsv
