# Load metadata

In [1]:
import pandas as pd

# Load the tab-separated metadata table (pandas infers gzip compression from the
# '.gz' extension).
# low_memory=False makes pandas infer each column's type from the whole file
# instead of chunk by chunk; chunked inference is what produces a mixed-type
# DtypeWarning (the warning fragment in this cell's recorded output looks like
# the tail of one — re-run to confirm it is gone).
df = pd.read_csv('input/metadata.tsv.gz', sep='\t', low_memory=False)
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,missing_data,divergence,nonACGTN,rare_mutations,snp_clusters,QC_missing_data,QC_mixed_sites,QC_rare_mutations,QC_snp_clusters,clock_deviation
0,ARG/Cordoba-1006-155/2020,ncov,?,MW553298,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
1,ARG/Cordoba-1083-6/2020,ncov,?,MW553296,,2020-06-04,South America,Argentina,Argentina,,...,0.0,14.0,0.0,9.0,0.0,good,good,good,good,6.0
2,ARG/Cordoba-11300-61/2020,ncov,?,MW553301,,2020-06-04,South America,Argentina,Argentina,,...,392.0,11.0,0.0,6.0,0.0,good,good,good,good,3.0
3,ARG/Cordoba-11301-61/2020,ncov,?,MW553314,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
4,ARG/Cordoba-11419-61/2020,ncov,?,MW553297,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095212,mink/NED/NB02_index/2020,ncov,EPI_ISL_447631,MT457398,,2020-04-25,Europe,Netherlands,Netherlands,,...,0.0,12.0,0.0,11.0,0.0,good,good,good,good,7.0
1095213,mink/NED/NB03_index/2020,ncov,EPI_ISL_447633,MT457400,,2020-05-06,Europe,Netherlands,Netherlands,,...,0.0,13.0,0.0,9.0,0.0,good,good,good,good,6.0
1095214,mink/NED/NB04_index/2020,ncov,EPI_ISL_447634,MT457401,,2020-05-06,Europe,Netherlands,Netherlands,,...,0.0,15.0,0.0,12.0,0.0,good,good,good,good,8.0
1095215,mink/NLD/1/2020,ncov,EPI_ISL_431778,MT396266,,2020-04-24,Europe,Netherlands,North Brabant,Milheeze,...,0.0,9.0,0.0,5.0,0.0,good,good,good,good,3.0


In [2]:
# Show the complete list of column names; the table preview of `df` elides the
# middle columns with '...'.
df.columns

Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession',
       'sra_accession', 'date', 'region', 'country', 'division', 'location',
       'region_exposure', 'country_exposure', 'division_exposure', 'segment',
       'length', 'host', 'age', 'sex', 'Nextstrain_clade', 'pango_lineage',
       'GISAID_clade', 'originating_lab', 'submitting_lab', 'authors', 'url',
       'title', 'paper_url', 'date_submitted', 'sampling_strategy',
       'missing_data', 'divergence', 'nonACGTN', 'rare_mutations',
       'snp_clusters', 'QC_missing_data', 'QC_mixed_sites',
       'QC_rare_mutations', 'QC_snp_clusters', 'clock_deviation'],
      dtype='object')

## Look for duplicates in table

In [12]:
# Compare the total number of rows with the number of distinct strain names.
# nunique() counts distinct values, so any repeated strain makes the two numbers
# differ. (count() only tallies non-missing entries and therefore can never
# reveal a duplicate.) dropna=False treats a missing name as a value of its own
# rather than silently ignoring it.
len_all = len(df)
len_no_duplicates = df['strain'].nunique(dropna=False)
print(f'len_all={len_all} == len_no_duplicates={len_no_duplicates}: {len_all == len_no_duplicates}')

len_all=1095217 == len_no_duplicates=1095217: True


# Extract sequences to separate files and prepare dataframe

In [22]:
from Bio import SeqIO
from pathlib import Path
from os import mkdir
import lzma
import gzip
import shutil

# Split the combined FASTA file into one gzip-compressed FASTA file per record
# and build a table (Sample, Assemblies, Reads1, Reads2) listing every file written.

total = len(df)  # number of metadata rows; used only as the progress denominator

in_file = Path('input/sequences.fasta.xz')
out_dir = Path('input/split')
input_file_file = 'input-files.tsv.gz'

# Start from an empty output directory so files from an earlier run are not
# mixed in with this one. Path.mkdir(parents=True) also creates any missing
# parent directory, which os.mkdir cannot do.
if out_dir.exists():
    shutil.rmtree(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)

print_on = 150000  # print a progress line every `print_on` records
file_limit = 100   # stop after this many records; set to None to process every record

count = 0
input_file_data = []
with lzma.open(in_file, 'rt') as ih:
    for record in SeqIO.parse(ih, 'fasta'):
        if count % print_on == 0:
            # Guard against an empty metadata table (total == 0).
            percent = (count/total) * 100 if total else 0.0
            print(f'{percent:0.1f}% ({count}/{total})')

        if file_limit is not None and count >= file_limit:
            break

        # '/' in a record id would be interpreted as a directory separator in
        # the output path, so replace it before building the file name.
        cleaned_name = record.id.replace('/', '__')
        out_file_path = (out_dir / f'{cleaned_name}.fasta.gz').absolute()
        # Only the assembly path is filled in; both reads columns stay missing.
        input_file_data.append([record.id, str(out_file_path), pd.NA, pd.NA])
        with gzip.open(out_file_path, "wt") as oh:
            SeqIO.write(record, oh, "fasta")

        count += 1

input_file_df = pd.DataFrame(input_file_data, columns=['Sample', 'Assemblies', 'Reads1', 'Reads2'])
input_file_df.to_csv(input_file_file, sep='\t', index=False, compression='gzip')

print(f'Finished writing files to {out_dir}. List of files written to {input_file_file}')
input_file_df

0.0% (0/1095217)
Finished writing files to input/split. List of files written to input-files.tsv.gz


Unnamed: 0,Sample,Assemblies,Reads1,Reads2
0,Wuhan-Hu-1/2019,/home/CSCScience.ca/apetkau/workspace/genomics...,,
1,Wuhan/IPBCAMS-WH-01/2019,/home/CSCScience.ca/apetkau/workspace/genomics...,,
2,Wuhan/WH04/2020,/home/CSCScience.ca/apetkau/workspace/genomics...,,
3,Wuhan/WH01/2019,/home/CSCScience.ca/apetkau/workspace/genomics...,,
4,Wuhan/WIV02/2019,/home/CSCScience.ca/apetkau/workspace/genomics...,,
...,...,...,...,...
95,Switzerland/100791/2020,/home/CSCScience.ca/apetkau/workspace/genomics...,,
96,Switzerland/100153/2020,/home/CSCScience.ca/apetkau/workspace/genomics...,,
97,Switzerland/100155/2020,/home/CSCScience.ca/apetkau/workspace/genomics...,,
98,Switzerland/100152/2020,/home/CSCScience.ca/apetkau/workspace/genomics...,,
