# Load metadata

In [1]:
import pandas as pd

# Load the tab-separated metadata table; pandas infers gzip compression from
# the '.gz' suffix. low_memory=False makes the parser infer each column's
# dtype from the whole file instead of chunk by chunk, so a column cannot end
# up holding a mix of types (the situation pandas reports as a DtypeWarning).
df = pd.read_csv('input/metadata.tsv.gz', sep='\t', low_memory=False)
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,missing_data,divergence,nonACGTN,rare_mutations,snp_clusters,QC_missing_data,QC_mixed_sites,QC_rare_mutations,QC_snp_clusters,clock_deviation
0,ARG/Cordoba-1006-155/2020,ncov,?,MW553298,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
1,ARG/Cordoba-1083-6/2020,ncov,?,MW553296,,2020-06-04,South America,Argentina,Argentina,,...,0.0,14.0,0.0,9.0,0.0,good,good,good,good,6.0
2,ARG/Cordoba-11300-61/2020,ncov,?,MW553301,,2020-06-04,South America,Argentina,Argentina,,...,392.0,11.0,0.0,6.0,0.0,good,good,good,good,3.0
3,ARG/Cordoba-11301-61/2020,ncov,?,MW553314,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
4,ARG/Cordoba-11419-61/2020,ncov,?,MW553297,,2020-06-04,South America,Argentina,Argentina,,...,0.0,13.0,0.0,8.0,0.0,good,good,good,good,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095212,mink/NED/NB02_index/2020,ncov,EPI_ISL_447631,MT457398,,2020-04-25,Europe,Netherlands,Netherlands,,...,0.0,12.0,0.0,11.0,0.0,good,good,good,good,7.0
1095213,mink/NED/NB03_index/2020,ncov,EPI_ISL_447633,MT457400,,2020-05-06,Europe,Netherlands,Netherlands,,...,0.0,13.0,0.0,9.0,0.0,good,good,good,good,6.0
1095214,mink/NED/NB04_index/2020,ncov,EPI_ISL_447634,MT457401,,2020-05-06,Europe,Netherlands,Netherlands,,...,0.0,15.0,0.0,12.0,0.0,good,good,good,good,8.0
1095215,mink/NLD/1/2020,ncov,EPI_ISL_431778,MT396266,,2020-04-24,Europe,Netherlands,North Brabant,Milheeze,...,0.0,9.0,0.0,5.0,0.0,good,good,good,good,3.0


In [2]:
# Show every column label of the metadata table; the frame preview above
# elides the middle columns with '...'.
df.columns

Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession',
       'sra_accession', 'date', 'region', 'country', 'division', 'location',
       'region_exposure', 'country_exposure', 'division_exposure', 'segment',
       'length', 'host', 'age', 'sex', 'Nextstrain_clade', 'pango_lineage',
       'GISAID_clade', 'originating_lab', 'submitting_lab', 'authors', 'url',
       'title', 'paper_url', 'date_submitted', 'sampling_strategy',
       'missing_data', 'divergence', 'nonACGTN', 'rare_mutations',
       'snp_clusters', 'QC_missing_data', 'QC_mixed_sites',
       'QC_rare_mutations', 'QC_snp_clusters', 'clock_deviation'],
      dtype='object')

## Check for duplicate strain names in the metadata table

In [3]:
# Compare the number of rows with the number of distinct strain names.
# `count` only tallies non-null entries, so it equals len(df) whenever no
# strain is missing, even if names repeat. `nunique` counts distinct values,
# which is what a duplicate check needs. dropna=False treats missing names as
# one value, so several rows without a strain also show up as duplicates.
len_all = len(df)
len_no_duplicates = df['strain'].nunique(dropna=False)
print(f'len_all={len_all} == len_no_duplicates={len_no_duplicates}: {len_all == len_no_duplicates}')

len_all=1095217 == len_no_duplicates=1095217: True


# Extract sequences to separate files and prepare dataframe

In [4]:
from Bio import SeqIO
from pathlib import Path
from os import mkdir
import lzma
import gzip
import shutil
import time

# Split the combined, xz-compressed FASTA into one gzip-compressed FASTA file
# per record and build a table that lists each sample with its file path.

# Number of metadata rows; used only as the denominator of the progress report.
total = len(df)

in_file = Path('input/sequences.fasta.xz')
out_dir = Path('input/split')

# Start from an empty output directory so files from an earlier run cannot linger.
if out_dir.exists():
    shutil.rmtree(out_dir)
out_dir.mkdir(parents=True)

# Report progress once every `print_on` records.
print_on = 2000

# perf_counter is monotonic, so the elapsed time is unaffected by clock changes.
time_before = time.perf_counter()
count = 0
input_file_data = []
seen_names = set()
duplicate_names = []
with lzma.open(in_file, 'rt') as ih:
    for record in SeqIO.parse(ih, 'fasta'):
        if count % print_on == 0:
            # An empty metadata table (total == 0) must not abort the split.
            percent = (count / total) * 100 if total else 0.0
            print(f'{percent:0.1f}% ({count}/{total})')

        # Record ids contain '/', which would be read as a path separator,
        # so it is replaced before the id is used as a file name.
        cleaned_name = record.id.replace('/', '__')
        if cleaned_name in seen_names:
            # The file written below replaces the earlier one with this name.
            duplicate_names.append(cleaned_name)
        seen_names.add(cleaned_name)

        out_file_path = (out_dir / f'{cleaned_name}.fasta.gz').absolute()
        # The cleaned name (not the raw record id) is the sample name.
        # Reads1/Reads2 are left as missing values.
        input_file_data.append([cleaned_name, str(out_file_path), pd.NA, pd.NA])
        with gzip.open(out_file_path, 'wt') as oh:
            SeqIO.write(record, oh, 'fasta')

        count += 1

input_file_df = pd.DataFrame(input_file_data, columns=['Sample', 'Assemblies', 'Reads1', 'Reads2'])
time_after = time.perf_counter()
print(f'Finished writing files to {out_dir}.')
print(f'Took {(time_after - time_before)/60:0.1f} minutes')
if duplicate_names:
    # Non-fatal: the table and files are produced exactly as before, but the
    # reader is told that some rows point at the same (overwritten) file.
    print(f'WARNING: {len(duplicate_names)} record(s) share a sample name with an earlier record '
          f'and overwrote its file, e.g. {duplicate_names[:5]}')
input_file_df.head(5)

0.0% (0/1095217)
0.2% (2000/1095217)
0.4% (4000/1095217)
0.5% (6000/1095217)
0.7% (8000/1095217)
0.9% (10000/1095217)
1.1% (12000/1095217)
1.3% (14000/1095217)
1.5% (16000/1095217)
1.6% (18000/1095217)
1.8% (20000/1095217)
2.0% (22000/1095217)
2.2% (24000/1095217)
2.4% (26000/1095217)
2.6% (28000/1095217)
2.7% (30000/1095217)
2.9% (32000/1095217)
3.1% (34000/1095217)
3.3% (36000/1095217)
3.5% (38000/1095217)
3.7% (40000/1095217)
3.8% (42000/1095217)
4.0% (44000/1095217)
4.2% (46000/1095217)
4.4% (48000/1095217)
4.6% (50000/1095217)
4.7% (52000/1095217)
4.9% (54000/1095217)
5.1% (56000/1095217)
5.3% (58000/1095217)
5.5% (60000/1095217)
5.7% (62000/1095217)
5.8% (64000/1095217)
6.0% (66000/1095217)
6.2% (68000/1095217)
6.4% (70000/1095217)
6.6% (72000/1095217)
6.8% (74000/1095217)
6.9% (76000/1095217)
7.1% (78000/1095217)
7.3% (80000/1095217)
7.5% (82000/1095217)
7.7% (84000/1095217)
7.9% (86000/1095217)
8.0% (88000/1095217)
8.2% (90000/1095217)
8.4% (92000/1095217)
8.6% (94000/1095217)


Unnamed: 0,Sample,Assemblies,Reads1,Reads2
0,Wuhan-Hu-1__2019,/home/CSCScience.ca/apetkau/workspace/genomics...,,
1,Wuhan__IPBCAMS-WH-01__2019,/home/CSCScience.ca/apetkau/workspace/genomics...,,
2,Wuhan__WH04__2020,/home/CSCScience.ca/apetkau/workspace/genomics...,,
3,Wuhan__WH01__2019,/home/CSCScience.ca/apetkau/workspace/genomics...,,
4,Wuhan__WIV02__2019,/home/CSCScience.ca/apetkau/workspace/genomics...,,


# Save dataframe to file

In [5]:
import numpy as np
import math
from os import remove

# Persist the sample/file table as a tab-separated file without the index
# column, deleting any copy already present at that path first.
input_files_file = Path('input/input-files.tsv')

stale_copy_present = input_files_file.exists()
if stale_copy_present:
    input_files_file.unlink()

input_file_df.to_csv(
    input_files_file,
    sep='\t',
    index=False,
)

print(f'Wrote input files to {input_files_file}')

Wrote input files to input/input-files.tsv


# ~~Split dataframe into chunks~~ (disabled)

In [6]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# If re-enabled, it would recreate 'input/input-list' and write the sample
# table there in chunks of roughly `max_samples_per_chunk` rows, one
# 'input_<n>.tsv' per chunk. It would also overwrite the notebook-level names
# `count` and `input_files_file` set by earlier cells.

# import numpy as np
# import math

# input_files_file_base_path = Path('input/input-list')

# if input_files_file_base_path.exists():
#     shutil.rmtree(input_files_file_base_path)
    
# if not input_files_file_base_path.exists():
#     mkdir(input_files_file_base_path)
    
# max_samples_per_chunk = 100
# number_chunks = int(math.ceil(len(input_file_df) / max_samples_per_chunk))

# count = 0
# for input_file_chunk_df in np.array_split(input_file_df, number_chunks):
#     input_files_file = input_files_file_base_path / f'input_{count}.tsv'
    
#     input_file_chunk_df.to_csv(input_files_file, sep='\t', index=False)
#     count = count + 1
    
# print(f'Split list of files into {number_chunks} chunks files written to {input_files_file_base_path}')