# Load metadata

In [None]:
import pandas as pd

# Load the tab-separated metadata table and show it as the cell output.
df = pd.read_csv(
    filepath_or_buffer='input/metadata.tsv.gz',
    sep='\t',
)
df

In [None]:
# Show the column labels of the metadata table.
df.columns

## Look for duplicates in table

In [None]:
# Compare the number of rows with the number of distinct 'strain' values: the
# two are equal only when every row carries a different, non-missing strain.
# `nunique()` is used rather than `count`, because `count` merely tallies
# non-null entries and would report "no duplicates" even for repeated strains.
len_all = len(df)
len_no_duplicates = df['strain'].nunique()
print(f'len_all={len_all} == len_no_duplicates={len_no_duplicates}: {len_all == len_no_duplicates}')

# Extract sequences to separate files and prepare dataframe

In [None]:
from Bio import SeqIO
from pathlib import Path
from os import mkdir
import lzma
import gzip
import shutil
import time

# Number of metadata rows; used only as the denominator of the progress report.
total = len(df)

in_file = Path('input/sequences.fasta.xz')
out_dir = Path('input/split')

# Start from an empty output directory so files from a previous run do not linger.
if out_dir.exists():
    shutil.rmtree(out_dir)
out_dir.mkdir(parents=True)

# Print a progress line once every `print_on` records.
print_on = 2000

time_before = time.time()
count = 0
# One row per sequence: [sample name, absolute path of its FASTA file, NA, NA].
input_file_data = []
with lzma.open(in_file, 'rt') as ih:
    for record in SeqIO.parse(ih, 'fasta'):
        if count % print_on == 0:
            # An empty metadata table would otherwise raise ZeroDivisionError
            # on the very first record.
            percent = (count / total) * 100 if total else 0.0
            print(f'{percent:0.1f}% ({count}/{total})')

        # '/' is a path separator, so it is replaced to get a usable file name.
        # NOTE: two ids that become identical after this replacement would be
        # written to the same file, the later one overwriting the earlier.
        cleaned_name = record.id.replace('/', '__')
        out_file_path = (out_dir / f'{cleaned_name}.fasta.gz').absolute()
        name = cleaned_name
        input_file_data.append([name, str(out_file_path), pd.NA, pd.NA])
        with gzip.open(out_file_path, "wt") as oh:
            SeqIO.write(record, oh, "fasta")

        count += 1

input_file_df = pd.DataFrame(input_file_data, columns=['Sample', 'Assemblies', 'Reads1', 'Reads2'])
time_after = time.time()
print(f'Finished writing files to {out_dir}.')
print(f'Took {(time_after - time_before)/60:0.1f} minutes')
input_file_df.head(5)

# Save dataframe to file

In [None]:
import numpy as np
import math
from os import remove

# Destination of the tab-separated table listing every split sequence file.
input_files_file = Path('input/input-files.tsv')

# Remove a copy left over from a previous run before writing the new one.
if input_files_file.exists():
    remove(input_files_file)

input_file_df.to_csv(path_or_buf=input_files_file, index=False, sep='\t')

print(f'Wrote input files to {input_files_file}')

# ~Split dataframe into chunks~

In [None]:
# import numpy as np
# import math

# input_files_file_base_path = Path('input/input-list')

# if input_files_file_base_path.exists():
#     shutil.rmtree(input_files_file_base_path)
    
# if not input_files_file_base_path.exists():
#     mkdir(input_files_file_base_path)
    
# max_samples_per_chunk = 100
# number_chunks = int(math.ceil(len(input_file_df) / max_samples_per_chunk))

# count = 0
# for input_file_chunk_df in np.array_split(input_file_df, number_chunks):
#     input_files_file = input_files_file_base_path / f'input_{count}.tsv'
    
#     input_file_chunk_df.to_csv(input_files_file, sep='\t', index=False)
#     count = count + 1
    
# print(f'Split list of files into {number_chunks} chunks files written to {input_files_file_base_path}')