# Clonotype and sequence downsampling

For a variety of downstream analyses (rarefaction, repeat observation frequency, diversity estimation, etc.), we need a set of downsampled clonotype and sequence datasets for each subject. Because we obtained a different number of sequences for each subject, the downsample sizes are relative to the total size of each subject's dataset (intervals of 10% of the total dataset).

The [`abutils`](https://www.github.com/briney/abutils) Python package is required for this notebook, and can be installed by running `pip install abutils`.

*NOTE: this notebook requires several Unix command line tools, including `cat`, `shuf`, `sort` and `uniq`. It therefore needs a Unix-based operating system to run correctly (macOS and most flavors of Linux should be fine). Running this notebook on Windows 10 may be possible using the [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/about), but we have not tested this.*

In [2]:
from __future__ import print_function, division

import multiprocessing as mp
import os
import subprocess as sp
import sys
import tempfile

from abutils.utils.jobs import monitor_mp_jobs
from abutils.utils.pipeline import list_files, make_dir
from abutils.utils.progbar import progress_bar

### Subjects and directories

The input directories should contain deduplicated clonotypes or sequences. You can generate the deduplicated clonotype/sequence files using [**this**](LINK) Jupyter notebook, or you can download the deduplicated clonotype data [**here**](LINK) and the deduplicated sequence data [**here**](LINK). If downloading the data (which will be downloaded as a compressed archive), decompress the archive in the `data` directory (in the same parent directory as this notebook) and you should be ready to go. If you want to store the deduplicated clonotype or sequence data in some other location, adjust the `input_clonotype_dir` and/or `input_sequence_dir` paths below as needed.

`iterations` is the number of replicate downsamplings (without replacement) that will be performed at each subsample size. 

In [3]:
# subjects: whitespace-separated names read from the subjects file, in sorted order
with open('./data/subjects.txt') as subjects_handle:
    subjects = subjects_handle.read().split()
subjects.sort()

# number of replicate downsamplings performed at each subsample size
iterations = 10

# input directories (one sub-directory per subject is read from each of these)
input_clonotype_dir = './data/dedup_techrep-merged_vj-aa/'
input_sequence_dir = './data/dedup_techrep-merged_nt-seq/'

# output directory for the downsampling results
project_dir = './data/equal_fraction_downsampling/'
make_dir(project_dir)

### Downsampling

In [None]:
def downsample(input_files, downsample_size, temp_dir='/data/temp_dir/'):
    """
    Randomly sample lines from the concatenated input files and summarize
    how often each distinct line was drawn.

    The work is done by the pipeline ``cat | shuf -n | sort -T | uniq -c``.
    The commands are chained directly (no shell), so file paths containing
    spaces or shell metacharacters are passed through untouched.

    Args:

        input_files (list): paths of the files to sample from. Every line of
            every file is a candidate for sampling.

        downsample_size (int): number of lines to draw (``shuf -n``).

        temp_dir (str): directory handed to ``sort -T`` for its temporary
            files. If it does not exist, the system temporary directory is
            used instead.

    Returns:

        dict: maps an observation count to the number of distinct lines that
            were drawn exactly that many times, e.g. ``{1: 2, 3: 1}`` means two
            lines were drawn once and one line was drawn three times. Empty
            if there are no input files or ``downsample_size`` is not positive.

    Raises:

        RuntimeError: if any command in the pipeline exits with a non-zero
            status (the collected stderr is included in the message).
    """
    input_files = list(input_files)
    # `cat` without operands would block on stdin, and a non-positive sample
    # size cannot yield any lines, so both cases short-circuit to "no counts".
    if not input_files or downsample_size <= 0:
        return {}
    # fall back to the system default rather than letting `sort` fail when the
    # configured scratch directory is missing on this machine
    if not os.path.isdir(temp_dir):
        temp_dir = tempfile.gettempdir()
    commands = [
        ['cat'] + input_files,
        ['shuf', '-n', str(downsample_size)],
        ['sort', '-T', temp_dir],
        ['uniq', '-c'],
    ]
    procs = []
    # stderr of every stage goes to one temp file: an unread stderr pipe on an
    # intermediate stage could otherwise fill up and stall the pipeline
    with tempfile.TemporaryFile() as err:
        upstream = None
        for cmd in commands:
            proc = sp.Popen(cmd, stdin=upstream, stdout=sp.PIPE, stderr=err)
            if upstream is not None:
                # the child holds its own copy; closing ours lets the upstream
                # process see a broken pipe if the downstream one exits early
                upstream.close()
            upstream = proc.stdout
            procs.append(proc)
        stdout = procs[-1].communicate()[0]
        failed = [' '.join(cmd) for cmd, proc in zip(commands, procs) if proc.wait() != 0]
        if failed:
            err.seek(0)
            message = err.read().decode('utf-8', 'replace').strip()
            raise RuntimeError('downsampling pipeline failed ({}): {}'.format('; '.join(failed), message))
    # parse the raw bytes directly: `uniq -c` emits "<count> <line>", and int()
    # accepts the bytes count field on Python 3 as well as the str on Python 2
    counts = {}
    for line in stdout.split(b'\n'):
        fields = line.split()
        if not fields:
            continue
        count = int(fields[0])
        counts[count] = counts.get(count, 0) + 1
    return counts

def get_combined_sequence_count(files, chunk_size=1024 * 1024):
    """
    Count the total number of lines across all of the given files.

    Equivalent to ``cat <files> | wc -l`` (i.e. the number of newline
    characters), but done in Python so that it does not depend on a shell,
    is not affected by spaces or shell metacharacters in file paths, and does
    not block waiting on stdin when the file list is empty.

    Args:

        files (list): paths of the files to count.

        chunk_size (int): number of bytes read per chunk. Files are streamed
            so that large inputs are never held in memory at once.

    Returns:

        int: combined newline count of all files (``0`` for an empty list).
            As with ``wc -l``, a final line without a trailing newline is
            not counted.

    Raises:

        IOError / OSError: if one of the files cannot be opened or read.
    """
    total = 0
    for path in files:
        with open(path, 'rb') as handle:
            # iter() with a sentinel keeps reading until read() returns b'' (EOF)
            for chunk in iter(lambda: handle.read(chunk_size), b''):
                total += chunk.count(b'\n')
    return total

### Clonotypes

In [None]:
# `np.arange` generates the subsample fractions below; numpy is not part of the
# notebook's import cell, so import it here to keep this cell runnable on its own
import numpy as np

# Output format of the counts file:
#   #<subject>                 one header line per subject
#   ><fraction>                one header line per subsample fraction
#   <count>:<n> <count>:<n>    one line per iteration, sorted by count

# clear the counts file
counts_file = os.path.join(project_dir, 'clonotype-downsampling_duplicate-counts_vj-aa.txt')
with open(counts_file, 'w') as f:
    f.write('')

# initialize multiprocessing
p = mp.Pool(maxtasksperchild=1)

# the try/finally makes sure the worker pool is shut down even if a subject fails
try:
    for subject in subjects:
        # print header
        print('=' * (len(subject) + 4))
        print('  ' + subject)
        print('=' * (len(subject) + 4))
        print('')
        with open(counts_file, 'a') as f:
            f.write('#{}\n'.format(subject))

        # get the combined number of unique clonotype sequences
        input_files = list_files(os.path.join(input_clonotype_dir, subject))
        total_size = get_combined_sequence_count(input_files)

        # process the iterations of each downsample size in parallel
        for fraction in np.arange(0.1, 1.01, 0.1):
            downsample_size = int(round(float(total_size) * fraction, 0))
            with open(counts_file, 'a') as f:
                f.write('>{}\n'.format(fraction))
            print('Subsample:', fraction, '({})'.format(downsample_size))
            async_results = [p.apply_async(downsample, args=(input_files, downsample_size))
                             for _ in range(iterations)]
            monitor_mp_jobs(async_results)
            results = [ar.get() for ar in async_results]
            # write all iterations for this fraction with a single open/close
            with open(counts_file, 'a') as f:
                for counts in results:
                    ostring = ' '.join(['{}:{}'.format(k, counts[k]) for k in sorted(counts.keys())])
                    f.write('{}\n'.format(ostring))
            print('')
        print('\n')
finally:
    p.close()
    p.join()

### Sequences

In [None]:
# `np.arange` generates the subsample fractions below; numpy is not part of the
# notebook's import cell, so import it here to keep this cell runnable on its own
import numpy as np

# Output format of the counts file:
#   #<subject>                 one header line per subject
#   ><fraction>                one header line per subsample fraction
#   <count>:<n> <count>:<n>    one line per iteration, sorted by count

# clear the counts file
counts_file = os.path.join(project_dir, 'sequence-downsampling_duplicate-counts_nt-seq.txt')
with open(counts_file, 'w') as f:
    f.write('')

# initialize multiprocessing
p = mp.Pool(maxtasksperchild=1)

# the try/finally makes sure the worker pool is shut down even if a subject fails
try:
    for subject in subjects:
        # print header
        print('=' * (len(subject) + 4))
        print('  ' + subject)
        print('=' * (len(subject) + 4))
        print('')
        with open(counts_file, 'a') as f:
            f.write('#{}\n'.format(subject))

        # get the combined number of unique sequences
        input_files = list_files(os.path.join(input_sequence_dir, subject))
        total_size = get_combined_sequence_count(input_files)

        # process the iterations of each downsample size in parallel
        for fraction in np.arange(0.1, 1.01, 0.1):
            downsample_size = int(round(float(total_size) * fraction, 0))
            with open(counts_file, 'a') as f:
                f.write('>{}\n'.format(fraction))
            print('Subsample:', fraction, '({})'.format(downsample_size))
            async_results = [p.apply_async(downsample, args=(input_files, downsample_size))
                             for _ in range(iterations)]
            monitor_mp_jobs(async_results)
            results = [ar.get() for ar in async_results]
            # write all iterations for this fraction with a single open/close
            with open(counts_file, 'a') as f:
                for counts in results:
                    ostring = ' '.join(['{}:{}'.format(k, counts[k]) for k in sorted(counts.keys())])
                    f.write('{}\n'.format(ostring))
            print('')
        print('\n')
finally:
    p.close()
    p.join()