# Clonotype and sequence deduplication

Starting with annotated sequence data (in AbStar's `minimal` output format), reduces sequences to clonotypes and collapses duplicate clonotypes.

The [`abutils`](https://www.github.com/briney/abutils) Python package is required, and can be installed by running `pip install abutils`

*NOTE: this notebook requires the use of the Unix command line tool `sort`. Thus, it requires a Unix-based operating system to run correctly (MacOS and most flavors of Linux should be fine). Running this notebook on Windows 10 may be possible using the [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/about) but we have not tested this.*

In [4]:
from __future__ import print_function, division

import itertools
import multiprocessing as mp
import os
import subprocess as sp
import sys
import tempfile

from abutils.utils.jobs import monitor_mp_jobs
from abutils.utils.pipeline import list_files, make_dir
from abutils.utils.progbar import progress_bar

### Subjects, directories and data fields

The input data (annotated sequences in [abstar's](https://github.com/briney/abstar) `minimal` format) is too large to be stored in a Github repository. A compressed archive of the data can be downloaded [**here**](http://burtonlab.s3.amazonaws.com/GRP_github_data/techrep-merged_minimal_no-header.tar.gz). The data file is fairly large (about 400GB uncompressed), so make sure you have enough space before downloading. Decompressing the archive from within the `data` directory (located in the same parent directory as this notebook) will allow the code in this notebook to run without modification. If you would prefer to store the input data somewhere else, be sure to modify the `raw_input_dir` path below.

The data fields defined below correspond to the position of each field in abstar's `minimal` format. If for some reason you have a differently formatted annotation file, change the field positions to suit your annotation file.

In [6]:
# subjects (whitespace-separated names, one subject per token)
with open('./data/subjects.txt') as f:
    subjects = sorted(f.read().split())

# directories
raw_input_dir = './data/techrep-merged_minimal_no-header/'
raw_clonotype_dir = './data/techrep-merged_vj-aa/'
dedup_clonotype_dir = './data/dedup_techrep-merged_vj-aa/'
# NOTE: this must be the directory that the biological-replicate deduplication
# writes its unique sequence files to ('dedup_techrep-merged_vdj-nt');
# otherwise the subject pooling steps find no sequence input files.
dedup_sequence_dir = './data/dedup_techrep-merged_vdj-nt/'
logfile = './data/dedup.log'

# data fields: 0-based indices into each comma-separated annotation line
prod_field = 3
v_field = 5
j_field = 9
cdr3aa_field = 12
vdjnt_field = 14

## Deduplication (biological replicates)

In [None]:
def _write_and_dedup(records, raw_file, unique_file):
    """Write `records` to `raw_file` and their unique lines to `unique_file`.

    Args:
        records: list of strings, one record per output line.
        raw_file: path that receives every record (newline-separated).
        unique_file: path that receives the sorted, de-duplicated records.

    Returns:
        The number of lines in `unique_file`.

    Raises:
        subprocess.CalledProcessError: if `sort` exits with a non-zero status.
    """
    with open(raw_file, 'w') as rf:
        rf.write('\n'.join(records))
    # Let `sort` read the file that was just written rather than piping the
    # whole data set through stdin. Passing an argument list (no shell) keeps
    # unusual characters in paths from being interpreted by a shell.
    sp.check_call(['sort', '-u', '-o', unique_file, raw_file])
    # `sort` terminates every output line with a newline, so counting lines
    # here is equivalent to `wc -l`.
    with open(unique_file, 'rb') as uf:
        return sum(1 for _ in uf)


def dedup_bioreps(files, raw_clonotype_dir, unique_clonotype_dir,
                  raw_sequence_dir, unique_sequence_dir, log_file=None):
    """Extract and de-duplicate clonotypes and sequences from annotation files.

    For every input file, each line is split on commas; lines whose
    `prod_field` column equals 'no' are skipped. The remaining lines yield a
    clonotype record ("V-gene J-gene CDR3-aa") and a sequence record
    ("V-gene J-gene VDJ-nt"), using the module-level field indices
    (`v_field`, `j_field`, `cdr3aa_field`, `vdjnt_field`). Raw and unique
    records are written to files named after the input file in the
    corresponding output directories.

    Args:
        files: iterable of paths to comma-separated annotation files.
        raw_clonotype_dir: output directory for all clonotype records.
        unique_clonotype_dir: output directory for unique clonotype records.
        raw_sequence_dir: output directory for all sequence records.
        unique_sequence_dir: output directory for unique sequence records.
        log_file: optional path; when given, raw and unique counts are
            appended to it for every input file.

    Raises:
        subprocess.CalledProcessError: if the external `sort` command fails.
    """
    # set up output directories
    for directory in (raw_clonotype_dir, unique_clonotype_dir,
                      raw_sequence_dir, unique_sequence_dir):
        make_dir(directory)

    # process minimal output files
    for _f in files:
        basename = os.path.basename(_f)
        print(basename)
        clonotypes = []
        sequences = []

        # collect clonotype/sequence information
        with open(_f) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. a trailing empty line)
                    continue
                data = line.split(',')
                if data[prod_field] == 'no':
                    continue
                v_gene = data[v_field]
                j_gene = data[j_field]
                clonotypes.append(' '.join([v_gene, j_gene, data[cdr3aa_field]]))
                sequences.append(' '.join([v_gene, j_gene, data[vdjnt_field]]))

        # (print label, log tag, records, raw output dir, unique output dir)
        datasets = [
            ('clonotypes', 'CLONOTYPES', clonotypes,
             raw_clonotype_dir, unique_clonotype_dir),
            ('sequences', 'SEQUENCES', sequences,
             raw_sequence_dir, unique_sequence_dir),
        ]
        for label, log_tag, records, raw_dir, unique_dir in datasets:
            raw_count = len(records)
            print('raw {}:'.format(label), raw_count)
            # write raw records, then collapse duplicates (without counts)
            unique_count = _write_and_dedup(records,
                                            os.path.join(raw_dir, basename),
                                            os.path.join(unique_dir, basename))
            print('unique {}:'.format(label), unique_count)
            if log_file is not None:
                with open(log_file, 'a') as log:
                    log.write('{}: {} {}\n'.format(log_tag, raw_count, unique_count))

        print('')

In [None]:
# clear the logfile
with open(logfile, 'w') as f:
    f.write('')

# iteratively process each subject
for subject in subjects:
    print(subject)
    with open(logfile, 'a') as f:
        f.write('#' + subject + '\n')
    # Per-subject paths are derived from the configured base directories so the
    # unique files land exactly where the subject-pooling steps read them from.
    # Distinct names are used so the module-level base directories are not
    # overwritten by a per-subject path.
    subject_files = list_files(os.path.join(raw_input_dir, subject))
    subject_raw_clonotype_dir = os.path.join(raw_clonotype_dir, subject)
    subject_unique_clonotype_dir = os.path.join(dedup_clonotype_dir, subject)
    # there is no configured base directory for raw sequence records
    subject_raw_sequence_dir = './data/techrep-merged_vdj-nt/{}'.format(subject)
    subject_unique_sequence_dir = os.path.join(dedup_sequence_dir, subject)
    dedup_bioreps(subject_files,
                  subject_raw_clonotype_dir, subject_unique_clonotype_dir,
                  subject_raw_sequence_dir, subject_unique_sequence_dir,
                  log_file=logfile)
    print('')

## Deduplication (subject pools)

In the previous blocks of code, we created a unique clonotype file for each biological replicate for each subject. Here, we'd like to create a single file for each subject containing only unique clonotypes (regardless of which biological replicate they came from).

In [9]:
# output locations for the per-subject pooled (deduplicated) files
dedup_clonotype_subject_pool_dir = './data/dedup_subject_clonotype_pools/'
dedup_sequence_subject_pool_dir = './data/dedup_subject_sequence_pools/'

# create both output directories
for _pool_dir in (dedup_clonotype_subject_pool_dir,
                  dedup_sequence_subject_pool_dir):
    make_dir(_pool_dir)

First, we want to create a unique clonotype file for each subject that also contains the number of times we saw each clonotype. Because the input is the deduplicated biological replicates, the clonotype count essentially tallies the number of biological replicates in which we observed each clonotype.

In [None]:
# (input base directory, output directory, output file name template)
pool_jobs = [
    # clonotypes
    (dedup_clonotype_dir, dedup_clonotype_subject_pool_dir,
     '{}_dedup_pool_vj-aa_with-counts.txt'),
    # sequences
    (dedup_sequence_dir, dedup_sequence_subject_pool_dir,
     '{}_dedup_pool_vdj-nt_with-counts.txt'),
]

for subject in subjects:
    print(subject)

    for input_dir, pool_dir, name_template in pool_jobs:
        input_files = list(list_files(os.path.join(input_dir, subject)))
        if not input_files:
            # With no file arguments `sort` would read from stdin instead, so
            # skip explicitly rather than hang or write a meaningless file.
            print('WARNING: no input files for {} in {}, skipping'.format(subject, input_dir),
                  file=sys.stderr)
            continue
        ofile = os.path.join(pool_dir, name_template.format(subject))

        # Equivalent of `sort FILES | uniq -c > ofile`, run without a shell so
        # that the exit status of both commands can be checked.
        with open(ofile, 'wb') as out:
            sorter = sp.Popen(['sort'] + input_files, stdout=sp.PIPE)
            counter = sp.Popen(['uniq', '-c'], stdin=sorter.stdout, stdout=out)
            # drop our copy of the pipe so `sort` is signalled if `uniq` exits early
            sorter.stdout.close()
            counter.wait()
            sorter.wait()
        if sorter.returncode != 0 or counter.returncode != 0:
            raise RuntimeError(
                'pooling failed for {} (sort exit status {}, uniq exit status {})'.format(
                    ofile, sorter.returncode, counter.returncode))

Now the same process, but without counts:

In [None]:
# (input base directory, output directory, output file name template)
pool_jobs = [
    # clonotypes
    (dedup_clonotype_dir, dedup_clonotype_subject_pool_dir,
     '{}_dedup_pool_vj-aa.txt'),
    # sequences
    (dedup_sequence_dir, dedup_sequence_subject_pool_dir,
     '{}_dedup_pool_vdj-nt.txt'),
]

for subject in subjects:
    print(subject)

    for input_dir, pool_dir, name_template in pool_jobs:
        input_files = list(list_files(os.path.join(input_dir, subject)))
        if not input_files:
            # With no file arguments `sort` would read from stdin instead, so
            # skip explicitly rather than hang or write a meaningless file.
            print('WARNING: no input files for {} in {}, skipping'.format(subject, input_dir),
                  file=sys.stderr)
            continue
        ofile = os.path.join(pool_dir, name_template.format(subject))

        # `sort -u` merges the inputs and drops duplicate lines in one step
        # (the same approach the per-replicate deduplication uses); check_call
        # raises CalledProcessError if `sort` fails instead of ignoring it.
        sp.check_call(['sort', '-u', '-o', ofile] + input_files)

## Deduplication (cross-subject pools)

Finally, we'd like to create unique clonotype files (with counts) for every groupwise combination of our 10 subjects. Each group can contain two or more subjects, meaning the total number of possible groupwise combinations is quite large. We'll use the `multiprocessing` package to parallelize the process which should speed things up substantially, although even with parallelization, this will take some time.

***NOTE:*** *The output from the following code blocks will be quite large (deduplicated clonotype files are >2TB in total, deduplicated sequence files are >20TB in total). Make sure you have sufficient storage and that the output paths below (`dedup_cross_subject_clonotype_pool_dir` and `dedup_cross_subject_sequence_pool_dir`) are correct before starting.*

In [None]:
# directories
dedup_cross_subject_clonotype_pool_dir = './data/dedup_cross-subject_clonotype_pools/'
dedup_cross_subject_sequence_pool_dir = './data/dedup_cross-subject_sequence_pools/'
make_dir(dedup_cross_subject_clonotype_pool_dir)
make_dir(dedup_cross_subject_sequence_pool_dir)

# deduplicated subject pool files; the '_with-counts' variants do not contain
# these substrings and are therefore left out
dedup_clonotype_subject_files = [f for f in list_files(dedup_clonotype_subject_pool_dir) if '_dedup_pool_vj-aa.txt' in f]
dedup_sequence_subject_files = [f for f in list_files(dedup_sequence_subject_pool_dir) if '_dedup_pool_vdj-nt.txt' in f]

# every possible groupwise combination of subjects (2 or more subjects per group);
# the largest group size follows the number of subjects rather than assuming 10
subject_combinations_by_size = {
    size: [sorted(c) for c in itertools.combinations(subjects, size)]
    for size in range(2, len(subjects) + 1)
}

In [None]:
def dedup_cross_subject_pool(subjects, files, output_dir, temp_dir=None):
    """Pool the unique records of several subjects and count each record.

    Runs the equivalent of `sort FILES | uniq -c` and writes the result to
    `<output_dir>/<subjects joined by '-'>_dedup_pool_vj-aa_with-counts.txt`.

    Args:
        subjects: list of subject names making up the pool.
        files: paths of the per-subject deduplicated pool files. Only files
            whose basename, up to the first '_', is one of `subjects` are used.
        output_dir: existing directory that receives the pooled file.
        temp_dir: optional directory for `sort`'s temporary files (`sort -T`).
            When None, `sort` picks its own default location.

    Raises:
        ValueError: if none of `files` belongs to `subjects`.
        RuntimeError: if `sort` or `uniq` exits with a non-zero status.
    """
    # NOTE(review): the 'vj-aa' output suffix is also used when this function
    # pools sequence files; kept as-is in case other code relies on the name.
    files = sorted({f for f in files
                    if os.path.basename(f).split('_')[0] in subjects})
    if not files:
        # with no file arguments `sort` would read from stdin instead
        raise ValueError('no input files for subjects: {}'.format(', '.join(subjects)))
    output_file = os.path.join(output_dir, '{}_dedup_pool_vj-aa_with-counts.txt'.format('-'.join(subjects)))

    sort_cmd = ['sort']
    if temp_dir is not None:
        sort_cmd += ['-T', temp_dir]
    sort_cmd += files

    # Run the pipeline without a shell so both exit statuses can be checked.
    with open(output_file, 'wb') as out:
        sorter = sp.Popen(sort_cmd, stdout=sp.PIPE)
        counter = sp.Popen(['uniq', '-c'], stdin=sorter.stdout, stdout=out)
        # drop our copy of the pipe so `sort` is signalled if `uniq` exits early
        sorter.stdout.close()
        counter.wait()
        sorter.wait()
    if sorter.returncode != 0 or counter.returncode != 0:
        raise RuntimeError(
            'pooling failed for {} (sort exit status {}, uniq exit status {})'.format(
                output_file, sorter.returncode, counter.returncode))

### Clonotypes

In [None]:
# one fresh worker process per task (maxtasksperchild=1)
pool = mp.Pool(maxtasksperchild=1)

try:
    for size in sorted(subject_combinations_by_size):
        subject_combinations = subject_combinations_by_size[size]
        async_results = []
        print('{}-subject pools:'.format(size))
        progress_bar(0, len(subject_combinations))
        for sub_comb in subject_combinations:
            # per-subject pool files belonging to this combination
            files = sorted({f for f in dedup_clonotype_subject_files
                            if os.path.basename(f).split('_')[0] in sub_comb})
            async_results.append(pool.apply_async(dedup_cross_subject_pool,
                                                  args=(sub_comb, files, dedup_cross_subject_clonotype_pool_dir)))
        monitor_mp_jobs(async_results)
        # An exception raised inside a worker is only re-raised when the result
        # is fetched; without this, failed pools would go unnoticed.
        for result in async_results:
            result.get()
        print('\n')
finally:
    # On success every job has already been collected, so only idle workers
    # remain; on error the outstanding jobs are abandoned.
    pool.terminate()
    pool.join()

### Sequences

Just one more warning that the following code block will produce a very large amount of data (>20TB) and will take many hours to run even on a fairly robust server (an `m4.16xlarge` AWS EC2 instance, for example).

In [None]:
# one fresh worker process per task (maxtasksperchild=1)
pool = mp.Pool(maxtasksperchild=1)

try:
    for size in sorted(subject_combinations_by_size):
        subject_combinations = subject_combinations_by_size[size]
        async_results = []
        print('{}-subject pools:'.format(size))
        progress_bar(0, len(subject_combinations))
        for sub_comb in subject_combinations:
            # per-subject pool files belonging to this combination
            files = sorted({f for f in dedup_sequence_subject_files
                            if os.path.basename(f).split('_')[0] in sub_comb})
            async_results.append(pool.apply_async(dedup_cross_subject_pool,
                                                  args=(sub_comb, files, dedup_cross_subject_sequence_pool_dir)))
        monitor_mp_jobs(async_results)
        # An exception raised inside a worker is only re-raised when the result
        # is fetched; without this, failed pools would go unnoticed.
        for result in async_results:
            result.get()
        print('\n')
finally:
    # On success every job has already been collected, so only idle workers
    # remain; on error the outstanding jobs are abandoned.
    pool.terminate()
    pool.join()