# Estimating repertoire diversity

In [72]:
from __future__ import print_function, division

import ast
from datetime import datetime
import json
import multiprocessing as mp
import os
import pickle
import subprocess as sp
import sys
import tempfile
import time

import numpy as np
import pandas as pd

from scipy import stats

from abutils.utils.jobs import monitor_mp_jobs
from abutils.utils.pipeline import list_files, make_dir
from abutils.utils.progbar import progress_bar

### Subjects

In [58]:
# Read the whitespace-separated subject IDs and keep them in sorted order.
with open('./data/subjects.txt') as f:
    subject_tokens = f.read().split()
subjects = sorted(subject_tokens)

## Diversity estimation functions

In order to perform the diversity estimate using Recon, you must first download and install [Recon](https://arnaoutlab.github.io/Recon/). If necessary, update the `recon_path` variable below, which should point to the `recon.py` file.  

Additionally, because Recon requires Python 2.7 and your default Python executable (if you're able to run this notebook) is likely Python 3.x, we need directions to a Python 2.7 executable. If you only have Python 3.x and would prefer to keep your default Python executable unchanged, one way to easily install Python 2.7 on a system with a pre-existing Python 3.x install is to use [Anaconda](https://www.anaconda.com/download/). When installing Python 2.7 via Anaconda, respond `no` when prompted to "prepend the Anaconda2 install location to PATH". This will ensure that your default Python executable will remain unchanged. Make sure to update the `python_path` variable below to point to a Python 2.7 executable.

In [59]:
# Recon script and the Python 2.7 interpreter used to run it; a leading '~'
# is expanded to the user's home directory.
recon_path = os.path.expanduser('~/recon.py')
python_path = os.path.expanduser('~/anaconda2/bin/python')

# Output locations for the raw Recon output files and for the diversity
# estimate JSON files; both are handed to make_dir before use.
recon_raw_data_path = './data/user-calculated_raw-recon-data'
diversity_output_dir = './data/user-calculated_diversity-estimation'
for _output_dir in (recon_raw_data_path, diversity_output_dir):
    make_dir(_output_dir)

In [60]:
def do_chao2(counts, m=6):
    """Compute the bias-corrected Chao2 richness estimate.

    Evaluates ``S_obs + ((m - 1) / m) * (Q1 * (Q1 - 1)) / (2 * (Q2 + 1))``
    where ``S_obs`` is the sum of all values in ``counts`` and ``Q1`` / ``Q2``
    are the entries stored under the keys 1 and 2.

    Args:
        counts: Dict whose values are summed to give ``S_obs``. The ``Q1`` and
            ``Q2`` entries may be keyed by integers (``1``, ``2``) or by their
            string form (``'1'``, ``'2'``). A missing entry is treated as 0.
        m: Value of ``m`` in the formula (presumably the number of sampling
            units -- confirm against the caller's data).

    Returns:
        The estimate as a float.
    """
    m = float(m)
    s_obs = float(sum(counts.values()))
    if 1 in counts and 2 in counts:
        q1 = float(counts[1])
        q2 = float(counts[2])
    else:
        # String keys take precedence here (as before); a class that is absent
        # under both key types contributes 0 instead of raising KeyError. The
        # (q2 + 1) denominator keeps the formula defined when q2 is 0.
        q1 = float(counts.get('1', counts.get(1, 0)))
        q2 = float(counts.get('2', counts.get(2, 0)))
    return s_obs + ((m - 1) / m) * ((q1 * (q1 - 1)) / (2 * (q2 + 1)))


def do_recon(counts=None, infile=None, outfile='/dev/null', with_ci=False):
    """Run Recon on a count table and return a value parsed from its output.

    The script at ``recon_path`` is run with the interpreter at ``python_path``
    and the options ``-R -c -o <outfile> <infile>``. The second line of its
    standard output is parsed as a Python literal ``data`` and
    ``data[2] + sum(data[3].values())`` is returned (presumably the estimated
    total diversity -- confirm against the Recon documentation).

    Args:
        counts: Dict written to a temporary tab-separated file (one
            ``key<TAB>value`` line per entry, ordered by integer key) when
            ``infile`` is not given. The temporary file is removed afterwards.
        infile: Path to an existing input file; takes precedence over
            ``counts``.
        outfile: Path handed to Recon's ``-o`` option.
        with_ci: Currently unused; kept for interface compatibility.

    Returns:
        ``data[2] + sum(data[3].values())`` from the parsed output line.

    Raises:
        ValueError: If neither ``counts`` nor ``infile`` is provided.
        RuntimeError: If Recon printed fewer than two lines on stdout.
    """
    if counts is None and infile is None:
        raise ValueError('do_recon requires either counts or infile')

    tmp_path = None
    if infile is None:
        rows = ['{}\t{}'.format(k, v)
                for k, v in sorted(counts.items(), key=lambda kv: int(kv[0]))]
        with tempfile.NamedTemporaryFile(delete=False, mode='w') as inhandle:
            inhandle.write('\n'.join(rows))
            tmp_path = inhandle.name
        infile = tmp_path

    try:
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact; '~' is expanded here because no shell does it.
        recon_cmd = [python_path, recon_path, '-R', '-c',
                     '-o', os.path.expanduser(str(outfile)),
                     os.path.expanduser(str(infile))]
        p = sp.Popen(recon_cmd, stdout=sp.PIPE, stderr=sp.PIPE)
        stdout, stderr = p.communicate()
    finally:
        # Only remove the file this call created, never a caller-supplied one.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)

    lines = stdout.decode().strip().split('\n')
    if len(lines) < 2:
        raise RuntimeError(
            'Recon produced no parsable output (exit code {}): {}'.format(
                p.returncode, stderr.decode('utf-8', 'replace').strip()))
    data = ast.literal_eval(lines[1])
    return data[2] + sum(data[3].values())


def confidence_interval(data, interval=0.95):
    """Return the central interval of a normal distribution fitted to ``data``.

    The normal distribution is centred on ``np.mean(data)`` and scaled by
    ``np.std(data)`` (NumPy's default settings).

    Args:
        data: Sequence of numeric values.
        interval: Probability mass the returned interval should contain.

    Returns:
        A ``(lower, upper)`` pair from ``scipy.stats.norm.interval``.
    """
    center = np.mean(data)
    spread = np.std(data)
    return stats.norm.interval(interval, loc=center, scale=spread)

## Clonotype diversity estimation (by subject)

### Load data

In [81]:
# input data location
data_path = './data/equal_fraction_downsampling/'
clonotype_data_filename = 'clonotype-downsampling_duplicate-counts_vj-aa.txt'
clonotype_data_file = os.path.join(data_path, clonotype_data_filename)

# read data file
# Layout, as parsed below: each '#' opens a subject record whose first line is
# the subject name; inside a record each '>' opens a subsample whose first line
# is the subsample size, followed by one line per iteration made of
# whitespace-separated 'key:value' integer pairs.
clonotype_data = {}
with open(clonotype_data_file) as f:
    subject_samples = f.read().split('#')[1:]

for ss in subject_samples:
    subject = ss.partition('\n')[0]
    per_size = clonotype_data[subject] = {}
    # The first character of the record is dropped before splitting on '>', and
    # the piece before the first '>' (the subject-name line) is discarded.
    for subsample in ss[1:].split('>')[1:]:
        lines = subsample.split('\n')
        per_size[float(lines[0])] = [
            {int(k): int(v) for k, v in (pair.split(':') for pair in line.split())}
            for line in lines[1:]
            if line.split()  # skip blank lines
        ]

### Estimate diversity

While diversity estimation with the Chao2 estimator is quite fast (only a second or two to process all downsamples from all subjects), Recon is much more compute intensive. Even with multiprocessing, the code block below requires about half an hour to run on a modern Macbook Pro.

In [83]:
# chao diversity
print('\n'.join(('========', '  CHAO', '========', '')))
chao_diversity = {}
for subject in clonotype_data:
    print(subject)
    # One Chao2 estimate per downsampling iteration, grouped by subsample size.
    chao_diversity[subject] = {
        subsample: [do_chao2(iteration) for iteration in iterations]
        for subsample, iterations in clonotype_data[subject].items()
    }

# Save the estimates as JSON in the user output directory.
chao_file = os.path.join(diversity_output_dir, 'single-subject_clonotypes_chao2.json')
with open(chao_file, 'w') as f:
    json.dump(chao_diversity, f)
print('\n\n')

# recon diversity
print('=========')
print('  RECON')
print('=========')
print('')
recon_diversity = {}
# Each downsampling iteration is submitted as its own do_recon job.
p = mp.Pool(maxtasksperchild=1)
try:
    # Iterate the clonotype data loaded above; the name `data` is not defined
    # anywhere in this notebook.
    for subject in clonotype_data:
        print(subject)
        start_time = datetime.now()
        subsample_count = len(clonotype_data[subject])
        progress_bar(0, subsample_count, start_time=start_time)
        recon_diversity[subject] = {}
        for scount, subsample in enumerate(clonotype_data[subject]):
            async_results = []
            for i, iteration in enumerate(clonotype_data[subject][subsample]):
                # Raw Recon output is kept per subject / subsample / iteration.
                outfile = os.path.join(recon_raw_data_path,
                                       '{}_{}_{}'.format(subject, subsample, i))
                async_results.append(p.apply_async(do_recon, args=(iteration, None, outfile)))
            # Wait for every iteration of this subsample before moving on.
            recon_diversity[subject][subsample] = [ar.get() for ar in async_results]
            progress_bar(scount + 1, subsample_count, start_time=start_time)
        print('')
finally:
    # Release the worker processes even if a Recon job fails.
    p.close()
    p.join()
recon_file = os.path.join(diversity_output_dir, 'single-subject_clonotypes_recon.json')
with open(recon_file, 'w') as f:
    json.dump(recon_diversity, f)

  CHAO

316188
326650
326651
326713
326737
326780
326797
326907
327059
D103





## Clonotype diversity estimation (multi-subject pools)

### Load data

In [38]:
# input files
cross_subject_data_dir = './data/cross-subject_clonotype_duplicate-counts/'
files = [path for path in list_files(cross_subject_data_dir)
         if 'occurrence-counts.txt' in path]

# organize by group size
# The group size is the number of '-'-separated tokens in the part of the file
# name that precedes the first '_'.
files_by_subject_count = {i: [] for i in range(1, 11)}
for f in files:
    subject_tokens = os.path.basename(f).split('_')[0].split('-')
    files_by_subject_count[len(subject_tokens)].append(f)

# load data
# Every non-blank line holds two whitespace-separated fields; the first is kept
# as a string key and the second is stored as an integer.
clonotype_counts = {}
for num_subs, group_files in files_by_subject_count.items():
    group_counts = []
    for ifile in group_files:
        with open(ifile) as f:
            fields = (line.strip().split() for line in f if line.strip())
            _counts = {n: int(c) for n, c in fields}
        group_counts.append(_counts)
    clonotype_counts[num_subs] = group_counts

### Estimate diversity

In [54]:
def cross_subject_diversity(counts, estimator=None):
    """Estimate diversity for every pool and summarise the results per group size.

    Each count table in ``counts`` is submitted to a multiprocessing pool
    running either ``do_chao2`` or ``do_recon``. Group sizes are processed in
    sorted order, and the per-group summaries are appended in that order.

    Args:
        counts: Dict mapping group size (number of pooled subjects) to a list
            of count tables, one per pool of that size.
        estimator: ``'chao'`` (case-insensitive) selects ``do_chao2``; any
            other value, including the default ``None``, selects ``do_recon``.

    Returns:
        Dict of lists with the keys ``raw_xs`` / ``raw_ys`` (group size and
        estimate for every pool), ``means`` / ``lowers`` / ``uppers`` (mean
        and ``confidence_interval`` bounds of the estimates per group size)
        and ``lognorm_means`` / ``lognorm_lowers`` / ``lognorm_uppers``
        (log10 of the mean, and ``confidence_interval`` bounds of the
        log10-transformed estimates). For a group with a single pool, the
        mean and both bounds are that one estimate.
    """
    # None falls through to Recon, like any other non-'chao' value.
    use_chao = estimator is not None and estimator.lower() == 'chao'

    means = []
    lognorm_means = []
    lowers = []
    uppers = []
    lognorm_lowers = []
    lognorm_uppers = []
    raw_xs = []
    raw_ys = []

    p = mp.Pool(maxtasksperchild=1)
    try:
        for num_subjects in sorted(counts):
            print('Group size:', num_subjects)
            group = counts[num_subjects]
            progress_bar(0, len(group))
            async_results = []
            for _counts in group:
                if use_chao:
                    # Single-subject groups use m=6 (presumably the number of
                    # sampling units per subject -- TODO confirm).
                    m = int(num_subjects) if int(num_subjects) > 1 else 6
                    async_results.append(p.apply_async(do_chao2, args=(_counts, m)))
                else:
                    async_results.append(p.apply_async(do_recon, args=(_counts, )))
            monitor_mp_jobs(async_results)
            divs = [ar.get() for ar in async_results]

            raw_xs.extend([num_subjects] * len(divs))
            raw_ys.extend(divs)
            if len(divs) == 1:
                # A single estimate has no spread: the bounds collapse onto it.
                mean = lower = upper = divs[0]
                lognorm_mean = np.log10(mean)
                lognorm_lower, lognorm_upper = np.log10(lower), np.log10(upper)
            else:
                mean = np.mean(divs)
                lognorm_mean = np.log10(mean)
                lower, upper = confidence_interval(divs)
                lognorm_lower, lognorm_upper = confidence_interval([np.log10(d) for d in divs])
            means.append(mean)
            lognorm_means.append(lognorm_mean)
            lowers.append(lower)
            uppers.append(upper)
            lognorm_lowers.append(lognorm_lower)
            lognorm_uppers.append(lognorm_upper)
            print('\n')
    finally:
        # Release the worker processes; previously every call left a pool open.
        p.close()
        p.join()

    ret = {'raw_xs': raw_xs, 'raw_ys': raw_ys,
           'means': means, 'uppers': uppers, 'lowers': lowers,
           'lognorm_means': lognorm_means, 'lognorm_uppers': lognorm_uppers, 'lognorm_lowers': lognorm_lowers}
    return ret

In [55]:
# Chao2 estimates for the pooled clonotype counts, saved as JSON.
chao_diversity = cross_subject_diversity(clonotype_counts, estimator='chao')

# NOTE(review): this writes outside `diversity_output_dir`, which the
# single-subject cells use -- confirm the directory exists and is intended.
chao_json_path = './data/diversity_estimation/cross-subject_clonotypes_chao2.json'
with open(chao_json_path, 'w') as f:
    json.dump(chao_diversity, f)

Group size: 1
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 2
(45/45) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 3
(120/120) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 4
(210/210) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 5
(252/252) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 6
(210/210) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 7
(120/120) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 8
(45/45) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 9
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 10
(1/1) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  




In [56]:
# Recon estimates for the pooled clonotype counts, saved as JSON.
recon_diversity = cross_subject_diversity(clonotype_counts, estimator='recon')

# NOTE(review): this writes outside `diversity_output_dir`, which the
# single-subject cells use -- confirm the directory exists and is intended.
recon_json_path = './data/diversity_estimation/cross-subject_clonotypes_recon.json'
with open(recon_json_path, 'w') as f:
    json.dump(recon_diversity, f)

Group size: 1
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 2
(45/45) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 3
(120/120) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 4
(210/210) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 5
(252/252) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 6
(210/210) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 7
(120/120) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 8
(45/45) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 9
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 10
(1/1) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  




## Sequence diversity estimation (by subject)

### Load data

In [79]:
# input data location
data_path = './data/equal_fraction_downsampling/'
sequence_data_filename = 'sequence-downsampling_duplicate-counts_nt-seq.txt'
sequence_data_file = os.path.join(data_path, sequence_data_filename)

# read data file
# Layout, as parsed below: each '#' opens a subject record whose first line is
# the subject name; inside a record each '>' opens a subsample whose first line
# is the subsample size, followed by one line per iteration made of
# whitespace-separated 'key:value' integer pairs.
sequence_data = {}
with open(sequence_data_file) as f:
    subject_samples = f.read().split('#')[1:]

for ss in subject_samples:
    subject = ss.partition('\n')[0]
    per_size = sequence_data[subject] = {}
    # The first character of the record is dropped before splitting on '>', and
    # the piece before the first '>' (the subject-name line) is discarded.
    for subsample in ss[1:].split('>')[1:]:
        lines = subsample.split('\n')
        per_size[float(lines[0])] = [
            {int(k): int(v) for k, v in (pair.split(':') for pair in line.split())}
            for line in lines[1:]
            if line.split()  # skip blank lines
        ]

### Estimate diversity

While diversity estimation with the Chao2 estimator is quite fast (only a second or two to process all downsamples from all subjects), Recon is much more compute intensive. Even with multiprocessing, the code block below requires just under an hour to run on a modern Macbook Pro.

In [85]:
# chao diversity
print('\n'.join(('========', '  CHAO', '========', '')))
chao_diversity = {}
for subject in sequence_data:
    print(subject)
    # One Chao2 estimate per downsampling iteration, grouped by subsample size.
    chao_diversity[subject] = {
        subsample: [do_chao2(iteration) for iteration in iterations]
        for subsample, iterations in sequence_data[subject].items()
    }

# Save the estimates as JSON in the user output directory.
chao_file = os.path.join(diversity_output_dir, 'single-subject_sequences_chao2.json')
with open(chao_file, 'w') as f:
    json.dump(chao_diversity, f)
print('\n\n')


# recon diversity
print('=========')
print('  RECON')
print('=========')
print('')
recon_diversity = {}
# Each downsampling iteration is submitted as its own do_recon job.
p = mp.Pool(maxtasksperchild=1)
try:
    # Iterate the sequence data loaded above; the name `data` is not defined
    # anywhere in this notebook.
    for subject in sequence_data:
        print(subject)
        start_time = datetime.now()
        subsample_count = len(sequence_data[subject])
        progress_bar(0, subsample_count, start_time=start_time)
        recon_diversity[subject] = {}
        for scount, subsample in enumerate(sequence_data[subject]):
            async_results = []
            for i, iteration in enumerate(sequence_data[subject][subsample]):
                # Raw Recon output is kept per subject / subsample / iteration.
                outfile = os.path.join(recon_raw_data_path,
                                       '{}_{}_{}'.format(subject, subsample, i))
                async_results.append(p.apply_async(do_recon, args=(iteration, None, outfile)))
            # Wait for every iteration of this subsample before moving on.
            recon_diversity[subject][subsample] = [ar.get() for ar in async_results]
            progress_bar(scount + 1, subsample_count, start_time=start_time)
        print('')
finally:
    # Release the worker processes even if a Recon job fails.
    p.close()
    p.join()
recon_file = os.path.join(diversity_output_dir, 'single-subject_sequences_recon.json')
with open(recon_file, 'w') as f:
    json.dump(recon_diversity, f)

  CHAO

316188
326650
326651
326713
326737
326780
326797
326907
327059
D103



  RECON

316188
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (03:56)  
326650
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (03:37)  
326651
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (04:14)  
326713
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (04:08)  
326737
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (03:19)  
326780
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (04:02)  
326797
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (03:54)  
326907
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (03:35)  
327059
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (03:55)  
D103
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  (03:05)  


## Sequence diversity estimation (cross-subject pools)

### Load data

In [88]:
# input files
cross_subject_data_dir = './data/cross-subject_sequence_duplicate-counts/'
files = [path for path in list_files(cross_subject_data_dir)
         if 'occurrence-counts.txt' in path]

# organize by group size
# The group size is the number of '-'-separated tokens in the part of the file
# name that precedes the first '_'.
files_by_subject_count = {i: [] for i in range(1, 11)}
for f in files:
    subject_tokens = os.path.basename(f).split('_')[0].split('-')
    files_by_subject_count[len(subject_tokens)].append(f)

# load data
# Every non-blank line holds two whitespace-separated fields; the first is kept
# as a string key and the second is stored as an integer.
sequence_counts = {}
for num_subs, group_files in files_by_subject_count.items():
    group_counts = []
    for ifile in group_files:
        with open(ifile) as f:
            fields = (line.strip().split() for line in f if line.strip())
            _counts = {n: int(c) for n, c in fields}
        group_counts.append(_counts)
    sequence_counts[num_subs] = group_counts

### Estimate diversity

In [89]:
# Chao2 estimates for the pooled sequence counts, saved as JSON.
chao_diversity = cross_subject_diversity(sequence_counts, estimator='chao')

# NOTE(review): this writes outside `diversity_output_dir`, which the
# single-subject cells use -- confirm the directory exists and is intended.
chao_json_path = './data/diversity_estimation/cross-subject_sequences_chao2.json'
with open(chao_json_path, 'w') as f:
    json.dump(chao_diversity, f)

Group size: 1
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 2
(45/45) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 3
(120/120) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 4
(210/210) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 5
(252/252) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 6
(210/210) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 7
(120/120) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 8
(45/45) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 9
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 10
(1/1) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  




In [90]:
# Recon estimates for the pooled sequence counts, saved as JSON.
recon_diversity = cross_subject_diversity(sequence_counts, estimator='recon')

# NOTE(review): this writes outside `diversity_output_dir`, which the
# single-subject cells use -- confirm the directory exists and is intended.
recon_json_path = './data/diversity_estimation/cross-subject_sequences_recon.json'
with open(recon_json_path, 'w') as f:
    json.dump(recon_diversity, f)

Group size: 1
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 2
(45/45) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 3
(120/120) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 4
(210/210) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 5
(252/252) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 6
(210/210) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 7
(120/120) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 8
(45/45) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 9
(10/10) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


Group size: 10
(1/1) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%  


