# CDR3 entropy in shared and unshared clonotypes

Starting with unique cross-subject clonotype datasets, this notebook computes per-position Shannon entropy for CDR3s in unshared (found in only 1 sample) and shared (found in at least 6 of 10 samples) clonotypes.

The following Python packages are required:
  * numpy
  * pandas

They can be installed by running `pip install numpy pandas`.

In [1]:
from __future__ import print_function

from collections import Counter
import os
import subprocess as sp
import sys

import numpy as np
import pandas as pd

## Get sequences

The raw dataset (unique cross-subject clonotypes) is too large to be included in this GitHub repo. Instead, a compressed archive containing all of the required data can be downloaded [**HERE**](http://burtonlab.s3.amazonaws.com/GRP_github_data/dedup_10-subject_pools.tar.gz). Decompressing the archive in the `./data` directory will allow the following code blocks to run without modification.

***NOTE:*** *The required data files are relatively large (~30GB in total), so ensure adequate storage space is available before downloading.*

In [4]:
def get_sequences(seq_files):
    """Load CDR3 sequences and split them into shared and unshared pools.

    Each file is read as whitespace-delimited text, one record per line.
    The first field is parsed as an integer (per the notebook text, the
    number of samples in which the clonotype was found) and the fourth
    field is taken as the CDR3 amino acid sequence. Records with a value
    of 1 are 'unshared', records with a value of 6 through 10 are
    'shared', and everything else is ignored. Only sequences of length
    7 through 14 are kept. Blank lines and lines with fewer than four
    fields are skipped.

    Args:
        seq_files: dict mapping a dataset label to the path of its
            sequence file. Any set of labels is accepted.

    Returns:
        tuple: ``(all_seqs, selected_seqs)``. Both are nested dicts of the
        form ``{label: {'shared' | 'unshared': {length: sequences}}}``.
        ``all_seqs`` holds every retained sequence as a list. In
        ``selected_seqs`` the larger pool at each length is randomly
        downsampled (without replacement, via ``np.random.choice``, so it
        becomes a NumPy array) to the size of the smaller pool; the
        smaller pool is passed through unchanged.

    Raises:
        ValueError: if the first field of a non-blank line is not an
            integer.
    """
    lengths = range(7, 15)
    pools = ('shared', 'unshared')

    all_seqs = {}
    for seq_type, seq_file in seq_files.items():
        seqs = {pool: {length: [] for length in lengths} for pool in pools}
        with open(seq_file) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                count = int(fields[0])
                if count == 1:
                    pool = 'unshared'
                elif 6 <= count <= 10:
                    pool = 'shared'
                else:
                    continue
                # Truncated records carry no CDR3 field; skip them.
                if len(fields) < 4:
                    continue
                aa = fields[3]
                if 7 <= len(aa) <= 14:
                    seqs[pool][len(aa)].append(aa)
        all_seqs[seq_type] = seqs

    # Downselect sequences so that shared and unshared pools are the same
    # size. Iterate over the labels actually loaded rather than a fixed
    # list, so any set of input files is handled.
    selected_seqs = {}
    for seq_type, seqs in all_seqs.items():
        selected = {pool: {} for pool in pools}
        for length in lengths:
            num_seqs = min(len(seqs[pool][length]) for pool in pools)
            for pool in pools:
                candidates = seqs[pool][length]
                if len(candidates) > num_seqs:
                    candidates = np.random.choice(candidates, size=num_seqs, replace=False)
                selected[pool][length] = candidates
        selected_seqs[seq_type] = selected
    return all_seqs, selected_seqs

In [6]:
# Map each dataset label to the file holding its deduplicated clonotype pool,
# then load the sequences for every dataset.
files = {}
files['observed'] = './data/dedup_10-subject_pools/10-subject_dedup_pool_with-counts.txt'
files['subject-specific synthetic'] = './data/dedup_10-subject_pools/10-sample_dedup_pool_synthetic_subject-specific-models_with-counts.txt'

all_seqs, seq_dict = get_sequences(files)

## Compute shared/unshared CDR3 entropy

In [8]:
def calculate_entropies(seq_dict, seq_type):
    """Compute per-position entropies for one dataset.

    For every sharing category and CDR3 length, the sequences are
    transposed into per-position residue columns. The first three and last
    three positions are dropped, and ``entropy`` is evaluated on each
    remaining column.

    Args:
        seq_dict: nested mapping ``{sharing category: {length: sequences}}``.
        seq_type: label of the dataset, copied into every output record.

    Returns:
        list: one dict per retained position with the keys 'sample',
        'seq_type', 'Shannon entropy', 'CDR3 length' and 'shared'.
    """
    records = []
    for shared_label, by_length in seq_dict.items():
        sample_name = '{} ({})'.format(seq_type, shared_label)
        for cdr3_length, sequences in by_length.items():
            # Transpose: one tuple of residues per CDR3 position.
            columns = list(zip(*sequences))
            for column in columns[3:-3]:
                value = entropy(column)
                records.append({
                    'sample': sample_name,
                    'seq_type': seq_type,
                    'Shannon entropy': value,
                    'CDR3 length': cdr3_length,
                    'shared': shared_label,
                })
    return records


def entropy(residues):
    """Return the normalized Shannon entropy of a collection of residues.

    The entropy is computed from the observed residue frequencies with the
    natural logarithm and divided by ``log(k)``, where ``k`` is the number
    of distinct residues observed, so the result lies between 0 and 1.

    Args:
        residues: a sized iterable of hashable items (for example a tuple
            of single-letter amino acid codes, or a string).

    Returns:
        float: ``0.`` when there are fewer than two residues or only one
        distinct residue; otherwise the normalized entropy.
    """
    n_residues = len(residues)
    if n_residues <= 1:
        return 0.
    # Materialize the counts as a list: on Python 3 ``dict.values()`` is a
    # view object that NumPy cannot convert into a float array directly.
    counts = np.array(list(Counter(residues).values()), dtype=np.float64)
    n_classes = len(counts)
    if n_classes <= 1:
        return 0.
    # Counter never stores zero counts, so every probability is positive
    # and the logarithm is always defined.
    probs = counts / n_residues
    return -np.sum(probs * np.log(probs)) / np.log(n_classes)

In [None]:
# Compute per-position entropies for every dataset, gather them into a single
# table and save it as CSV.
entropy_data = []
# This cell computes entropies; the sequences were already loaded above.
print('Calculating entropies...')
for seq_type in seq_dict.keys():
    print(seq_type)
    entropies = calculate_entropies(seq_dict[seq_type], seq_type)
    entropy_data += entropies

entropy_df = pd.DataFrame(entropy_data)
entropy_df.to_csv('./data/per-position_shannon_entropies.csv')

## Shared CDR3 sequence properties

In [None]:
# Concatenate the entries stored under the keys 6 through 10 of `seqs` into one
# flat list.
# NOTE(review): `seqs` is not assigned at module level in any of the visible
# cells (it only exists as a local inside get_sequences, where it is keyed by
# 'shared'/'unshared' rather than by integer) — confirm where it is meant to
# come from before running this cell.
shared_seqs = []
for n in range(6, 11):
    shared_seqs += seqs[n]