In [None]:
import itertools
import numpy as np
import pandas as pd
import scipy.stats
import pyrepseq as prs
import pyrepseq.plotting as pp
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.cluster.hierarchy as hc
import tidytcells as tt
import warnings

from tqdm.notebook import tqdm

# Load metadata and subset to TST_D7 samples

In [None]:
# Read the sample sheet and keep only the rows whose tissue is TST_D7.
meta = (
    pd.read_csv('data/metadata.csv')
    .loc[lambda frame: frame['tissue'] == 'TST_D7']
)
# Show how many metadata rows remain after the filter.
len(meta)

# Combine and standardise data for TST_D7 samples

In [None]:
# Chain labels present in the selected metadata, in sorted order.
chains = sorted(meta['chain'].unique())
# metadata rows per chain
meta_chains = {}
# list of dataframes
dfs_chains = {}
# concatenated dataframes
dfc_chains = {}

for chain in chains:
    meta_chain = meta[meta['chain']==chain]
    meta_chains[chain] = meta_chain

    # Upper-cased first letter of the chain label (e.g. 'alpha' -> 'A'), used in
    # the standardised column names. It is constant for the whole chain, so it is
    # computed once here instead of once per file.
    c = chain[0].capitalize()
    # Input column name -> standardised column name for this chain.
    mapper = {
        'v_call': f"TR{c}V",
        'junction_aa': f"CDR3{c}",
        'j_call': f"TR{c}J",
        'duplicate_count': "clonal_count",
        'sequence': f'CDR3{c}_NT',
    }

    dfs = []
    for _index, row in tqdm(meta_chain.iterrows(), total=meta_chain.shape[0]):
        df = pd.read_csv("data/"+row['Filename_processed'], sep='\t')
        # Keep only the columns that are renamed by the mapper.
        df = df[['v_call', 'j_call',
                 'junction_aa', 'duplicate_count',
                 'sequence']]
        df = prs.standardize_dataframe(df, mapper)
        # Per-file frequency: each row's count divided by the file's total count.
        total_count = np.sum(df['clonal_count'])
        df['clonal_frequency'] = df['clonal_count']/total_count
        # Annotate every row with the metadata of the sample it came from.
        df['tissue'] = row['tissue']
        df['chain'] = row['chain']
        df['UIN'] = row['UIN']
        # Build the label from the metadata scalars; string formatting keeps this
        # working even if UIN was parsed as a number by read_csv.
        df['sample'] = f"{row['UIN']}_{row['tissue']}_{row['chain']}"
        # V gene + CDR3 + J gene concatenated without a separator.
        df['bioidentity'] = df[f'TR{c}V'] + df[f'CDR3{c}'] + df[f'TR{c}J']
        dfs.append(df)
    dfc = pd.concat(dfs).reset_index(drop=True)
    dfs_chains[chain] = dfs
    dfc_chains[chain] = dfc

# Define limits for down-sampling

In [None]:
# Thresholds on a sample's summed clonal_count used by the down-sampling step:
# samples below `minsize` are excluded, samples above `maxsize` are subsampled to it.
minsize = 5_000
maxsize = 10_000

# Perform down-sampling (and print out excluded samples)

In [None]:
# Down-sampled, concatenated dataframe per chain.
dfc_chains_subsampled = {}

# NOTE(review): prs.subsample is presumably random and no seed is set in the
# visible cells, so the output may differ between runs - consider seeding.
for chain in chains:

    dfs = []
    for sample, df in dfc_chains[chain].groupby('sample'):
        # Fresh 0..n-1 index so the indices returned by prs.subsample can be used
        # as row labels below (presumably they are positions into the counts
        # vector - confirm against the pyrepseq documentation).
        df = df.dropna(how='all').reset_index(drop=True)
        total_counts = df['clonal_count'].sum()
        if total_counts>maxsize:
            index, counts = prs.subsample(df['clonal_count'], maxsize)
            # Build a new frame instead of writing into the result of .loc.
            # Only clonal_count is replaced; every other column (including
            # clonal_frequency) keeps its pre-subsampling value.
            # NOTE(review): confirm the stale clonal_frequency is intended.
            df_subsampled = df.loc[index].assign(clonal_count=counts)
        elif total_counts < minsize:
            # Too shallow to include: report the sample label and skip it.
            print(sample)
            continue
        else:
            # Between minsize and maxsize: kept as is.
            df_subsampled = df
        dfs.append(df_subsampled)

    if not dfs:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            f"No '{chain}' sample has at least {minsize} counts; nothing to concatenate"
        )
    dfc = pd.concat(dfs).reset_index(drop=True)
    dfc_chains_subsampled[chain] = dfc

# Print out number of included samples

In [None]:
# For each chain: collapse to one row per sample, then count those rows by chain label.
tuple(
    dfc_chains_subsampled[name].drop_duplicates('sample')['chain'].value_counts()
    for name in ('alpha', 'beta')
)

# Save to file

In [None]:
# Write one gzip-compressed CSV per chain.
for chain in chains:
    chain_letter = chain[0].upper()
    v_col = f'TR{chain_letter}V'

    df = dfc_chains_subsampled[chain].copy()
    # Preserve the V call as it was before the suffix is added.
    df[f'{v_col}short'] = df[v_col].copy()
    # Stringify every V call and append the '*01' suffix.
    df[v_col] = df[v_col].map(lambda v: str(v)+'*01')
    # Keep only rows whose V call starts with the expected TR<letter>V prefix.
    has_prefix = df[v_col].str.startswith(v_col)
    df = df[has_prefix].copy()

    out_path = f'data/combined_subsampled_{minsize}_{maxsize}_{chain}.csv.gz'
    df.to_csv(out_path, index=False)