In [None]:
import itertools
import numpy as np
import pandas as pd
import scipy.stats
import pyrepseq as prs
import pyrepseq.plotting as pp
import matplotlib.pyplot as plt
import seaborn as sns
from statannotations.Annotator import Annotator
import scipy.cluster.hierarchy as hc
import tidytcells as tt
import warnings

from ipywidgets import IntProgress
from IPython.display import display

# Analysis of exact coincidences

In [None]:
# Read the sample metadata table and keep only the rows whose tissue is one of
# the three tissues analysed in this notebook.
tissues_of_interest = ['TST_D7', 'TST_D2', 'Blood']
meta_all = pd.read_csv('data/TCRseq_metadata.csv')
meta = meta_all.loc[meta_all['tissue'].isin(tissues_of_interest)]

In [None]:
# Load the combined, subsampled sequence table of each chain, keyed by chain name.
dfc_chains = {
    'beta': pd.read_csv('data/combined_subsampled_beta.csv.gz'),
    'alpha': pd.read_csv('data/combined_subsampled_alpha.csv.gz'),
}

In [None]:
chains = sorted(meta['chain'].unique())

# For every chain, build in a single pass:
#   meta_chains[chain]         -> metadata rows belonging to that chain
#   pc_by_sample_chains[chain] -> those rows joined with prs.pc evaluated on the
#                                 'bioidentity' values of each sample, plus its log10
meta_chains = {}
pc_by_sample_chains = {}
for chain in chains:
    meta_chain = meta.loc[meta['chain'] == chain]
    meta_chains[chain] = meta_chain

    sample_pc = (
        dfc_chains[chain]
        .groupby('sample')['bioidentity']
        .apply(prs.pc)
        .rename('pc')
    )
    pc_by_sample = pd.merge(meta_chain, sample_pc, on='sample')
    # small offset keeps log10 finite when pc is exactly zero
    pc_by_sample['logpc'] = np.log10(pc_by_sample['pc'] + 1e-7)
    pc_by_sample_chains[chain] = pc_by_sample

In [None]:
# Write the per-sample coincidence tables to disk, one CSV per chain.
within_donor_outputs = {
    "alpha": 'data/pc_withindonor_alpha_down-sampled.csv',
    "beta": 'data/pc_withindonor_beta_down-sampled.csv',
}
for chain_name, out_path in within_donor_outputs.items():
    pc_by_sample_chains[chain_name].to_csv(out_path)

# Cross-sample coincidences

In [None]:
# Keep a single row per ('sequence', 'sample') pair in every chain table, so each
# nucleotide sequence appears at most once per sample.
for chain in chains:
    chain_table = dfc_chains[chain]
    dfc_chains[chain] = chain_table.drop_duplicates(subset=['sequence', 'sample'])

In [None]:
# Preview the first rows of the beta-chain table.
df = dfc_chains['beta']
df.head(n=5)

In [None]:
# Progress bar with one step per (chain, tissue) pair. The number of tissues is
# read from the beta-chain table, i.e. it assumes every chain iterates over the
# same number of tissues below.
n_tissues = len(dfc_chains['beta']['tissue'].unique())
f = IntProgress(min=0, max=len(chains)*n_tissues)
display(f)

pc_cross_chain = {}
for chain in chains:
    # only keep rows whose sample is listed in the metadata
    chain_df = dfc_chains[chain]
    chain_df = chain_df[chain_df['sample'].isin(meta['sample'])]

    per_tissue = []
    for tissue in chain_df['tissue'].unique():
        tissue_df = chain_df[chain_df['tissue'] == tissue]
        pc_bio = prs.pcDelta_grouped_cross(tissue_df, 'sample', 'bioidentity',
                                           bins=0, condensed=True)
        pc_nt = prs.pcDelta_grouped_cross(tissue_df, 'sample', 'sequence',
                                          bins=0, condensed=True)
        # coincidences on 'bioidentity' minus coincidences on 'sequence'
        tissue_pc = (pc_bio - pc_nt).reset_index()
        tissue_pc['tissue'] = tissue
        per_tissue.append(tissue_pc)
        f.value += 1

    pc_cross = pd.concat(per_tissue)
    # small offset keeps log10 finite when the difference is exactly zero
    pc_cross['logpc'] = np.log10(1e-8+pc_cross[0])
    pc_cross_chain[chain] = pc_cross

In [None]:
# Write the cross-sample coincidence tables to disk, one CSV per chain.
cross_donor_outputs = {
    "alpha": 'data/pc_crossdonor_alpha_down-sampled.csv',
    "beta": 'data/pc_crossdonor_beta_down-sampled.csv',
}
for chain_name, out_path in cross_donor_outputs.items():
    pc_cross_chain[chain_name].to_csv(out_path)

# HLA dependence

In [None]:
chain = 'beta' # choose here from ('alpha','beta')
mhc_class = 'II' # choose here from ('I','II','both')

# load hla data; its index is matched against the 'UIN' groups computed below
hlas = pd.read_csv('data/hladata.csv', index_col=0)
# filter MHC class: columns whose name starts with 'D' are kept for class II,
# all other columns for class I
if mhc_class == 'both':
    pass
elif mhc_class == 'I':
    hlas = hlas[hlas.columns[~hlas.columns.str.startswith('D')]]
elif mhc_class == 'II':
    hlas = hlas[hlas.columns[hlas.columns.str.startswith('D')]]
else:
    raise NotImplementedError("mhc_class needs to be in ['both', 'I', 'II']")

# Per tissue: condensed (one value per pair) vectors of the HLA comparison and of
# the cross-'UIN' coincidence probability, both in the same pair order.
hladists = {}
pcs = {}

for tissue in ['TST_D7', 'TST_D2', 'Blood']:
    df = dfc_chains[chain]
    df = df[df['tissue'] == tissue]
    # .assign returns a new frame, so the extra column is never written into a
    # filtered slice of dfc_chains[chain] (avoids SettingWithCopyWarning)
    df = df.assign(ntidentity=df['bioidentity'] + df['sequence'])
    # coincidences on 'bioidentity' minus those that also share 'sequence'
    pc = prs.pcDelta_grouped_cross(df, 'UIN', 'bioidentity', bins=0, condensed=False)
    pc_nt = prs.pcDelta_grouped_cross(df, 'UIN', 'ntidentity', bins=0, condensed=False)
    pc = pc - pc_nt

    sample_ids = pc.index
    # sorted() fixes the order of the shared identifiers; iterating a bare set
    # gives an arbitrary order, which made the pair order of the saved vectors
    # irreproducible between runs
    joined_ids = sorted(set(hlas.index).intersection(sample_ids))
    # identifiers with coincidence data but no HLA entry, then the three counts
    print(set(sample_ids)-set(hlas.index))
    print(len(joined_ids), len(hlas.index), len(sample_ids))
    pc_joined = pc.loc[joined_ids][joined_ids]
    hlas_joined = hlas.loc[joined_ids]

    metric = prs.overlap
    hladist_condensed = prs.pdist(hlas_joined.to_numpy(),
                                  metric=metric, dtype=np.float64)
    # labelled square form of the pairwise HLA comparison
    dfhladist = pd.DataFrame(prs.squareform(hladist_condensed),
                             columns=hlas_joined.index, index=hlas_joined.index)

    hla_flat = prs.squareform(dfhladist)
    # Zero the diagonal on an explicit writable copy: `DataFrame.values` is not
    # guaranteed to be a writable view (it is read-only under pandas
    # Copy-on-Write), so filling it in place is not reliable.
    pc_matrix = pc_joined.to_numpy(copy=True)
    np.fill_diagonal(pc_matrix, 0)
    pc_flat = prs.squareform(pc_matrix)
    hladists[tissue] = hla_flat
    pcs[tissue] = pc_flat

    pd.DataFrame(hladists[tissue]).to_csv(
        f'data/hladist_{tissue}_{chain}_down-sampled_mhc{mhc_class}.csv')
    pd.DataFrame(pcs[tissue]).to_csv(
        f'data/tcrsharingprob_{tissue}_{chain}_down-sampled_mhc{mhc_class}.csv')