In [None]:
import os
import time
import pandas as pd
from os.path import join, dirname, abspath
import contextlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl


import igraph
import seaborn as sns
import scipy.stats

import pyrepseq as prs
import pyrepseq.plotting as pp
from metaclonotypist import *

# Apply the 'seaborn-v0_8-paper' matplotlib style sheet to every figure created below.
plt.style.use('seaborn-v0_8-paper')

# Configuration: set the MHC class and the path to the GLIPH2 installation here

In [None]:
# MHC class to analyse: one of 'I', 'II' or 'both' (validated when the HLA table is filtered).
mhc_class = 'II'
# Directory in which the GLIPH2 binary is run and where its input/output files live.
# The GLIPH2_PATH environment variable overrides the default, so the notebook can be
# run on another machine without editing this cell.
GLIPH2_PATH = os.environ.get('GLIPH2_PATH', '/home/innate2adaptive/gliph2/')

# Load data and run GLIPH2 analysis

In [None]:
# Read the subsampled TCR repertoire for the selected chain.
chain = 'beta'
chain_letter = chain[0].upper()

df = (
    pd.read_csv(f'data/combined_subsampled_5000_10000_{chain}.csv.gz')
      # discard rows lacking a V gene or a CDR3 entry
      .dropna(subset=[f'TR{chain_letter}V', f'CDR3{chain_letter}'])
      # keep only rows whose CDR3 entry has length greater than 5
      .loc[lambda clones: clones[f'CDR3{chain_letter}'].map(len) > 5]
)
df.head()

In [None]:
# load HLA data
# The first CSV column becomes the index; later cells match this index against df['UIN'].
hla = pd.read_csv('data/hladata.csv', index_col=0)
# NOTE(review): flatten_hlas is not defined in this notebook — presumably provided by the
# `metaclonotypist` star import; confirm. Its result is used below as a table with one row
# per sample and one column per HLA allele.
hlas = flatten_hlas(hla)
hlas.head()

In [None]:
# Name of the statistical test, passed as `method` to hla_association.
testmethod = 'fisher'
# Minimum `clonal_count` a clone needs in order to be kept.
mincount = 2
# Minimum donor count: applied both to the column sums of the HLA table and to the
# `number_subject` column of the GLIPH2 cluster table.
min_donors = 4

In [None]:
# filter clones < mincount
df = df[df['clonal_count'] >= mincount]
# only keep samples found in both datasets; the printed set lists samples in the
# TCR data that have no row in the HLA table
print(set(df['UIN']) - set(hlas.index))
df = df[df['UIN'].isin(hlas.index)]
# Select HLA rows in order of first appearance in df. Going through a Python set would
# make the row order depend on set iteration order, which for string IDs changes
# between kernel sessions (hash randomisation).
shared_uins = df['UIN'].unique()
hlas = hlas.loc[shared_uins]
len(shared_uins), len(df)

In [None]:
# Keep only HLA columns whose column sum reaches min_donors.
hlas = hlas.loc[:, hlas.sum(axis=0) >= min_donors]
# Restrict to the requested MHC class: columns whose name starts with 'D' are treated
# as class II, all others as class I.
if mhc_class not in ('both', 'I', 'II'):
    raise NotImplementedError("mhc_class needs to be in ['both', 'I', 'II']")
if mhc_class != 'both':
    starts_with_d = hlas.columns.str.startswith('D')
    hlas = hlas.loc[:, starts_with_d if mhc_class == 'II' else ~starts_with_d]
len(hlas.columns)

In [None]:
# reformat hla data frame to match Gliph requirements:
# the sample identifier (the index) becomes the first column, named '#subject'
hla['#subject'] = hla.index
hla = hla[['#subject'] + [col for col in hla.columns if col != '#subject']]

# GLIPH2 is executed from inside GLIPH2_PATH and its parameter file refers to
# 'hla_file.txt' by bare name, so the file has to be written into that directory
# rather than into the notebook's working directory.
hla.to_csv(join(GLIPH2_PATH, 'hla_file.txt'), index=False, header=True, sep='\t')

In [None]:
from path import Path

def GLIPH2(data,
           global_convergence_cutoff=1,
           all_aa_interchangeable=1,
           local_min_pvalue=0.001,
           outfile=None):
    """Cluster beta-chain TCR sequences with the external GLIPH2 binary.

    All work happens inside ``GLIPH2_PATH`` (the working directory is switched
    for the duration of the call): the input table is written to
    ``metarepertoire.txt``, a parameter file ``parameters_custom`` is generated,
    ``./irtools.centos -c parameters_custom`` is executed, and the resulting
    ``gliph2_beta_mhc{mhc_class}_cluster.txt`` is parsed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'CDR3B', 'TRBV', 'TRBJ', 'UIN' and
        'clonal_count'. The frame is not modified.
    global_convergence_cutoff : int
        Written verbatim to the parameter file.
    all_aa_interchangeable : int or bool
        Written to the parameter file as an integer flag (0/1).
    local_min_pvalue : float
        Written verbatim to the parameter file.
    outfile : str, optional
        If given, the parsed cluster table is saved there as a tab-separated
        file. A relative path is resolved against ``GLIPH2_PATH``.

    Returns
    -------
    exclusive : pandas.DataFrame
        ``clusters`` with each 'junction_aa' kept only once (first occurrence).
    t : float
        Wall-clock seconds spent in the GLIPH2 binary.
    clusters : pandas.DataFrame
        One row per (sequence, cluster) membership with the columns
        'junction_aa', 'cluster' and 'motif'. Only clusters with at least two
        members are included; the frame is empty if there are none.
    """
    cluster_columns = ['junction_aa', 'cluster', 'motif']

    # `Path` (from the `path` package) changes the working directory for the duration
    # of the block, so every bare file name below lives in GLIPH2_PATH.
    with Path(GLIPH2_PATH):

        data = data.copy()
        # no alpha chain is supplied; GLIPH2 gets the placeholder 'NA' in that column
        data['CDR3a'] = 'NA'
        data = data[['CDR3B', 'TRBV',
                     'TRBJ', 'CDR3a',
                     'UIN', 'clonal_count']]

        data.to_csv('metarepertoire.txt', index=False, header=False, sep='\t')

        print('Clustering {} sequences with GLIPH2.'.format(len(data)))

        # GLIPH2 parameter files use 0/1 flags; formatting a Python bool directly
        # would write 'True'/'False' into the file.
        all_aa_flag = int(all_aa_interchangeable)

        parameters = \
    f"""# Ignored line
    out_prefix=gliph2_beta_mhc{mhc_class}
    cdr3_file=metarepertoire.txt
    hla_file=hla_file.txt
    refer_file=ref_CD48_v2.0.fa
    v_usage_freq_file=ref_V_CD48_v2.0.txt
    cdr3_length_freq_file=ref_L_CD48_v2.0.txt
    local_min_pvalue={local_min_pvalue}
    p_depth = 1000
    global_convergence_cutoff = {global_convergence_cutoff}
    simulation_depth=1000
    kmer_min_depth=3
    local_min_OVE=10
    algorithm=GLIPH2
    all_aa_interchangeable={all_aa_flag}
    """
        with open('parameters_custom', 'w') as f:
            f.write(parameters)

        # Perform gliph2 algorithm on test sequences
        t0 = time.time()
        status = os.system('./irtools.centos -c parameters_custom')
        t1 = time.time()
        t = t1 - t0

        print('Elapsed time: {} seconds.'.format(t))
        if status != 0:
            # A failed run may leave an older cluster file in place; make that visible
            # instead of silently parsing stale results.
            print('Warning: irtools exited with status {}; '
                  'the cluster file may be missing or stale.'.format(status))

        # Reformat gliph2 clustering results
        cluster_frames = []
        with open(f'gliph2_beta_mhc{mhc_class}_cluster.txt', 'r') as f:
            results = f.read().splitlines()
        c = 0
        for line in results:
            columns = line.split(' ')
            if len(columns) < 4:
                # blank or truncated line: there is no motif field to read
                continue
            # space-separated layout: field 3 is the motif, fields 4+ the member sequences
            motif = columns[3]
            cluster = columns[4:]
            if len(cluster) >= 2:
                nodes = pd.DataFrame({'junction_aa': cluster})
                nodes['cluster'] = c
                nodes['motif'] = motif
                cluster_frames.append(nodes)
                c += 1

        # pd.concat raises on an empty list, so build an empty table explicitly
        if cluster_frames:
            clusters = pd.concat(cluster_frames)
        else:
            clusters = pd.DataFrame(columns=cluster_columns)

        if outfile:
            print('Saving output to: \n --> {}'.format(outfile))
            clusters.to_csv(outfile, sep='\t', index=False)

    exclusive = clusters.drop_duplicates('junction_aa', keep='first')
    return exclusive, t, clusters

In [None]:
# Run GLIPH2 on the filtered repertoire. The flag is passed as 0 rather than False
# because its value is formatted into GLIPH2's parameter file, which uses 0/1 flags.
exclusive, t, clusters = GLIPH2(df, all_aa_interchangeable=0)

# Perform HLA association analysis

In [None]:
# Load the cluster CSV stored under GLIPH2_PATH (same out_prefix as the GLIPH2 run)
# and keep only rows whose 'number_subject' is at least min_donors.
clusters = (
    pd.read_csv(f'{GLIPH2_PATH}/gliph2_beta_mhc{mhc_class}_cluster.csv')
      .loc[lambda table: table['number_subject'] >= min_donors]
)
# number of distinct cluster ids that remain
len(pd.unique(clusters['index']))

In [None]:
# Rename GLIPH2's 'index' and 'Sample' columns to 'cluster' and 'Sample.ID'
# (presumably the names hla_association expects — confirm).
clusters_rn = clusters.rename(columns={'index': 'cluster', 'Sample': 'Sample.ID'})

In [None]:
# NOTE(review): hla_association is not defined in this notebook — presumably provided by
# the `metaclonotypist` star import; confirm. The cells below use its result as a table
# with 'cluster', 'odds_ratio', 'pvalue' and 'significant' columns.
cluster_association = hla_association(clusters_rn, hlas,
                                      method=testmethod)

In [None]:
# Save the complete association table (all rows, significant or not) under GLIPH2_PATH.
cluster_association.to_csv(f'{GLIPH2_PATH}/gliph2_beta_mhc{mhc_class}_clusterassociation.csv')

In [None]:
# Distinct clusters that appear in at least one row flagged as significant.
nmetaclones = len(
    cluster_association.loc[cluster_association['significant'], 'cluster'].unique()
)
# (number of significant rows, number of distinct clusters among them)
cluster_association['significant'].sum(), nmetaclones

In [None]:
# Copy of the association table with every +inf replaced by the finite stand-in 400
# (presumably so infinite odds ratios can be drawn on the log axis below — confirm).
cluster_association_noinf = cluster_association.replace(to_replace=np.inf, value=400)

In [None]:
# Volcano-style plot: odds ratio (log scale) against -log10(p value),
# coloured by whether the association is flagged significant.
fig, ax = plt.subplots(figsize=(2.5, 2.5))
sns.scatterplot(ax=ax, data=cluster_association_noinf,
                x='odds_ratio',
                y=-np.log10(cluster_association_noinf['pvalue']),
                hue='significant',
                s=5)
ax.set_xscale('log')
# the plotted quantity is -log10(p), not p itself, so label it accordingly
ax.set_ylabel(r'$-\log_{10}$(p value)')
ax.set_xlabel('odds ratio')
fig.tight_layout()

In [None]:
# Keep only the rows flagged significant and save them under GLIPH2_PATH.
hla_metaclones = cluster_association[cluster_association['significant']]
hla_metaclones.to_csv(f'{GLIPH2_PATH}/gliph2_beta_mhc{mhc_class}_output.csv')

# Retain only most significant HLA association per cluster

In [None]:
# Order rows by ascending p value, then keep the first (smallest-p) row of each
# cluster and renumber the index from zero.
hla_metaclones_unique = (
    hla_metaclones
    .sort_values(by='pvalue')
    .drop_duplicates(subset=['cluster'])
    .reset_index(drop=True)
)

# Add clustered CDR3 amino acid sequences

In [None]:
# For every cluster id, join the 'TcRb' entries of its rows in clusters_rn with '|'.
hla_metaclones_unique['CDR3s'] = hla_metaclones_unique['cluster'].map(
    lambda cluster_id: '|'.join(
        clusters_rn.loc[clusters_rn['cluster'] == cluster_id, 'TcRb']
    )
)

In [None]:
# Attach each cluster's 'pattern' from clusters_rn. The left join yields one row per
# member row of the cluster, so identical rows are collapsed afterwards.
hla_metaclones_unique = (
    hla_metaclones_unique
    .merge(clusters_rn.loc[:, ['cluster', 'pattern']], how='left', on='cluster')
    .drop_duplicates()
)

In [None]:
# Save the per-cluster table (most significant HLA association, joined member CDR3s,
# pattern) under GLIPH2_PATH.
hla_metaclones_unique.to_csv(f'{GLIPH2_PATH}/gliph2_beta_mhc{mhc_class}_output2.csv')