# Get unique HLAs alleles

This notebook looks for all HLA alleles in the dataset and returns a list of unique alleles.
From the list of unique alleles, we generate fasta files with the sequences of the alleles to be used for AlphaFold2

Current datasets included:
- data/raw/pHLA_binding/NetMHCpan_train/allelelist                  # NetMHCpan alleles in the training set
- data/raw/pHLA_binding/CD8_benchmark_filtered/* (using filenames)  # NetMHCpan validation
- data/raw/pHLA_binding/train_2023_MixMHCpred2.2.txt                # MixMHCpred2.2 training set
- data/raw/pHLA_binding/val_2023_MixMHCpred2.2/*                    # MixMHCpred2.2 validation set
- data/raw/pHLA_stability/Stability.data                            # Stability training dataset

Datasets to be included:
- data/raw/TCell_recognition/train_2020_PRIME1.0.csv                # PRIME1.0 training dataset
- data/raw/TCell_recognition/train_2023_PRIME2.0.csv                # PRIME2.0 training dataset
- data/raw/neoantigen_immunogenicity/HLA_allotypes.txt              # Muller et al. alleles for training and validation

In [1]:
import os
import re
import glob

w_dir = '..'

all_alleles_file = os.path.join(w_dir, 'data/processed/all_hla_seq_binding_groove.fasta')

netmhcpan_train_alleles_file = os.path.join(w_dir, 'data/raw/pHLA_binding/NetMHCpan_train/allelelist')
netmhcpan_val_alleles_files = glob.glob(os.path.join(w_dir, 'data/raw/pHLA_binding/CD8_benchmark_filtered/*'))
mixmhcpred_train_alleles_file = os.path.join(w_dir, 'data/raw/pHLA_binding/train_2023_MixMHCpred2.2.txt')
mixmhcpred_val_alleles_file = os.path.join(w_dir, 'data/raw/pHLA_binding/val_2023_MixMHCpred2.2_alleles_by_file.tsv')
stability_alleles_file = os.path.join(w_dir, 'data/raw/pHLA_stability/Stability.data')

In [2]:
def standardize_hla_class_1_name(hla_name: str) -> str:
    """
    Standardize HLA names to the format HLA*<gene>*<serogroup>:<Missense variants>
    If the name is not a valid HLA class 1 name, it returns None
    Examples covered:
        'HLA-A01:01' -> 'HLA-A*01:01'
        'A*01:01:01:01' -> 'HLA-A*01:01'
    :param hla_name: str
        HLA name to be standardized
    :return: str
        Standardized HLA name
    """
    hla_no_gene_separator_regex = re.compile(r'HLA-[ABC]\d{2}:\d{2}')  # HLA-A01:01
    hla_no_hla_regex = re.compile(r'[ABC]\*\d{2}:\d{2}')  # A*01:01:01:01
    hla_no_hla_no_sep_regex = re.compile(r'[ABC]\d{2}\d{2}')  # A0101
    hla_class2_regex = re.compile(r'[DPQR]{2}.+\*\d{2,4}:\d{2}')  # DPA1*01:03:01:01
    hla_nk_regex = re.compile(r'HLA-[EG]\d{2}:\d{2}')  # HLA-E01:01
    hla_nk_no_hla_no_sep_regex = re.compile(r'[EG]\d{2}\d{2}')  # E0101
    expected_hla_regex = re.compile(r'HLA-[ABC]\*\d{2}:\d{2}')  # HLA-A*01:01
    
    if expected_hla_regex.match(hla_name):
        return hla_name[0:11]
    elif hla_no_gene_separator_regex.match(hla_name):
        hla_with_separator = hla_name[0:5] + '*' + hla_name[5:]
        return hla_with_separator
    elif hla_no_hla_regex.match(hla_name):
        gene = hla_name[0]
        serogroup = hla_name[2:4]
        missense_variants = hla_name[5:7]
        return f'HLA-{gene}*{serogroup}:{missense_variants}'
    elif hla_no_hla_no_sep_regex.match(hla_name):
        gene = hla_name[0]
        serogroup = hla_name[1:3]
        missense_variants = hla_name[3:5]
        return f'HLA-{gene}*{serogroup}:{missense_variants}'
    elif hla_class2_regex.match(hla_name):
        return None
    elif hla_nk_regex.match(hla_name) or hla_nk_no_hla_no_sep_regex.match(hla_name):
        return None
    else:
        raise NotImplementedError(f'HLA name {hla_name} does not follow the expected pattern')
    
    
def read_netmhcpan_allelelist(file_path: str, filter_by_hla: bool = True) -> set:
    """
    Reads a file with a list of alleles and returns a set of unique alleles.
    File format:
        A19-A19	BoLA-6:01402,BoLA-2:01601
        A20-A20	BoLA-3:02701,BoLA-2:02601
        Apher1	HLA-A03:01,HLA-A29:02,HLA-B44:02,HLA-B44:03,HLA-C12:03,HLA-C16:01
        Apher6	HLA-A02:01,HLA-A03:01,HLA-B07:02,HLA-C07:02
    :param file_path: str
        String with the path to the file
    :param filter_by_hla: bool
        If True, only HLA alleles are returned 
    :return:
        Set with allele names
    """
    alleles = set()
    with open(file_path, 'r') as f:
        for l in f.readlines():
            alleles_list = l.split()[1].split(',')
            for a in alleles_list:
                if filter_by_hla and a.startswith('HLA'):
                    alleles.add(standardize_hla_class_1_name(a))
                elif not filter_by_hla:
                    alleles.add(standardize_hla_class_1_name(a))
    return alleles

def get_alleles_from_netmhcpan_val_files(files: list) -> set:
    """
    Get the alleles from the filenames of the NetMHCpan validation files
    Examples of filenames:
        'CD8_benchmark_filtered/CEDVPSGKL_HLA-B40:01'
        'CD8_benchmark_filtered/CEDVPSGKL_HLA-B40:01_1' # Skipped
    :param files: list
        List of filepaths
    :return: 
        Set with allele names
    """
    alleles = set()
    for f in files:
        allele_str = f.split('_')[-1]
        if len(allele_str) > 1:
            alleles.add(standardize_hla_class_1_name(allele_str))
    return alleles

def read_train_mixmhcpred_alleles(file_path: str) -> set:
    """
    Reads a file with a list of alleles and returns a set of unique alleles.
    File format:
        Peptide	Allele
        AAAHTHRY	A0101
        ADMGHLKY	A0101
    :param file_path: str
        String with the path to the file
    :return:
        Set with allele names
    """
    alleles = set()
    with open(file_path, 'r') as f:
        for l in f.readlines()[1:]: # Skip header
            alleles.add(standardize_hla_class_1_name(l.split()[1]))
    return alleles

def read_val_mixmhcpred_alleles(file_path: str) -> set:
    """
    Reads a file with a list of alleles and returns a set of unique alleles.
    File format:
         Filename	HLAs
         3779-AMM_I.csv	A0201,B3503,B3508,C0401
         3795-BMT_I.csv	A0201,A2601,B0702,B3901,C0702,C1203
    :param file_path: str
        String with the path to the file
    :return:    
        Set with allele names
    """
    alleles = set()
    with open(file_path, 'r') as f:
        for l in f.readlines()[1:]: # Skip header
            alleles_list = l.split()[1].split(',')
            for a in alleles_list:
                alleles.add(standardize_hla_class_1_name(a))
    return alleles

def read_stability_alleles(file_path: str) -> set:
    alleles = set()
    with open(file_path, 'r') as f:
        for l in f.readlines()[1:]: # Skip header
            alleles.add(standardize_hla_class_1_name(l.split()[0]))
    return alleles
    
def read_hla_fasta(file_path: str) -> dict:
    """
    Reads a fasta file with HLA alleles and returns a dictionary with the allele name as key and the sequence as value.
    
    Sample fasta:
        >HLA:HLA00001 A*01:01:01:01 365 bp
        GSHSMRYFFTSVSRPGRGEPRFIAVGYV...AEQRRVYLEGRCVDGLRRYLENGKETLQR
        >HLA:HLA00002 A*01:01:01:02 365 bp
        ...
        
    :param file_path: str
        String with the path to the file
    :return:
        Dictionary with allele names and sequences
    """
    alleles = {}
    with open(file_path, 'r') as f:
        allele = ''
        seq = ''
        for l in f.readlines():
            if l.startswith('>'):
                allele = standardize_hla_class_1_name(l.split()[1])
                if allele is not None and allele in alleles:
                    allele = None # Skip allele if it is already in the dictionary
                    #raise ValueError(f'Allele {allele} already in dictionary')    
                seq = ''
            else:
                if allele is None:
                    allele = ''
                    continue
                seq += l.strip()
                alleles[allele] = seq
                allele = ''  # Reset allele since all sequences are in the same line
                
    return alleles

In [3]:
all_seqs_dict = read_hla_fasta(all_alleles_file)

In [4]:
stability_alleles = read_stability_alleles(stability_alleles_file)

In [5]:
netmhcpan_train_alleles = read_netmhcpan_allelelist(netmhcpan_train_alleles_file)
netmhcpan_val_alleles = get_alleles_from_netmhcpan_val_files(netmhcpan_val_alleles_files)

In [6]:
mixmhcpred_train_alleles = read_train_mixmhcpred_alleles(mixmhcpred_train_alleles_file)
mixmhcpred_val_alleles = read_val_mixmhcpred_alleles(mixmhcpred_val_alleles_file)

## Make fasta files

In [7]:
done_alleles_in_datasets = set(list(netmhcpan_train_alleles) + list(stability_alleles))

all_alleles_in_datasets = set(list(netmhcpan_train_alleles) + list(stability_alleles) + list(netmhcpan_val_alleles) + list(mixmhcpred_train_alleles) + list(mixmhcpred_val_alleles))
len(all_alleles_in_datasets)

187

In [8]:
fasta_files_output_dir = os.path.join(w_dir, 'data/processed/hla_seq_binding_groove')
os.makedirs(fasta_files_output_dir, exist_ok=True)
for allele in all_alleles_in_datasets:
    if allele in all_seqs_dict:
        with open(os.path.join(fasta_files_output_dir, f'{allele}.fasta'), 'w') as f:
            f.write(f'>{allele}\n{all_seqs_dict[allele]}\n')
    else:
        print(f'{allele} not found')

None not found


In [9]:
all_alleles_in_datasets - done_alleles_in_datasets

{'HLA-A*34:01',
 'HLA-A*34:02',
 'HLA-A*36:01',
 'HLA-B*07:04',
 'HLA-B*18:05',
 'HLA-B*35:07',
 'HLA-B*38:02',
 'HLA-B*40:06',
 'HLA-B*40:32',
 'HLA-C*03:02',
 'HLA-C*04:03',
 'HLA-C*14:03',
 'HLA-C*16:02'}