# Baseline with 6 most probable alleles in German population

In [9]:
#standard library
import ast
import logging

#for analysis
import numpy as np
import pandas as pd

pd.options.mode.chained_assignment = None

#for logging
logging.basicConfig(level=logging.INFO)
logging.getLogger('matplotlib.font_manager').disabled = True

### Parameters for NetMHCpan Run

In [10]:
#use cleaned db: adjusted to our use case (778 donors out of all)
PATH_TO_TUEDB = '../../DATA/db_dump_311023_cleaned.tsv'
PATH_TO_ALL_TUEDB_ALLELES = '../../DATA/all_tuedb_alleles.txt'
PATH_TO_NETMHCPAN_INPUT_FOLDER = '../../DATA/epitopeprediction-input'
PATH_TO_SAMPLESHEET = f'{PATH_TO_NETMHCPAN_INPUT_FOLDER}/samplesheet.csv'
PATH_TO_ALLELE_FREQUENCY = '../../DATA/dkms_allele_freq_german.tsv'

### Load TueDB Database for Patients (peptide list and HLA typing)
This is what the TueDB dataframe looks like.

In [11]:
tuedb_df = pd.read_csv(PATH_TO_TUEDB, sep='\t', index_col=0)
tuedb_df = tuedb_df.sort_values(by=['donor_code'])
tuedb_df.head()

Unnamed: 0,peptide_sequence,donor_code,all_hla_alleles_donor,mhc_class,peptide_modifications,no_alleles
6646695,TTDLFGRDLSY,04-001,"['A0101', 'A2402', 'B0801', 'B1402', 'C0701', ...",I,,6
6646540,AYLEAHETF,04-001,"['A0101', 'A2402', 'B0801', 'B1402', 'C0701', ...",I,,6
6646539,NRFQIATV,04-001,"['A0101', 'A2402', 'B0801', 'B1402', 'C0701', ...",I,,6
6646538,TAASRLVTL,04-001,"['A0101', 'A2402', 'B0801', 'B1402', 'C0701', ...",I,,6
6646537,FTDVRAAVY,04-001,"['A0101', 'A2402', 'B0801', 'B1402', 'C0701', ...",I,,6


### Fetch Allele frequency

Germany DKMS - German donors with n = 3,456,066
The German dataset is by far the largest allele frequency dataset, which is why I chose it.

Download the allele frequencies for the DKMS dataset on allelefrequencies.net
Select Settings: Locus: All loci, Population: Germany DKMS - German donors (n=3,456,066)
Search with Button: Use printer-friendly version.

Or use the direct download link: http://allelefrequencies.net/hla6006a_scr.asp?hla_locus=&hla_locus_type=Classical&hla_allele1=&hla_allele2=&hla_selection=&hla_pop_selection=&hla_population=3767&hla_country=&hla_dataset=&hla_region=&hla_ethnic=&hla_study=&hla_sample_size=&hla_sample_size_pattern=equal&hla_sample_year=&hla_sample_year_pattern=equal&hla_level=&hla_level_pattern=equal&hla_show=&hla_order=order_1&standard=a


Cite: Gonzalez-Galarza FF, McCabe A, Santos EJMD, Jones J, Takeshita L, Ortega-Rivera ND, Cid-Pavon GMD, Ramsbottom K, Ghattaoraya G, Alfirevic A, Middleton D, Jones AR. Allele frequency net database (AFND) 2020 update: gold-standard data classification, open access genotype data and new query tools. Nucleic Acids Res. 2020 Jan 8;48(D1):D783-D788. doi: 10.1093/nar/gkz1029. PMID: 31722398; PMCID: PMC7145554.

In [12]:
#read out the allele frequencies
allele_freq_df = pd.read_csv(PATH_TO_ALLELE_FREQUENCY, sep='\t')
allele_freq_dict = dict(zip(allele_freq_df['Allele'], allele_freq_df['Allele Frequency']))

#load txt file with all alleles from tuedb
all_tuedb_alleles = np.loadtxt(PATH_TO_ALL_TUEDB_ALLELES, dtype=str)

# Update the allele_freq_dict with missing alleles and set frequency to 0
for allele in all_tuedb_alleles:
    allele_freq_dict.setdefault(allele, 0)

sum_allele_freq_dict = sum(allele_freq_dict.values())

### Get most probable German Alleles

In [13]:
allele_freq_dict_a = {}
allele_freq_dict_b = {}
allele_freq_dict_c = {}

for key, value in allele_freq_dict.items():
    if key.startswith('A'):
        allele_freq_dict_a[key] = value
    elif key.startswith('B'):
        allele_freq_dict_b[key] = value
    elif key.startswith('C'):
        allele_freq_dict_c[key] = value

# Get top 2 items for each dictionary
top_two_a = sorted(allele_freq_dict_a.items(), key=lambda x: x[1], reverse=True)[:2]
top_two_b = sorted(allele_freq_dict_b.items(), key=lambda x: x[1], reverse=True)[:2]
top_two_c = sorted(allele_freq_dict_c.items(), key=lambda x: x[1], reverse=True)[:2]

keys_a = [item[0] for item in top_two_a]
keys_b = [item[0] for item in top_two_b]
keys_c = [item[0] for item in top_two_c]

# Combining all keys into one list
most_probable_german_alleles = keys_a + keys_b + keys_c
print(f'The most probable alleles in Germany are: {most_probable_german_alleles}')

The most probable alleles in Germany are: ['A*02:01', 'A*01:01', 'B*07:02', 'B*08:01', 'C*07:01', 'C*07:02']


In [14]:
def bruteforce_most_probable_alleles(tp_alleles: list, most_probable_german_alleles: list) -> float:
    """
    Score a fixed "prediction" (the given most probable alleles) against a donor's typing.

    Args:
        tp_alleles (list): List of experimentally typed alleles for the donor.
        most_probable_german_alleles (list): List of alleles used as the prediction.

    Returns:
        float: Fraction (between 0.0 and 1.0) of the entries in ``tp_alleles`` that are
            also contained in ``most_probable_german_alleles``. Returns 0.0 when
            ``tp_alleles`` is empty, because nothing can be matched in that case.
    """
    # Guard against a donor without any typed allele; dividing by len() would fail.
    if len(tp_alleles) == 0:
        logging.debug('No typed alleles given, prediction match is 0 %')
        return 0.0

    # A set makes each membership test independent of the prediction size.
    prediction = set(most_probable_german_alleles)
    match = sum(1 for allele in tp_alleles if allele in prediction)
    match_of_prediction = match / len(tp_alleles)
    logging.debug(f'Prediction matching with {round(match_of_prediction*100,2)} %')

    return match_of_prediction

### Main
#### Settings

In [15]:
# When True, every allele name is cut to its first four characters (e.g. 'A*02:01' -> 'A*02')
# before the prediction and the donor typing are compared.
ONLY_2_DIGIT = False

In [16]:
tuedb_samplesheet = pd.read_csv(PATH_TO_SAMPLESHEET, sep=',')
all_tuedb_alleles_list = np.loadtxt(PATH_TO_ALL_TUEDB_ALLELES, dtype=str)

# Match fractions per donor, grouped by how many alleles the donor has
count_match_6 = []
count_match_56 = []
count_match_456 = []

# Keep the prediction in its own variable instead of overwriting
# most_probable_german_alleles: the cell then gives the same result on every re-run,
# also after ONLY_2_DIGIT has been switched back to False.
if ONLY_2_DIGIT:
    predicted_alleles = list(set(allele[:4] for allele in most_probable_german_alleles))
else:
    predicted_alleles = list(most_probable_german_alleles)

# One typing string per donor (first row of each donor), built once instead of
# filtering the whole dataframe again for every sample.
donor_typing = (
    tuedb_df
    .drop_duplicates(subset='donor_code')
    .set_index('donor_code')['all_hla_alleles_donor']
)

for donor_code in tuedb_samplesheet['sample']:
    # The typing is stored as the text of a Python list; literal_eval parses it
    # without executing arbitrary code from the data file (unlike eval).
    tp_alleles = ast.literal_eval(donor_typing.loc[donor_code])
    # 'A0101' -> 'A*01:01'
    tp_alleles = [f'{a[0]}*{a[1:3]}:{a[3:]}' for a in tp_alleles]

    logging.debug(f'Donor: {donor_code}')

    if ONLY_2_DIGIT:
        tp_alleles = list(set(allele[:4] for allele in tp_alleles))

    match_of_prediction = bruteforce_most_probable_alleles(tp_alleles, predicted_alleles)

    # Donors with fewer than 4 or more than 6 alleles are not counted in any group
    n_alleles = len(tp_alleles)
    if n_alleles == 6:
        count_match_6.append(match_of_prediction)
    if n_alleles in (5, 6):
        count_match_56.append(match_of_prediction)
    if n_alleles in (4, 5, 6):
        count_match_456.append(match_of_prediction)

#print(f'On average (only looking at 6 allele donors) {np.mean(count_match_6)*100:.2f} % of alleles from {len(count_match_6)} donors with 6 alleles were predicted correctly.')
#print(f'On average (only looking at 5 and 6 allele donors) {np.mean(count_match_56)*100:.2f} % of alleles from {len(count_match_56)} donors with 5 or 6 alleles were predicted correctly.')
print(f'On average (only looking at 4 and 5 and 6 allele donors) {np.mean(count_match_456)*100:.2f} % of alleles from {len(count_match_456)} donors with 4 or 5 or 6 alleles were predicted correctly.')


On average (only looking at 4 and 5 and 6 allele donors) 28.64 % of alleles from 753 donors with 4 or 5 or 6 alleles were predicted correctly.
