## Imports

In [1]:
import pandas as pd
import re

## Load and check data schema

In [2]:
# Load the raw binding-affinity table; the first CSV column becomes the row index.
df = pd.read_csv('./data/binding_affinity_data.csv', index_col=0)

In [3]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,MHC_sequence,MHC_type,peptide_sequence,label
0,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,ERLKEVQKR,1
1,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,KPRKTAEVAGKTL,1
2,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,KEARRIIKK,1
3,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,EEKITEAKEL,0
4,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,SLPSSRAARVPG,0


In [4]:
# Number of rows per distinct MHC_type value, most frequent first.
df['MHC_type'].value_counts()

HLA-B*27:05    182731
HLA-A*02:01    119689
HLA-B*15:01     68836
HLA-B*07:02     64781
HLA-B*57:01     59723
                ...  
HLA-A*32:07       350
HLA-A*26:03       330
HLA-A*68:23       320
HLA-A*32:15       290
HLA-B*44:27       270
Name: MHC_type, Length: 112, dtype: int64

In [5]:
# Number of rows per label value (class balance check).
df['label'].value_counts()

0    1433899
1     359166
Name: label, dtype: int64

In [6]:
# Parses allele names of the form "HLA-<gene>*<group>:<id>", e.g. "HLA-B*27:05".
# Named groups: gene (a single character), allele ("27:05"), allele_group ("27")
# and allele_id ("05"); both numeric fields must be exactly two digits.
HLA_pattern = re.compile(r"^HLA-(?P<gene>.)\*(?P<allele>(?P<allele_group>\d\d):(?P<allele_id>\d\d))$")

In [7]:
def extract_hla_info(mhc_type):
    """Split an HLA allele name into its components.

    Parameters
    ----------
    mhc_type : str
        Allele name such as ``"HLA-B*27:05"``; it is searched with
        ``HLA_pattern``.

    Returns
    -------
    pd.Series
        String fields ``HLA_gene``, ``HLA_allele``, ``HLA_allele_group`` and
        ``HLA_allele_id`` (``"B"``, ``"27:05"``, ``"27"`` and ``"05"`` for the
        example above).

    Raises
    ------
    ValueError
        If ``mhc_type`` does not match ``HLA_pattern``. The message contains
        the offending value so bad rows can be located.
    """
    match = HLA_pattern.search(mhc_type)
    if match is None:
        # ValueError is a subclass of Exception, so existing `except Exception`
        # handlers keep working while the error becomes specific and traceable.
        raise ValueError(f"hla pattern didn't match: {mhc_type!r}")

    return pd.Series({
        'HLA_gene': match.group('gene'),
        'HLA_allele': match.group('allele'),
        'HLA_allele_group': match.group('allele_group'),
        'HLA_allele_id': match.group('allele_id'),
    })

# Split every MHC_type into its HLA components in one vectorised pass.
# Series.str.extract runs the same regex search on each value and returns one
# column per capture group, which avoids constructing a pd.Series for every row
# as the row-wise .apply(extract_hla_info) did.
hla_columns = ['HLA_gene', 'HLA_allele', 'HLA_allele_group', 'HLA_allele_id']
hla_parts = df['MHC_type'].str.extract(HLA_pattern.pattern)
# Capture groups come back in pattern order: gene, allele, allele_group, allele_id.
hla_parts.columns = hla_columns

# str.extract yields NaN for values that do not match; keep failing loudly
# instead of silently writing missing values into the table.
unmatched = hla_parts['HLA_gene'].isna()
if unmatched.any():
    raise ValueError(
        "hla pattern didn't match: "
        + repr(df.loc[unmatched, 'MHC_type'].unique()[:5].tolist())
    )

df[hla_columns] = hla_parts

In [8]:
# Display the frame to inspect the newly added HLA_* columns.
df

Unnamed: 0,MHC_sequence,MHC_type,peptide_sequence,label,HLA_gene,HLA_allele,HLA_allele_group,HLA_allele_id
0,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,ERLKEVQKR,1,B,27:05,27,05
1,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,KPRKTAEVAGKTL,1,B,27:05,27,05
2,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,KEARRIIKK,1,B,27:05,27,05
3,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,EEKITEAKEL,0,B,27:05,27,05
4,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,SLPSSRAARVPG,0,B,27:05,27,05
...,...,...,...,...,...,...,...,...
1793060,MAVMAPRTLLLLLLGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,HLA-A*32:15,TLSKSTAII,0,A,32:15,32,15
1793061,MRVTAPRTVLLLLSGALALTETWAGSHSMRYFYTAMSRPGRGEPRF...,HLA-B*15:09,KAFLPAMTK,0,B,15:09,15,09
1793062,MRVTAPRTVLLLLSGALALTETWAGSHSMRYFYTAMSRPGRGEPRF...,HLA-B*15:09,VRIPRNSPL,0,B,15:09,15,09
1793063,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,HLA-A*66:01,GEHANDYAE,0,A,66:01,66,01


In [15]:
# Number of rows per allele group, most frequent first.
df['HLA_allele_group'].value_counts()

27    267644
02    234794
07    109796
03     94490
15     91269
57     80450
44     65823
35     64460
68     60380
40     59538
08     54100
01     51956
11     45289
24     43949
04     42618
29     29410
39     28588
06     27200
51     27131
14     26845
31     22969
58     22104
05     22006
49     19819
37     18035
32     17608
16     17306
38     15583
23     15027
13     12742
45     12734
18     12281
30     11154
26     10335
46      8206
12      8080
50      7133
54      5854
33      5395
69      4781
53      3275
56      2870
41      2475
17      2154
52      2055
73      2040
83      1284
80       710
25       480
48       470
66       370
Name: HLA_allele_group, dtype: int64

In [16]:
# Repeats the allele-group tally from the previous cell (identical expression).
df['HLA_allele_group'].value_counts()

27    267644
02    234794
07    109796
03     94490
15     91269
57     80450
44     65823
35     64460
68     60380
40     59538
08     54100
01     51956
11     45289
24     43949
04     42618
29     29410
39     28588
06     27200
51     27131
14     26845
31     22969
58     22104
05     22006
49     19819
37     18035
32     17608
16     17306
38     15583
23     15027
13     12742
45     12734
18     12281
30     11154
26     10335
46      8206
12      8080
50      7133
54      5854
33      5395
69      4781
53      3275
56      2870
41      2475
17      2154
52      2055
73      2040
83      1284
80       710
25       480
48       470
66       370
Name: HLA_allele_group, dtype: int64

In [18]:
# Distinct allele groups in order of first appearance; a group's position in
# this list is used as its integer id when HLA_group_idx is computed.
groups = df['HLA_allele_group'].unique().tolist()

In [20]:
# Encode each allele group as its position in `groups`. A dict lookup through
# Series.map replaces the per-row list.index scan and produces the same ids.
group_to_idx = {}
for idx, group in enumerate(groups):
    # setdefault keeps the first position of a value, matching list.index.
    group_to_idx.setdefault(group, idx)

group_idx = df['HLA_allele_group'].map(group_to_idx)
# Series.map yields NaN for unknown keys, whereas list.index raised ValueError;
# keep failing loudly rather than storing missing ids.
if group_idx.isna().any():
    raise ValueError("HLA_allele_group contains values that are not in groups")
df['HLA_group_idx'] = group_idx.astype('int64')

In [23]:
# Preview the first rows to check the new HLA_group_idx column.
df.head()

Unnamed: 0,MHC_sequence,MHC_type,peptide_sequence,label,HLA_gene,HLA_allele,HLA_allele_group,HLA_allele_id,HLA_group_idx
0,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,ERLKEVQKR,1,B,27:05,27,5,0
1,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,KPRKTAEVAGKTL,1,B,27:05,27,5,0
2,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,KEARRIIKK,1,B,27:05,27,5,0
3,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,EEKITEAKEL,0,B,27:05,27,5,0
4,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,SLPSSRAARVPG,0,B,27:05,27,5,0


In [24]:
# Persist the enriched table; the row index is written as the first CSV column.
# NOTE(review): the regex captures HLA_allele_group / HLA_allele_id as zero-padded
# strings such as "05"; pd.read_csv will parse them as integers unless the file is
# read back with dtype=str for those columns — confirm what downstream readers expect.
df.to_csv("./data/binding_affinity_hla_cleaned.csv")