In [37]:
import mhcflurry, seaborn, numpy, pandas, pickle, sklearn, collections, scipy, time
import mhcflurry.data
import mhcflurry.imputation
import fancyimpute

import sklearn.metrics
import sklearn.cross_validation

In [46]:
# Alleles with fewer measurements than this are excluded when the allele list
# is built.
min_peptides_to_consider_allele = 50
# Base of the log transform used by log_to_ic50 and passed to the predictor as
# `max_ic50` (presumably an IC50 ceiling in nM -- TODO confirm units).
max_ic50 = 50000

In [48]:
# Load the training measurements, keyed by allele name.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on the original author's machine; consider a configurable data directory.
all_train_data = mhcflurry.data.load_allele_datasets("/Users/tim/sinai/git/mhcflurry/bdata.2009.mhci.public.1.txt")


In [50]:
# Select every allele that has at least `min_peptides_to_consider_allele`
# measurements. (A hard-coded six-allele list was previously assigned to
# `alleles` first, but it was unconditionally overwritten by this filter and
# never took effect, so it has been dropped.)
alleles = [
    allele
    for allele in all_train_data
    if len(all_train_data[allele].Y) >= min_peptides_to_consider_allele
]

In [None]:
#sorted(dict((allele, len(all_train_data[allele].Y)) for allele in all_train_data).items(), key=lambda pair: -1*pair[1])

In [51]:
# Restrict the training set to the alleles selected above.
train_data = {allele: all_train_data[allele] for allele in alleles}
print("Training data: %d / %d alleles" % (len(train_data), len(all_train_data)))

# Load the second ("blind") dataset file with the same loader.
test_data = mhcflurry.data.load_allele_datasets("/Users/tim/sinai/git/mhcflurry/bdata.2013.mhci.public.blind.1.txt")


Training data: 79 / 103 alleles


In [52]:
def log_to_ic50(log_value):
    """
    Map a log-scaled value back onto the IC50 scale.

    Computes ``max_ic50 ** (1.0 - log_value)`` using the module-level
    ``max_ic50``: an input of 1.0 yields 1.0 and an input of 0.0 yields
    ``max_ic50``. Works element-wise on arrays as well as on scalars.
    """
    exponent = 1.0 - log_value
    return max_ic50 ** exponent

def make_scores(y, y_pred, threshold_nm=500):
    """
    Score predictions against measured values.

    Both `y` and `y_pred` are on the log scale consumed by `log_to_ic50`;
    they are converted to IC50 and compared with `threshold_nm` to obtain
    binary labels.

    Parameters
    ----------
    y : array-like
        Measured values (log scale).
    y_pred : array-like
        Predicted values (log scale).
    threshold_nm : number
        IC50 values at or below this cutoff count as the positive class.

    Returns
    -------
    dict with keys
        auc : ROC AUC of `y_pred` against the true binary labels, or NaN when
              the true labels contain a single class (AUC is undefined there).
        f1  : F1 score of the thresholded predictions.
        tau : Kendall's tau between `y_pred` and `y`.
    """
    # `import scipy` alone does not guarantee that the `stats` submodule has
    # been loaded, so import it explicitly where it is used.
    import scipy.stats

    ic50_y = log_to_ic50(y)
    ic50_y_pred = log_to_ic50(y_pred)
    is_binder = ic50_y <= threshold_nm
    predicted_binder = ic50_y_pred <= threshold_nm

    # roc_auc_score raises ValueError when only one class is present, which
    # would abort an entire cross-validation run over one degenerate fold.
    if len(numpy.unique(is_binder)) < 2:
        auc = numpy.nan
    else:
        auc = sklearn.metrics.roc_auc_score(is_binder, y_pred)

    return dict(
        auc=auc,
        f1=sklearn.metrics.f1_score(is_binder, predicted_binder),
        tau=scipy.stats.kendalltau(y_pred, y)[0],
    )

def mean_with_std(grouped_column, decimals=3):
    """
    Format each group's mean and standard deviation as "mean +/- std".

    Parameters
    ----------
    grouped_column : pandas SeriesGroupBy
        A single column of a grouped DataFrame.
    decimals : int
        Number of digits after the decimal point.

    Returns
    -------
    pandas.Series of str, indexed like ``grouped_column.mean()``. Groups whose
    standard deviation is null (e.g. a single observation) show the mean only.
    """
    pattern = "%%0.%df" % decimals
    # Aggregate once; the means supply both the values and the result index.
    means = grouped_column.mean()
    stds = grouped_column.std()
    formatted = [
        pattern % m if pandas.isnull(s) else (pattern + " +/- " + pattern) % (m, s)
        for (m, s) in zip(means, stds)
    ]
    return pandas.Series(formatted, index=means.index)

def allele_data_to_df(data):
    """
    Turn a namedtuple-like allele dataset into a DataFrame indexed by peptide.

    The "X_index" and "X_binary" fields are split into a list of per-peptide
    rows so that each peptide's encoding sits in a single cell; every other
    field becomes an ordinary column.
    """
    fields = data._asdict()
    for encoded in ("X_index", "X_binary"):
        fields[encoded] = list(fields[encoded])
    return pandas.DataFrame(fields).set_index('peptides')

def make_2d_array(thing):
    """Stack an iterable of equal-length sequences into a single numpy array."""
    rows = list(map(list, thing))
    return numpy.array(rows)

def df_to_allele_data(df):
    """
    Inverse of allele_data_to_df: rebuild an AlleleData from a DataFrame.

    Every column is passed through as a raw value array, except "X_index",
    which is re-stacked into one 2-D array. The DataFrame index supplies the
    peptides.
    """
    fields = {column: df[column].values for column in df}
    fields["X_index"] = make_2d_array(fields["X_index"])
    return mhcflurry.data.AlleleData(peptides=df.index.values, **fields)


In [61]:
# Full grid of hyperparameter combinations to evaluate. The iteration order
# (embedding_input_dim, embedding_output_dim, layer_sizes, activation) fixes
# the order in which the models are fitted.
models_params_list = [
    dict(
        embedding_input_dim=embedding_input_dim,
        embedding_output_dim=embedding_output_dim,
        layer_sizes=layer_sizes,
        activation=activation)
    for embedding_input_dim in [21]
    for embedding_output_dim in [5, 10, 32, 64, 128]
    for layer_sizes in [[4], [8], [16], [64], [128]]
    for activation in ["tanh"]
]

print("%d models" % len(models_params_list))
# Names of every hyperparameter that appears in at least one combination.
models_params_explored = set.union(*map(set, models_params_list))
models_params_explored


25 models


{'activation', 'embedding_input_dim', 'embedding_output_dim', 'layer_sizes'}

In [63]:
# Cross-validate every hyperparameter combination on every allele, both with
# and without pretraining on imputed data. One result row is collected per
# fitted model, as parallel lists keyed by column name.
cv_df = collections.defaultdict(list)
start = time.time()
for (allele, data) in train_data.items():
    data_df = allele_data_to_df(data)
    # Folds are stratified on a binary label: converted IC50 below 500.
    cv = sklearn.cross_validation.StratifiedKFold(log_to_ic50(data.Y) < 500, n_folds=3)
    for (fold_num, (train_indices, test_indices)) in enumerate(cv):
        # The split does not depend on `impute`, so slice once per fold.
        train_df = data_df.iloc[train_indices]
        test_df = data_df.iloc[test_indices]
        for impute in [True, False]:
            if impute:
                # Impute using all alleles, but with this allele restricted to
                # the current training fold so held-out rows are not used.
                full_train_allele_data = dict(train_data)
                full_train_allele_data[allele] = df_to_allele_data(train_df)
                train_imputed_dict = mhcflurry.imputation.create_imputed_datasets(
                    full_train_allele_data,
                    fancyimpute.mice.MICE(),
                    min_observations_per_peptide=3,
                    min_observations_per_allele=50)
                X_pretrain = train_imputed_dict[allele].X_index
                Y_pretrain = train_imputed_dict[allele].Y
            else:
                X_pretrain = Y_pretrain = None

            for (i, model_params) in enumerate(models_params_list):
                print("%10s fold %3d [%3d / %3d] impute=%s" % (allele, fold_num, i, len(models_params_list), impute))
                model = mhcflurry.Class1BindingPredictor.from_hyperparameters(
                    max_ic50=max_ic50,
                    **model_params)
                # Wall-clock duration of the fit: -start + end.
                fit_time = -time.time()
                model.fit(
                    make_2d_array(train_df.X_index),
                    train_df.Y,
                    X_pretrain=X_pretrain,
                    Y_pretrain=Y_pretrain,
                )
                fit_time += time.time()
                predictions = model.predict(make_2d_array(test_df.X_index))
                train_predictions = model.predict(make_2d_array(train_df.X_index))
                cv_df["allele"].append(allele)
                cv_df["allele_size"].append(len(data.Y))
                cv_df["train_size"].append(len(train_indices))
                cv_df["model_params"].append(model_params)
                cv_df["impute"].append(impute)
                cv_df["imputed_size"].append(len(Y_pretrain) if Y_pretrain is not None else None)
                cv_df["fit_time"].append(fit_time)

                # `.items()` works on both Python 2 and 3 and matches the other
                # dict iterations in this cell (`.iteritems()` is Python 2 only).
                for (param, param_value) in model_params.items():
                    cv_df[param].append(param_value)
                for (key, value) in make_scores(test_df.Y, predictions).items():
                    cv_df["test_%s" % key].append(value)
                for (key, value) in make_scores(train_df.Y, train_predictions).items():
                    cv_df["train_%s" % key].append(value)

cv_df = pandas.DataFrame(cv_df)
# Scalar copy of the first layer size, taken from each "layer_sizes" list.
cv_df["layer0_size"] = [x[0] for x in cv_df.layer_sizes]
print(time.time() - start)
cv_df

Collected 711 binding values for 19057 alleles
Dropping 9696 peptides with <3 observations
Dropping 0 alleles with <50 observations: []
[MICE] Completing matrix with shape (9361, 79)
[MICE] Starting imputation round 1/110, elapsed time 0.018
[MICE] Starting imputation round 2/110, elapsed time 1.569
[MICE] Starting imputation round 3/110, elapsed time 3.085
[MICE] Starting imputation round 4/110, elapsed time 4.541
[MICE] Starting imputation round 5/110, elapsed time 6.060
[MICE] Starting imputation round 6/110, elapsed time 7.428
[MICE] Starting imputation round 7/110, elapsed time 8.772
[MICE] Starting imputation round 8/110, elapsed time 10.086
[MICE] Starting imputation round 9/110, elapsed time 11.382
[MICE] Starting imputation round 10/110, elapsed time 12.841
[MICE] Starting imputation round 11/110, elapsed time 14.211
[MICE] Starting imputation round 12/110, elapsed time 15.564
[MICE] Starting imputation round 13/110, elapsed time 16.844
[MICE] Starting imputation round 14/110,

KeyboardInterrupt: 

In [64]:
# Build a results table from whatever `cv_df` currently holds and add the
# first layer size as a scalar column.
cv_df2 = pandas.DataFrame(cv_df)
first_layer_sizes = [sizes[0] for sizes in cv_df2["layer_sizes"]]
cv_df2["layer0_size"] = first_layer_sizes
cv_df2

Unnamed: 0,activation,allele,allele_size,embedding_input_dim,embedding_output_dim,fit_time,impute,imputed_size,layer_sizes,model_params,test_auc,test_f1,test_tau,train_auc,train_f1,train_size,train_tau,layer0_size
0,tanh,HLA-B0802,486,21,5,0.516064,True,9361,[4],"{u'activation': u'tanh', u'embedding_output_di...",0.958791,0.545455,0.365852,0.994641,0.666667,323,0.423319,4
1,tanh,HLA-B0802,486,21,5,0.524809,True,9361,[8],"{u'activation': u'tanh', u'embedding_output_di...",0.966117,0.000000,0.381375,0.996249,0.761905,323,0.400697,8
2,tanh,HLA-B0802,486,21,5,0.586464,True,9361,[16],"{u'activation': u'tanh', u'embedding_output_di...",0.977106,0.727273,0.328311,0.999732,0.909091,323,0.416132,16
3,tanh,HLA-B0802,486,21,5,0.835091,True,9361,[64],"{u'activation': u'tanh', u'embedding_output_di...",0.946886,0.250000,0.361160,0.995713,0.800000,323,0.400371,64
4,tanh,HLA-B0802,486,21,5,1.178641,True,9361,[128],"{u'activation': u'tanh', u'embedding_output_di...",0.974359,0.444444,0.328311,0.998928,0.761905,323,0.413437,128
5,tanh,HLA-B0802,486,21,10,0.555257,True,9361,[4],"{u'activation': u'tanh', u'embedding_output_di...",0.966117,0.545455,0.316398,0.998124,0.869565,323,0.408946,4
6,tanh,HLA-B0802,486,21,10,0.656072,True,9361,[8],"{u'activation': u'tanh', u'embedding_output_di...",0.974359,0.727273,0.366213,0.998392,0.909091,323,0.399064,8
7,tanh,HLA-B0802,486,21,10,0.790235,True,9361,[16],"{u'activation': u'tanh', u'embedding_output_di...",0.949634,0.250000,0.326506,1.000000,0.909091,323,0.436141,16
8,tanh,HLA-B0802,486,21,10,1.058073,True,9361,[64],"{u'activation': u'tanh', u'embedding_output_di...",0.951465,0.400000,0.298710,0.998392,0.909091,323,0.420951,64
9,tanh,HLA-B0802,486,21,10,1.363177,True,9361,[128],"{u'activation': u'tanh', u'embedding_output_di...",0.964286,0.500000,0.310262,1.000000,0.857143,323,0.411722,128


In [65]:
# Save the results table to "cv3.csv" (a relative path, so it lands in the
# kernel's current working directory).
cv_df2.to_csv("cv3.csv")

In [44]:
# Group by allele, imputation flag and every explored hyperparameter, with
# the "layer_sizes" column swapped for the scalar "layer0_size".
group_columns = ["allele", "allele_size", "impute"] + list(models_params_explored) + ["layer0_size"]
group_columns.remove("layer_sizes")
print(mean_with_std(cv_df.groupby(group_columns).test_auc))



allele     allele_size  impute  embedding_input_dim  activation  embedding_output_dim  layer0_size
HLA-A0201  6961         False   21                   tanh        6                     8              0.912 +/ 0.029
                                                                                       16             0.911 +/ 0.010
                                                                                       64             0.917 +/ 0.039
                                                                                       128            0.906 +/ 0.036
                                                                 10                    8              0.935 +/ 0.018
                                                                                       16             0.917 +/ 0.032
                                                                                       64             0.925 +/ 0.019
                                                                                  

In [42]:
def best_by(score):
    """
    Return, for each allele, the grouped row with the highest mean `score`.

    Rows of the module-level `cv_df` are grouped by `group_columns`, `score`
    is averaged within each group, and for every allele the group with the
    largest mean is kept.

    Parameters
    ----------
    score : str
        Name of a numeric column of `cv_df`, e.g. "test_auc".

    Returns
    -------
    pandas.DataFrame with one row per allele, holding the group columns and
    the mean of `score`.
    """
    means = cv_df.groupby(group_columns)[score].mean().reset_index()
    # idxmax() returns the index *label* of the maximum and .loc selects rows
    # by label. The removed `.ix` indexer and the now-positional argmax() no
    # longer have that meaning.
    best_labels = [
        means.loc[means.allele == allele, score].idxmax()
        for allele in means.allele.unique()
    ]
    return means.loc[best_labels]

In [45]:
# Best parameter combination per allele, ranked by mean "test_auc".
best_by('test_auc')


Unnamed: 0,allele,allele_size,impute,embedding_input_dim,activation,embedding_output_dim,layer0_size,test_auc
8,HLA-A0201,6961,False,21,tanh,32,8,0.941194


In [None]:
# Best parameter combination per allele, ranked by mean "test_tau".
best_by('test_tau')

In [None]:
# Best parameter combination per allele, ranked by mean "test_f1".
best_by('test_f1')