In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import os
import sys

#Model
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve, precision_score, fbeta_score

# Directory the notebook is run from; the input paths in the loading cell are built relative to it.
curr_dir = os.getcwd()

In [11]:
# Inspect the domain names assigned to fold 1 of the CV splits.
# NOTE(review): in the visible cell order splits_dict is only loaded by a later cell,
# so this cell raises NameError on a fresh Restart & Run All -- confirm the intended order.
splits_dict[1]["domains"]

['RNase_T',
 'Neur_chan_LBD',
 'SH2',
 'zf-CXXC',
 'Carb_anhydrase',
 'F5_F8_type_C',
 'FGF',
 'Glyco_transf_7C',
 'Hydrolase',
 'MreB_Mbl',
 'Rhodanese',
 'Asp',
 'HRM',
 'MHC_II_beta',
 'TPR_1',
 'PUF',
 'S4',
 'Bcl-2',
 'CUB',
 'E1-E2_ATPase',
 'FCH',
 'FG-GAP',
 'Hemopexin',
 'Integrin_alpha2',
 'Kunitz_BPTI',
 'Ldl_recept_a',
 'Notch',
 'THAP',
 'VWA_2',
 'ZZ',
 'GTP_EFTU_D2',
 'HLH',
 'MH1',
 '7TM_GPCR_Srsx',
 'BTB',
 'FERM_M',
 'MHC_I',
 'Abhydrolase_6',
 'Acyl-CoA_dh_M',
 'Biotin_lipoyl',
 'C2',
 'Cyt-b5',
 'GST_N',
 'Glycos_transf_2',
 'Ion_trans',
 'Kringle',
 'Mito_carr',
 'PI-PLC-X',
 'TSP_1',
 'TTL',
 'Trypsin_2',
 'UCH_1',
 'hEGF',
 'ABC_membrane',
 'ANATO',
 'ARID',
 'Acyl-CoA_dh_2',
 'Arrestin_N',
 'CIMR',
 'Ephrin_lbd',
 'HSP20',
 'LRR_5',
 'Na_Ca_ex',
 'PLAT',
 'RabGAP-TBC',
 'TUDOR',
 'VPS9']

In [7]:
curr_dir = os.getcwd()
date = "08.06.18"

#Read input and sort by domain
# Both files are tab-separated; their first column becomes the (domain) index.
features_table = pd.read_csv(os.path.join(curr_dir, "domain_features.csv"), sep="\t", index_col=0)
features_table = features_table.sort_index()
labels = pd.read_csv(os.path.join(curr_dir, "train_domain_labels_"+date+".csv"), sep="\t", index_col=0)
labels = labels.sort_index()

#Verify input
# Features and labels are paired purely by row position further down, so the two
# sorted indices must be identical. Report every mismatching pair, then stop:
# continuing would silently train on misaligned rows.
n_mismatched = 0
for feature_domain, label_domain in zip(features_table.index, labels.index):
    if feature_domain != label_domain:
        print(feature_domain)
        print(label_domain)
        print("Error: Domains do not match")
        n_mismatched += 1
# zip stops at the shorter table, so a row-count difference has to be checked separately.
if n_mismatched or features_table.shape[0] != labels.shape[0]:
    raise ValueError(
        "Error: Domains do not match (%d feature rows, %d label rows, %d mismatching positions)"
        % (features_table.shape[0], labels.shape[0], n_mismatched)
    )

# Keep the domain names in sorted order, then switch both tables to positional indices.
domain_list = list(features_table.index)
features_table.index = range(0,features_table.shape[0])
labels.index = range(0,labels.shape[0])

#Read splits dict (keyed by fold; each fold holds a "domains" list)
# NOTE(security): pickle.load can execute arbitrary code -- only load split files from a trusted source.
splits_path = os.path.join(curr_dir, "..", "10.Prediction", "CV_splits", "pfam-v31",
                           "domain_5_folds_combined_dna0.5_rna0.5_ion0.75_prec_dict.pik")
with open(splits_path, 'rb') as handle:
    splits_dict = pickle.load(handle)

In [11]:
# Positional row indices 0 .. len(domain_list)-1; the CV cell selects train rows from this same range.
np.arange(len(domain_list))

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [15]:
#Perform 5 fold CV split
# For every ligand, train one XGBClassifier per pre-computed domain fold and
# report the mean ROC AUC and mean area under the precision-recall curve.
ligands = ["dna", "rna", "peptide", "ion", "sm"]

# Map each domain name to its row position once instead of scanning domain_list
# for every split member. setdefault keeps the first position of a repeated name,
# which matches what list.index would return.
domain_position = {}
for position, domain in enumerate(domain_list):
    domain_position.setdefault(domain, position)

for ligand in ligands:
    ligand_label = labels[ligand+"_label"]
    roc_auc = []
    auprc = []
    #Use domains splits
    for fold in splits_dict:
        #Recover indices of train/test
        # test_index must be a real list: on Python 3 a lazy map object would be
        # consumed by the train/test membership computation and then be unusable
        # for .iloc below. A split domain missing from domain_list raises KeyError.
        test_index = [domain_position[domain] for domain in splits_dict[fold]["domains"]]
        is_train = np.ones(len(domain_list), dtype=bool)
        is_train[test_index] = False
        train_index = np.flatnonzero(is_train)

        #Train model
        X_train, X_test = features_table.iloc[train_index,:], features_table.iloc[test_index,:]
        y_train, y_test = ligand_label.iloc[train_index], ligand_label.iloc[test_index]
        model = XGBClassifier()
        model.fit(X_train, y_train)

        # Second predict_proba column is used as the score
        # (presumably the positive class, assuming 0/1 labels -- TODO confirm).
        positive_probs = model.predict_proba(X_test)[:, 1]
        roc_auc.append(roc_auc_score(y_test, positive_probs))
        precision, recall, _thresholds = precision_recall_curve(y_test, positive_probs)
        auprc.append(auc(recall, precision))
    print("---------- "+ligand+" ----------")
    print("AUC: "+str(np.mean(roc_auc)))
    print("AUPRC: "+str(np.mean(auprc)))

---------- dna ----------
AUC: 0.84120391261
AUPRC: 0.45105197446
---------- rna ----------
AUC: 0.807791577375
AUPRC: 0.350945281887
---------- peptide ----------
AUC: 0.604338371201
AUPRC: 0.353005454003
---------- ion ----------
AUC: 0.767369571037
AUPRC: 0.635175274678
---------- sm ----------
AUC: 0.747541171094
AUPRC: 0.654181630781
