## Importamos bibliotecas

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy.stats import uniform
from xgboost import XGBClassifier
from sklearn.utils import class_weight
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score, classification_report, confusion_matrix, accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
def scaler(set):
    scaler = StandardScaler()
    set = scaler.fit_transform(set)
    return set

def generating_metrics(model_ehr, x, y):
    """Function to generate metrics: auc_score, sensitivity, specificity, f1, accuracy"""
    y_pred = model_ehr.predict(x)
    acc = accuracy_score(y,y_pred)
    tn, fp, fn, tp = confusion_matrix(y,y_pred).ravel()
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y, y_pred)
    sensitivity = tp / (tp+fn)
    specificity = tn / (tn+fp)
    auc_score = auc(false_positive_rate, true_positive_rate)
    f1 = f1_score(y, y_pred)
    return auc_score, sensitivity, specificity, f1, acc, false_positive_rate, true_positive_rate

def curvaROC(title, fpr_test, tpr_test, auc_test):
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_test, tpr_test, label='Testing (AUC=%0.2f)' % auc_test, color='darkorange')
    plt.xlabel('1 - Specificity')
    plt.ylabel('Sensitivity')
    plt.legend(loc='best')
    plt.title(title)
    plt.show()

In [3]:
df = pd.read_csv('base_agregada.csv', header=0,na_filter=True)
print("Tamaño:", df.shape)
print(df.columns)

Tamaño: (46, 12)
Index(['id', 'CDR3long', 'N1long', 'N2long', 'NtP3V', 'NtP5D', 'NtP3D',
       'NtP5J', 'TamRelCluster', 'CaCo_n', 'IdLG', 'G4_num'],
      dtype='object')


## Separamos en train y test

In [4]:
X = df.drop(['id','CaCo_n'],axis = 1)
y = df['CaCo_n'].astype(int)
y = y.values.reshape(y.shape[0],1)

### Utilizando 10 variables
X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(X, y, test_size=0.20,random_state=42)

### Utilizando G4_num y TamRelCluster
X_2 = df[['TamRelCluster','G4_num']]
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y, test_size=0.20,random_state=42)

### Utilizando G4_num, TamRelCluster y CDR3long
X_3 = df[['CDR3long','TamRelCluster','G4_num']]
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y, test_size=0.20,random_state=42)

## Definimos los clasificadores y los parámetros

In [8]:
c_validation = RepeatedKFold(n_splits=3, n_repeats=5, random_state=42)
estimator_xgb = xgb.XGBClassifier()
parameters={"n_estimators": np.arange(50,200,10), 
            "learning_rate": np.arange(0.1,1.0,0.1),
            "colsample_bytree" : np.arange(0.1,1.0,0.1),
            "subsample" : np.arange(0.1,1.0,0.1),
            "reg_alpha" : np.arange(0,20,1), 
            "reg_lambda": np.arange(0,20,1),
            "objective": ['binary:logistic'],
            "max_depth": np.arange(0,4,1), 
            "gamma":range(0,5),
            "eval_metric": ['auc']}

randomSearch = RandomizedSearchCV(estimator=estimator_xgb, param_distributions = parameters,
                          n_iter=100, scoring='roc_auc',cv=c_validation,random_state=42, refit = True)

LR = LogisticRegression()

gridSearch = GridSearchCV(estimator=estimator_xgb, param_grid=parameters, cv = c_validation, 
                    scoring='roc_auc', refit = True)

## Comparación de modelos
### Random Search

In [6]:
### Con 10 variables
model_rs10 = randomSearch.fit(X_train_10,y_train_10)
auc_rs10, sens_rs10, spec_rs10, f1_rs10, acc_rs10,fpr_rs10, tpr_rs10 = generating_metrics(model_rs10, X_test_10, y_test_10)
print("auc_rs10: {}, sens_rs10: {}, spec_rs10: {}, f1_rs10: {}, acc_rs10: {}".format(auc_rs10, sens_rs10, spec_rs10, f1_rs10, acc_rs10))
### Con G4_num y CDR3long
model_rs2 = randomSearch.fit(X_train_2,y_train_2)
auc_rs2, sens_rs2, spec_rs2, f1_rs2, acc_rs2,fpr_rs2, tpr_rs2 = generating_metrics(model_rs2, X_test_2, y_test_2)
print("auc_2rs: {}, sens_rs2: {}, spec_rs2: {}, f1_rs2: {}, acc_rs2: {}".format(auc_rs2, sens_rs2, spec_rs2, f1_rs2, acc_rs2))
### Con CDR3long, G4_num y CDR3long
model_rs3 = randomSearch.fit(X_train_3,y_train_3)
auc_rs3, sens_rs3, spec_rs3, f1_rs3, acc_rs3,fpr_rs3, tpr_rs3 = generating_metrics(model_rs3, X_test_3, y_test_3)
print("auc_rs3: {}, sens_rs3: {}, spec_rs3: {}, f1_rs3: {}, acc_rs3: {}".format(auc_rs3, sens_rs3, spec_rs3, f1_rs3, acc_rs3))

auc_rs10: 0.2916666666666667, sens_rs10: 0.25, spec_rs10: 0.3333333333333333, f1_rs10: 0.22222222222222224, acc_rs10: 0.3
auc_2rs: 0.5, sens_rs2: 0.5, spec_rs2: 0.5, f1_rs2: 0.4444444444444445, acc_rs2: 0.5
auc_rs3: 0.3333333333333333, sens_rs3: 0.5, spec_rs3: 0.16666666666666666, f1_rs3: 0.36363636363636365, acc_rs3: 0.3


### Grid Search

In [None]:
### Con 10 variables
model_gs10 = gridSearch.fit(X_train_10,y_train_10)
auc_gs10, sens_gs10, spec_gs10, f1_gs10, acc_gs10,fpr_gs10, tpr_gs10 = generating_metrics(model_gs10, X_test_10, y_test_10)
print("auc_gs10: {}, sens_gs10: {}, spec_gs10: {}, f1_gs10: {}, acc_gs10: {}".format(auc_gs10, sens_gs10, spec_gs10, f1_gs10, acc_gs10))
### Con G4_num y CDR3long
model_gs2 = gridSearch.fit(X_train_2,y_train_2)
auc_gs2, sens_gs2, spec_gs2, f1_gs2, acc_gs2,fpr_gs2, tpr_gs2 = generating_metrics(model_gs2, X_test_2, y_test_2)
print("auc_gs2: {}, sens_gs2: {}, spec_gs2: {}, f1_gs2: {}, acc_gs2: {}".format(auc_gs2, sens_gs2, spec_gs2, f1_gs2, acc_gs2))
### Con CDR3long, G4_num y CDR3long
model_gs3 = gridSearch.fit(X_train_3,y_train_3)
auc_gs3, sens_gs3, spec_gs3, f1_gs3, acc_gs3,fpr_gs3, tpr_gs3 = generating_metrics(model_gs3, X_test_3, y_test_3)
print("auc_gs3: {}, sens_gs3: {}, spec_gs3: {}, f1_gs3: {}, acc_gs3: {}".format(auc_gs3, sens_gs3, spec_gs3, f1_gs3, acc_gs3))

### XGBClassifier

In [9]:
### Con 10 variables
model_xgb10 = estimator_xgb.fit(X_train_10,y_train_10)
auc_xgb10, sens_xgb10, spec_xgb10, f1_xgb10, acc_xgb10,fpr_xgb10, tpr_xgb10 = generating_metrics(model_xgb10, X_test_10, y_test_10)
print("auc_xgb10: {}, sens_xgb10: {}, spec_xgb10: {}, f1_xgb10: {}, acc_xgb10: {}".format(auc_xgb10, sens_xgb10, spec_xgb10, f1_xgb10, acc_xgb10))
### Con G4_num y CDR3long
model_xgb2 = estimator_xgb.fit(X_train_2,y_train_2)
auc_xgb2, sens_xgb2, spec_xgb2, f1_xgb2, acc_xgb2,fpr_xgb2, tpr_xgb2 = generating_metrics(model_xgb2, X_test_2, y_test_2)
print("auc_xgb2: {}, sens_xgb2: {}, spec_xgb2: {}, f1_xgb2: {}, acc_xgb2: {}".format(auc_xgb2, sens_xgb2, spec_xgb2, f1_xgb2, acc_xgb2))
### Con CDR3long, G4_num y CDR3long
model_xgb3 = estimator_xgb.fit(X_train_3,y_train_3)
auc_xgb3, sens_xgb3, spec_xgb3, f1_xgb3, acc_xgb3,fpr_xgb3, tpr_xgb3 = generating_metrics(model_xgb3, X_test_3, y_test_3)
print("auc_xgb3: {}, sens_xgb3: {}, spec_xgb3: {}, f1_xgb3: {}, acc_xgb3: {}".format(auc_xgb3, sens_xgb3, spec_xgb3, f1_xgb3, acc_xgb3))

auc_xgb10: 0.20833333333333331, sens_xgb10: 0.25, spec_xgb10: 0.16666666666666666, f1_xgb10: 0.2, acc_xgb10: 0.2
auc_xgb2: 0.5, sens_xgb2: 0.5, spec_xgb2: 0.5, f1_xgb2: 0.4444444444444445, acc_xgb2: 0.5
auc_xgb3: 0.5, sens_xgb3: 0.5, spec_xgb3: 0.5, f1_xgb3: 0.4444444444444445, acc_xgb3: 0.5


### Logistic regression

In [10]:
### Con 10 variables
model_lr10 = LR.fit(X_train_10,y_train_10)
auc_lr10, sens_lr10, spec_lr10, f1_lr10, acc_lr10,fpr_lr10, tpr_lr10 = generating_metrics(model_lr10, X_test_10, y_test_10)
print("auc_lr10: {}, sens_lr10: {}, spec_lr10: {}, f1_lr10: {}, acc_lr10: {}".format(auc_lr10, sens_lr10, spec_lr10, f1_lr10, acc_lr10))
### Con G4_num y CDR3long
model_lr2 = LR.fit(X_train_2,y_train_2)
auc_lr2, sens_lr2, spec_lr2, f1_lr2, acc_lr2,fpr_lr2, tpr_lr2 = generating_metrics(model_lr2, X_test_2, y_test_2)
print("auc_lr2: {}, sens_lr2: {}, spec_lr2: {}, f1_lr2: {}, acc_lr2: {}".format(auc_lr2, sens_lr2, spec_lr2, f1_lr2, acc_lr2))
### Con CDR3long, G4_num y CDR3long
model_lr3 = LR.fit(X_train_3,y_train_3)
auc_lr3, sens_lr3, spec_lr3, f1_lr3, acc_lr3,fpr_lr3, tpr_lr3 = generating_metrics(model_lr3, X_test_3, y_test_3)
print("auc_lr3: {}, sens_lr3: {}, spec_lr3: {}, f1_lr3: {}, acc_lr3: {}".format(auc_lr3, sens_lr3, spec_lr3, f1_lr3, acc_lr3))

auc_lr10: 0.41666666666666663, sens_lr10: 0.5, spec_lr10: 0.3333333333333333, f1_lr10: 0.4, acc_lr10: 0.4
auc_lr2: 0.7916666666666667, sens_lr2: 0.75, spec_lr2: 0.8333333333333334, f1_lr2: 0.75, acc_lr2: 0.8
auc_lr3: 0.5416666666666667, sens_lr3: 0.75, spec_lr3: 0.3333333333333333, f1_lr3: 0.5454545454545454, acc_lr3: 0.5
