In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_score, LeaveOneOut, cross_val_predict
from pickle import dump

In [2]:
# Paths to adjust for each run.
# DATA_PATH:  CSV file with the current training data (read with pd.read_csv further down).
# MODEL_PATH: destination of the pickled model, including the file name (passed to save_model further down).
DATA_PATH = "../data/train_data/INSRTR_SAR.csv"
MODEL_PATH = "../insrtr/models/gbt_classifier_v2.pkl"

In [3]:
def encode_categories(dataframe_to_encode):
    """
    Adds an integer-coded copy of every categorical feature column.

    Each listed column is cast to the pandas ``category`` dtype and its
    ``cat.codes`` are stored in a new column named ``<column>_encoded``.
    The codes are derived from the values present in this dataframe only.
    The dataframe passed in is left untouched; a new one is returned.

    Parameters
    ----------
    dataframe_to_encode: input dataframe, must contain all categorical columns listed below

    Returns
    -------
    dataframe_to_encode: new dataframe with the additional ``*_encoded`` columns
    """

    categorical_columns = [
        "Enzyme",
        "resi_type", "resi_dssp",
        "prev_resi_type", "prev_resi_dssp",
        "next_resi_type", "next_resi_dssp",
        "loop_seq",
    ]

    def to_codes(series):
        # Category codes are the integer positions of each value in the column's categories
        return series.astype("category").cat.codes

    # 2-D array of codes: one row per sample, one column per categorical feature
    codes_matrix = dataframe_to_encode[categorical_columns].apply(to_codes).values

    # Map every new column name to its column of codes and attach them all at once
    new_columns = {
        name + "_encoded": codes_matrix[:, position]
        for position, name in enumerate(categorical_columns)
    }
    return dataframe_to_encode.assign(**new_columns)


def create_x_y(dataframe_to_split, target=""):
    """
    Builds the feature matrix (x) and the target vector (y) from a dataframe.

    All columns listed in ``excluded_columns`` are removed; every remaining
    column becomes a feature. The target values are read from the original
    dataframe, so the target column may itself be one of the excluded columns.

    Parameters
    ----------
    dataframe_to_split: input dataframe, must contain every excluded column and the target column
    target: string that represents the target column (the default "" only works if a column
        with that empty name exists, so callers are expected to pass it explicitly)

    Returns
    -------
    x_array: descriptive space array, one row per sample
    y_array: target space array, one value per sample

    """
    excluded_columns = [
        # Measurement / label columns (presumably outcomes rather than features - confirm)
        "max_fold_decrease_with_peptide", "%WT_activity", "works", "fluctuation", "relative_MSA_conservation",
        # Enzyme is excluded both as raw text and in its encoded form
        "Enzyme_encoded", "Enzyme",
        # Raw text columns; their *_encoded counterparts are not dropped and stay in x
        "resi_type", "resi_dssp", "prev_resi_type", "prev_resi_dssp",
        "next_resi_type", "next_resi_dssp", "loop_seq",
    ]
    feature_frame = dataframe_to_split.drop(columns=excluded_columns)
    x_array = feature_frame.copy().values
    y_array = dataframe_to_split[target].values.ravel()
    return x_array, y_array


def save_model(classifier, filename):
    """
    Saves a scikit-learn model to disk using the pickle module.

    Missing parent directories of ``filename`` are created first, so that a
    finished training run is not lost to a ``FileNotFoundError`` at save time.
    An existing file at ``filename`` is overwritten.

    Parameters
    ----------
    classifier: trained model that should be saved
    filename: where the pickle file is saved (string or path-like; an already open
        integer file descriptor is passed straight to ``open``)

    Returns
    -------
    None
    """
    import os  # local import so the function does not depend on the notebook's import cell

    # open() also accepts integer file descriptors; those have no directory to create
    if not isinstance(filename, int):
        parent_dir = os.path.dirname(os.path.abspath(filename))
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "wb") as file:
        dump(classifier, file)

In [4]:
# Initialize model and cross-validation setting
# NOTE(review): learning_rate=4 is far above the commonly used range (roughly 0.01-0.3); scikit-learn
# describes this parameter as shrinking each tree's contribution. Confirm it is intended and not a typo
# (e.g. 0.4). Left unchanged here so that the recorded outputs of this cell stay valid.
clf = GradientBoostingClassifier(n_estimators=800, random_state=42, learning_rate=4)
cv = LeaveOneOut()

# Read data, skip embedding ("esm2") and dummy ("Unnamed") columns, filter out rows with missing target
df = pd.read_csv(DATA_PATH, usecols=lambda col: "esm2" not in col and "Unnamed" not in col).dropna(
    subset=["works"])
df = encode_categories(df)

# Parameter tuning scenario
# First do a test where the entire set is split into two parts, 80-20 - train and test sets.
# The train set is used for leave-one-out cross-validation (LOOCV) to tune the model, the test set for testing.
# Then do k-fold cross-validation on the entire dataset.
# Results should not vary significantly.
df_shuff = df.sample(frac=1, random_state=48).reset_index(drop=True)
n_test = int(len(df_shuff) * 0.2)
tune_set = df_shuff[n_test:]
test_set = df_shuff[:n_test]
x_tune, y_tune = create_x_y(tune_set, "works")
# LOOCV on train set (cross_val_score works on clones, clf itself is not fitted here)
scores = cross_val_score(clf, x_tune, y_tune, scoring="accuracy", cv=cv, n_jobs=-1)
avg_score = scores.mean()
print("Leave-one-out cross-validation score on tuning set: {:.4f}".format(avg_score))
# Try model on test set: fit on the tuning part, report accuracy on the held-out part
x_test, y_test = create_x_y(test_set, "works")
clf.fit(x_tune, y_tune)
test_score = clf.score(x_test, y_test)
print("Test set score: {:.4f}".format(test_score))
# Try 23-fold cross validation on entire dataset
# (presumably 23 was chosen to match the size of the smaller class - confirm)
x, y = create_x_y(df, "works")
scores = cross_val_score(clf, x, y, scoring="accuracy", cv=23, n_jobs=-1)
avg_score = scores.mean()
print("23-fold cross-validation score on entire dataset: {:.4f}".format(avg_score))

# LOOCV on entire dataset (x and y from the step above are reused)
scores = cross_val_score(clf, x, y, scoring="accuracy", cv=cv, n_jobs=-1)
avg_score = scores.mean()
print("LOOCV score on entire dataset: {:.4f}".format(avg_score))
# Get predictions from LOOCV - label and probability, calculate AUC and show confusion matrix
y_pred_label = cross_val_predict(clf, x, y, cv=cv, n_jobs=-1)
conf_mat = confusion_matrix(y, y_pred_label, labels=["N", "Y"])
print("Confusion matrix: \n", conf_mat)
y_pred_proba = cross_val_predict(clf, x, y, cv=cv, method="predict_proba", n_jobs=-1)
# Keep the second probability column only (assumed to belong to the "Y" class - confirm against clf.classes_)
y_pred_proba = y_pred_proba[:, 1]
auc_score = roc_auc_score(y, y_pred_proba)
print("AUC score: {:.4f}".format(auc_score))

# Retrain model on entire set and save it as pickle file
clf.fit(x, y)
save_model(clf, MODEL_PATH)

Leave-one-out cross-validation score on tuning set: 0.5789
Test set score: 0.5556
23-fold cross-validation score on entire dataset: 0.5942
LOOCV score on entire dataset: 0.6596
Confusion matrix: 
 [[17  6]
 [10 14]]
AUC score: 0.6902
