In [None]:
%pip install pandas scikit-learn xgboost

In [None]:
# General configuration

import os
import sys

import pandas as pd

sys.path.append(os.path.abspath("src"))
from helper import path

# Dataset selection: which year to process and whether to read the input
# files of the original project instead of the ones in this working directory.
DATASET_YEAR = 2007
USE_ORIG_INPUT = True

# Console rendering of DataFrames: line width, row limit, two-decimal floats.
pd.set_option("display.width", 120)
pd.set_option("display.max_rows", 50)
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# Path management

# NOTE: There is no original 2020 dataset, so it is stubbed with the 2019 one.
# REAL_YEAR keeps the requested year (it names the performance file), while
# DATASET_YEAR is remapped and used for the input locations below.
# Gotcha: DATASET_YEAR is overwritten here, so re-running this cell without
# re-running the configuration cell makes REAL_YEAR pick up the remapped year.
REAL_YEAR = DATASET_YEAR
if USE_ORIG_INPUT and DATASET_YEAR == 2020:
    DATASET_YEAR = 2019

CWD_INPUT_PATH = path(f"input/{DATASET_YEAR}")
CWD_OUTPUT_PATH = path("output")
ORIG_INPUT_PATH = path(f"../PLBAffinity/CASF/input/{DATASET_YEAR}")
ORIG_INPUT_OUTPUT_PATH = path("output/orig_input")

# With USE_ORIG_INPUT the original project's input files are read, and the
# Python-generated results go to a dedicated output directory.
if USE_ORIG_INPUT:
    INPUT_PATH = ORIG_INPUT_PATH
    OUTPUT_PATH = ORIG_INPUT_OUTPUT_PATH
else:
    INPUT_PATH = CWD_INPUT_PATH
    OUTPUT_PATH = CWD_OUTPUT_PATH

# Feature tables, activity table and the result file for the selected year.
AA_PATH = path(f"{INPUT_PATH}/CASF_{DATASET_YEAR}_aa.csv")
LIGAND_PATH = path(f"{INPUT_PATH}/CASF_{DATASET_YEAR}_ligprop.csv")
ACTIVITIES_PATH = path(f"{INPUT_PATH}/CASF_{DATASET_YEAR}_activities.csv")
PERFORMANCE_PATH = path(f"{OUTPUT_PATH}/{REAL_YEAR}.csv")

# Pre-made test/train split; only read when USE_ORIG_INPUT is set.
ACTIVITIES_TEST_PATH = path(f"{INPUT_PATH}/CASF_{DATASET_YEAR}_activities_test.csv")
ACTIVITIES_TRAIN_PATH = path(f"{INPUT_PATH}/CASF_{DATASET_YEAR}_activities_train.csv")

# The original 2019 input uses different file names and ships no test/train
# split, so the activities are taken from the working-directory input instead.
if USE_ORIG_INPUT and DATASET_YEAR in (2019, 2020):
    AA_PATH = path(f"{INPUT_PATH}/refined_{DATASET_YEAR}_aa.csv")
    LIGAND_PATH = path(f"{INPUT_PATH}/CASF_{DATASET_YEAR}_ligprop.csv")
    ACTIVITIES_PATH = path(f"{CWD_INPUT_PATH}/CASF_{DATASET_YEAR}_activities.csv")

In [None]:
# Data preparation

from sklearn.model_selection import train_test_split

# Both feature tables use their first CSV column as the row index.
x_aa, x_ligprop = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (AA_PATH, LIGAND_PATH)
)

if not USE_ORIG_INPUT or DATASET_YEAR in [2019, 2020]:
    # No pre-made split is used here: load all activities and hold out 20 %.
    y_activities = pd.read_csv(ACTIVITIES_PATH, index_col=0)
    y_train, y_test = train_test_split(y_activities, test_size=0.2, random_state=42)
else:
    # The original input provides separate train and test activity files.
    y_train = pd.read_csv(ACTIVITIES_TRAIN_PATH, index_col=0)
    y_test = pd.read_csv(ACTIVITIES_TEST_PATH, index_col=0)

# print(x_aa[:2], x_ligprop[:2], y_train[:2], y_test[:2])
# print(x_aa.shape, x_ligprop.shape, y_train.shape, y_test.shape)

In [None]:
import numpy as np
import warnings
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Silence "UserWarning: A worker stopped while some jobs were given to the
# executor." raised from joblib's loky process executor module.
module = "joblib.externals.loky.process_executor"
warnings.filterwarnings(action="ignore", module=module, category=UserWarning)

GRID_SEARCH = True
GRID_SEARCH_VERBOSE = 1
RANDOM_SEED = 47567

# Baseline grid with a single candidate per hyper-parameter; the first three
# entries are widened to real search ranges when GRID_SEARCH is enabled.
PARAM_GRID = {
    "n_estimators": [1500],
    "max_depth": [6],
    "learning_rate": [0.01],
    "gamma": [0.05],
    "subsample": [0.5],
    "colsample_bytree": [1],
    "min_child_weight": [1],
}
if GRID_SEARCH:
    PARAM_GRID.update(
        n_estimators=[500, 1000, 1500, 2000],
        max_depth=[2, 4, 6, 8],
        learning_rate=[0.001, 0.01, 0.1, 0.2, 0.3],
    )

np.random.seed(RANDOM_SEED)


def build_xgb_regressor(x_train, y_train):
    """Tune an XGBoost regressor with a cross-validated grid search.

    Every hyper-parameter combination in the module-level ``PARAM_GRID`` is
    scored with 5-fold cross-validation using negative root-mean-squared error,
    with the candidate fits distributed over all available cores.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets, row-aligned with ``x_train``.

    Returns:
        A tuple ``(best_estimator, best_params)``: the estimator selected by the
        search and the dictionary of hyper-parameters that produced it.
    """
    # XGBoost takes its seed from ``random_state`` and does not read NumPy's
    # global RNG state, so ``np.random.seed(RANDOM_SEED)`` alone never reaches
    # the model (``subsample`` < 1 makes training stochastic). Pass the seed
    # explicitly so the configured value is the one actually used.
    xgb_model = XGBRegressor(
        objective="reg:squarederror",
        random_state=RANDOM_SEED,
    )
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=PARAM_GRID,
        scoring="neg_root_mean_squared_error",
        cv=5,
        n_jobs=-1,
        verbose=GRID_SEARCH_VERBOSE,
    )

    grid_search.fit(x_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_


def estimate_regressor_performance(y_test, y_hat):
    """Score predictions against true targets.

    Both inputs are flattened to 1-D arrays first. Without that, a column
    vector of shape ``(n, 1)`` subtracted from predictions of shape ``(n,)``
    broadcasts to an ``(n, n)`` matrix and yields a wrong RMSE, and pandas
    objects would be aligned on their index instead of by position.

    Args:
        y_test: True target values (list, array, Series or single-column
            DataFrame).
        y_hat: Predicted values, in the same order as ``y_test``.

    Returns:
        A tuple ``(pearson_r, rmse)`` with the Pearson correlation coefficient
        and the root-mean-squared error.

    Raises:
        ValueError: If the flattened inputs differ in length.
    """
    y_true = np.asarray(y_test).ravel()
    y_pred = np.asarray(y_hat).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_test and y_hat must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    pearson_r, _ = pearsonr(y_true, y_pred)
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return pearson_r, rmse

In [None]:
%%time
# Features loaded from LIGAND_PATH only: pick the feature rows of each split
# by the index labels of the corresponding activity table.
x_train_ligprop, x_test_ligprop = (
    x_ligprop.loc[targets.index] for targets in (y_train, y_test)
)

# Tune and fit on the training split, then report the chosen parameters.
ligprop_model, ligprop_params = build_xgb_regressor(x_train_ligprop, y_train)
print(ligprop_params)

# Score on the test split; the targets are flattened to 1-D to match the
# shape of the predictions.
y_hat_ligprop = ligprop_model.predict(x_test_ligprop)
ligprop_perf = estimate_regressor_performance(
    y_test.to_numpy().ravel(), y_hat_ligprop
)
print(ligprop_perf)


In [None]:
%%time
# Features loaded from AA_PATH only: pick the feature rows of each split by
# the index labels of the corresponding activity table.
x_train_aa, x_test_aa = (x_aa.loc[targets.index] for targets in (y_train, y_test))

# Tune and fit on the training split, then report the chosen parameters.
aa_model, aa_params = build_xgb_regressor(x_train_aa, y_train)
print(aa_params)

# Score on the test split; the targets are flattened to 1-D to match the
# shape of the predictions.
y_hat_aa = aa_model.predict(x_test_aa)
aa_perf = estimate_regressor_performance(y_test.to_numpy().ravel(), y_hat_aa)
print(aa_perf)


In [None]:
%%time
# Combined feature set: place the two feature tables side by side and replace
# the column labels with consecutive integers starting at 1.
x_train_ligprop_aa = pd.concat([x_train_ligprop, x_train_aa], axis="columns")
x_train_ligprop_aa.columns = range(1, len(x_train_ligprop_aa.columns) + 1)

x_test_ligprop_aa = pd.concat([x_test_ligprop, x_test_aa], axis="columns")
x_test_ligprop_aa.columns = range(1, len(x_test_ligprop_aa.columns) + 1)

# Tune and fit on the training split, then report the chosen parameters.
ligprop_aa_model, ligprop_aa_params = build_xgb_regressor(x_train_ligprop_aa, y_train)
print(ligprop_aa_params)

# Score on the test split; the targets are flattened to 1-D to match the
# shape of the predictions.
y_hat_ligprop_aa = ligprop_aa_model.predict(x_test_ligprop_aa)
ligprop_aa_perf = estimate_regressor_performance(
    y_test.to_numpy().ravel(), y_hat_ligprop_aa
)
print(ligprop_aa_perf)


In [None]:
# The 2020 run on original input is a stub of the 2019 data, so zero
# placeholders are recorded for it instead of the measured scores.
if USE_ORIG_INPUT and REAL_YEAR == 2020:
    performance = [(0, 0), (0, 0), (0, 0)]
else:
    performance = [aa_perf, ligprop_perf, ligprop_aa_perf]  # type: ignore

# One row per feature set, one column per metric.
row_labels = ["Amino", "Ligand", "Amino+Ligand"]
metric_labels = ["PearsonR", "RMSE"]
df = pd.DataFrame(performance, index=row_labels, columns=metric_labels)

# Persist the table (row labels included) and echo it to the cell output.
df.to_csv(PERFORMANCE_PATH, index=True, sep=",")
print(df)