In [1]:
train_len = 295246830
one_len = 1589906
zero_len = 293656924
test_len = 1674896
protein_map = {'BRD4': 1, 'HSA': 2, 'sEH': 3}
vocab = {'C': 6825082866, '#': 81527490, '@': 511451694, 'H': 456489972, '=': 1406606874, 'O': 2554179786,
         'N': 2469595230, 'c': 12257477022, '-': 438483636, '.': 216945504, 'l': 491088828, 'B': 123330132,
         'r': 121915914, 'n': 1997759694, 'D': 295246830, 'y': 295246830, 'o': 67918650, 's': 156618468,
         'S': 90662574, 'F': 492710238, '+': 65206260, 'i': 1414026, '/': 11547096, 'I': 23972994}

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, IntegerType, StructType, StructField, ArrayType, DoubleType, StringType
from pyspark.ml.linalg import SparseVector

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StringIndexerModel, OneHotEncoderModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

import os
import pandas as pd
import numpy as np
import joblib

from xgboost.spark import SparkXGBClassifier
from functools import wraps
import xgboost as xgb

from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, rdmolops, AllChem, rdchem, rdEHTTools, rdMolDescriptors
# from tqdm.auto import tqdm
from tqdm import tqdm
from padelpy import from_smiles
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
# # for 256 Gb and 64 Cores
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "48g")  # Increased driver memory
#     .config("spark.executor.memory", "48g")  # Increased executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     # .config("spark.local.dir", "/scratch/23m1521/temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx48g")  # JVM heap size for executors
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark
spark = (
    SparkSession
    .builder
    .appName("leash belka3467")
    .config("spark.driver.memory", "64g")  # Increased driver memory for large jobs
    .config("spark.executor.memory", "64g")  # Increased executor memory
    .config("spark.executor.instances", "32")  # 32 executors
    .config("spark.executor.cores", "2")  # 2 cores per executor
    .config("spark.driver.maxResultSize", "8g")  # Driver result size limit
    .config("spark.local.dir", "temp")  # Ensure high-speed storage
    .config("spark.shuffle.file.buffer", "1024k")  # Larger shuffle buffer for better IO
    .config("spark.memory.fraction", "0.85")  # Increased memory for tasks
    .config("spark.shuffle.memoryFraction", "0.7")  # Increased shuffle memory
    .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
    .master("local[*]")  # Use all 64 cores on the machine
    .getOrCreate()
)
spark


# # GPU
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3467")
#     .config("spark.driver.memory", "64g")
#     .config("spark.executor.memory", "64g")
#     .config("spark.executor.instances", "2")  # 2 executors, one per GPU
#     .config("spark.executor.cores", "32")  # Divide cores equally between executors (64/2)
#     .config("spark.driver.maxResultSize", "8g")
#     .config("spark.local.dir", "temp")
#     .config("spark.shuffle.file.buffer", "1024k")
#     .config("spark.memory.fraction", "0.85")
#     .config("spark.shuffle.memoryFraction", "0.7")
#     .config("spark.executor.javaOptions", "-Xmx64g")
#     .config("spark.executor.resource.gpu.amount", "1") # Assign 1 GPU per executor
#     .config("spark.master", "local[*]") # Important: Use local cluster mode to enable GPU scheduling
#     .getOrCreate()
# )
# spark

# SparkSession for 128 GB RAM and 64 cores
# spark = (
#     SparkSession
#     .builder
#     .appName("Optimized Spark for 128GB RAM and 64 Cores")
#     .config("spark.driver.memory", "64g")  # 64GB for driver memory
#     .config("spark.executor.memory", "64g")  # 64GB for executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor (total = 64 cores)
#     .config("spark.driver.maxResultSize", "8g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Temp directory with enough space
#     .config("spark.shuffle.file.buffer", "512k")  # Increased shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

# SynapseML 
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "48g")  # Increased driver memory
#     .config("spark.executor.memory", "48g")  # Increased executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx48g")  # JVM heap size for executors
#     .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.8")
#     .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "64g")  # Increased driver memory
#     .config("spark.executor.memory", "64g")  # Increased executor memory
#     .config("spark.executor.instances", "8")  # Reduced number of executors
#     .config("spark.executor.cores", "8")  # Increased cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.7")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
#     .config("spark.sql.shuffle.partitions", "1000")  # Increase shuffle partitions
#     .config("spark.ui.enabled", "true")  # Enable Spark UI
#     .master("local[8]")  # Reduced number of cores for local mode
#     .getOrCreate()
# )

# spark


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/04 19:11:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/04 19:11:25 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
datadir = "/home/23m1521/ashish/kaggle/full_feat_tok_df_vectors.parquet"
chunks_path = sorted([os.path.join(datadir, i) for i in os.listdir(datadir) if i.endswith(".parquet")])
total_chunks = len(chunks_path)
print(total_chunks)

200


In [7]:
def load_df_chunk(path):
    return spark.read.format('parquet').load(path)

def add_sample_weights(df):
    class_counts = df.groupBy("binds").count().collect()
    total_count = sum(row["count"] for row in class_counts)
    class_weights = {row["binds"]: total_count / (2 * row["count"]) for row in class_counts}
    return df.withColumn("sample_weights", when(col("binds") == 0, class_weights[0]).when(col("binds") == 1, class_weights[1]))

def get_scale_pos_weight(df):
    class_counts = dict(df.groupBy("binds").count().collect())
    return class_counts[0]/class_counts[1]
    
def make_dataset(df, chunk_df_count):
    def process_row(row):
        return (row['vectors'].toArray(), row['binds'], row['sample_weights'])
    features, labels, weights = [], [], []
    for feature, label, weight in tqdm(df.rdd.map(process_row).toLocalIterator(), total=chunk_df_count):
        features.append(feature)
        labels.append(label)
        weights.append(weight)
    return features, labels, weights

def make_dataset2(df, test=False):
    df = df.toPandas()
    df.vectors = df.vectors.map(lambda x: x.toArray())
    if test == True:
        return df.id.values, np.array([i for i in df.vectors.values]), None, None
    return df.id.values, np.array([i for i in df.vectors.values]), df.binds.values, df.sample_weights.values

def save_checkpoint(model, params, i, evals_result,  path, save_name):
    os.makedirs(path, exist_ok=True)
    model.save_model(os.path.join(path, f"{save_name}.json"))
    joblib.dump({"params": params, 'i': i, "evals_result": evals_result}, os.path.join(path, f"{save_name}_params.joblib"))
    print("Model saved at", path)

def load_checkpoint(path, save_name):
    model = xgb.Booster()
    model.load_model(os.path.join(path, f"{save_name}.json"))
    ckpt = joblib.load(os.path.join(path, f"{save_name}_params.joblib"))
    params, i = ckpt['params'], ckpt['i']
    print("Model loaded from", path)
    return model, params, i

def train_xgb(dmatrix, xgb_model=None):
    lr = [0.1, 0.07, 0.04, 0.01, 0.007]
    best_params1 = {
    'objective': 'binary:logistic',
    'eval_metric': ['logloss', 'aucpr'],
    'subsample': 1.0, 
    'rate_drop': 0.4,
    'skip_drop': 0.5,
    'min_child_weight': 3, 
    'max_depth': 3, 
    'lambda': 5, 
    'gamma': 0, 
    'eta': lr[0], 
    'seed': 42,
    'colsample_bytree': 1.0, 
    'alpha': 5,
    'device': 'cuda',
    # 'device': 'cpu'
    }
    evals_result = {'train': {'logloss': [], 'aucpr': []}}
    
    bst = xgb.train(
        best_params1, 
        dmatrix, 
        num_boost_round=100,
        evals=[(dmatrix, 'train')], 
        evals_result=evals_result, 
        verbose_eval=False,
        xgb_model=xgb_model
        # early_stopping_rounds=500,
        )
    return bst, evals_result, best_params1

def delete_df_chunk(df):
    df.unpersist()
    del df

def spark_suppress_logs(level="ERROR", reset_level="INFO"):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            spark.sparkContext.setLogLevel(level)
            try:
                return func(*args, **kwargs)
            finally:
                spark.sparkContext.setLogLevel(reset_level)
        return wrapper
    return decorator


@spark_suppress_logs()
def incrementally_train():
    with tqdm(total=total_chunks, dynamic_ncols=True) as pbar1:
        sample_count = 0
        xgb_model = None
        ckpt_dir = "Incrementally_train_XGB_ckpt"
        
        for i, chunk_path in enumerate(chunks_path):
            # if i > 1:
            #     break
        
            pbar1.set_description(f"Chunk: {i+1}")
            
            
            # --- Load chunk --------------------------------------
            chunk_df = load_df_chunk(chunk_path)
            chunk_df = add_sample_weights(chunk_df)
            chunk_df = chunk_df.repartition(1)
            chunk_df_count = chunk_df.count()
            
            # --- Getting Dataset ---------------------------------
            _, features, labels, weights = make_dataset2(chunk_df)
            dtrain = xgb.DMatrix(data=features, label=labels, weight=weights, nthread=25)
            
            # # --- Train ------------------------------------------
            if xgb_model is None:
                xgb_model, evals_result, params = train_xgb(dtrain)
            else:
                xgb_model, evals_result, params = train_xgb(dtrain, xgb_model)
            save_checkpoint(xgb_model, params, i, evals_result, ckpt_dir, f"_{i+1}_ckpt")
            
            # # --- Model Evaluation -------------------------------
            loss = np.mean(evals_result['train']['logloss'])
            aucpr = np.mean(evals_result['train']['aucpr'])
            print(f"Chunk {i+1} trained. Loss: {loss}, AUCPR: {aucpr}")
            
            
            # --- Clean up ----------------------------------------
            # delete_df_chunk(chunk_df)
            del chunk_df, features, labels, weights, dtrain, _
            sample_used = sample_count + chunk_df_count
            sample_used_percentage = (sample_used / train_len) * 100
            remaining_samples = train_len - sample_used
            remaining_samples_percentage = (remaining_samples / train_len) * 100
            pbar1.set_postfix_str(
                f"{sample_used} ({sample_used_percentage:.2f}%) samples used," 
                f"{remaining_samples} ({remaining_samples_percentage:.2f}%) samples remaining"
            )
            pbar1.update(1)
            sample_count += chunk_df_count
            
        # --- Save final model -----------------------------------
        save_checkpoint(xgb_model, params, i, evals_result, ckpt_dir, f"Final_ckpt")


@spark_suppress_logs()
def incrementally_inference(model, chunks_path, save_dir, test=False):
    with tqdm(total=total_chunks, dynamic_ncols=True) as pbar1:
        sample_count = 0
        os.makedirs(save_dir, exist_ok=True)
        # id_array = np.array([], dtype=np.uint64)
        # preds_array = np.array([], dtype=np.float128)
        id_array = np.array([])
        preds_array = np.array([])
        
        for i, chunk_path in enumerate(chunks_path):
            pbar1.set_description(f"Chunk: {i+1}")
            
            
            # --- Load chunk --------------------------------------
            chunk_df = load_df_chunk(chunk_path)
            # chunk_df = add_sample_weights(chunk_df)
            chunk_df = chunk_df.repartition(1)
            chunk_df_count = chunk_df.count()
            
            # --- Getting Dataset ---------------------------------
            ids, features, labels, weights = make_dataset2(chunk_df, test=True)
            dtest = xgb.DMatrix(data=features, nthread=25)
            
            # # --- Predict ---------------------------------------
            preds = model.predict(dtest)

            preds_array = np.concatenate([preds_array, preds])
            id_array = np.concatenate([id_array, ids])
            
            # --- Save predictions -------------------------------
            chunk_save_path = os.path.join(save_dir, f"chunk_{i + 1}.npz")
            if test:
                np.savez(chunk_save_path, ids=ids, preds=preds)
            else:
                np.savez(chunk_save_path, ids=ids, labels=labels, preds=preds)
    
            
            # --- Clean up ----------------------------------------
            del chunk_df, features, labels, weights, dtest, ids, preds
            sample_used = sample_count + chunk_df_count
            sample_used_percentage = (sample_used / train_len) * 100
            remaining_samples = train_len - sample_used
            remaining_samples_percentage = (remaining_samples / train_len) * 100
            pbar1.set_postfix_str(
                f"{sample_used} ({sample_used_percentage:.2f}%) samples used," 
                f"{remaining_samples} ({remaining_samples_percentage:.2f}%) samples remaining"
            )
            pbar1.update(1)
            sample_count += chunk_df_count
            
    if test:
        return {'id': id_array, 'binds': preds_array}


def load_npzs(d):
    id_array = np.array([], dtype=np.uint64)
    preds_array = np.array([], dtype=np.float128)
    if os.path.exists(d):
        for f in os.listdir(d):
            if f.endswith(".npz"):
                try:
                    with np.load(os.path.join(d, f)) as data:
                        preds_array = np.concatenate([preds_array, data["preds"]])
                        id_array = np.concatenate([id_array, data["ids"]])
                except: pass
    return id_array, preds_array


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand

def stratified_split(df, stratify_col, train_ratio=0.8, seed=None):
    if stratify_col not in df.columns:
        print(f"Stratify column '{stratify_col}' not found in DataFrame.")
        return None

    train_df = None
    test_df = None

    for value in df.select(stratify_col).distinct().collect():
        value = value[0]  # Extract the actual value from the Row object
        
        # Sample a fraction of the data for the current value
        sampled_df = df.filter(col(stratify_col) == value).sample(withReplacement=False, fraction=train_ratio, seed=seed)
        
        # Create the test set by excluding the sampled data
        remaining_df = df.filter(~col("id").isin([row.id for row in sampled_df.select("id").collect()]) if "id" in df.columns else ~df.isin(sampled_df))

        if train_df is None:
            train_df = sampled_df
            test_df = remaining_df
        else:
            train_df = train_df.union(sampled_df)
            test_df = test_df.union(remaining_df)
    return train_df, test_df

In [5]:
# spark.sparkContext.setLogLevel("ERROR")
# incrementally_train()
# spark.sparkContext.setLogLevel("INFO")

In [6]:
# # --- Load chunk --------------------------------------
# chunk_df = load_df_chunk(chunks_path[1])
# train_df, test_df = chunk_df.randomSplit([0.8, 0.2], seed=42)
# train_df, test_df = add_sample_weights(train_df), add_sample_weights(test_df)
# train_df, test_df = train_df.repartition(1), test_df.repartition(1)
# train_df_count, test_df_count = train_df.count(), test_df.count()
# print(train_df_count, test_df_count)

# class_counts = dict(chunk_df.groupBy("binds").count().collect())
# scale_pos_weight = class_counts[0]/class_counts[1]
# print(scale_pos_weight, class_counts)

# # --- Getting Dataset ---------------------------------
# train_ids, train_features, train_labels, train_weights = make_dataset2(train_df)
# test_ids, test_features, test_labels, test_weights = make_dataset2(test_df)

In [7]:
# --- Load chunk --------------------------------------
chunk_df = load_df_chunk(chunks_path[1])
chunk_df = add_sample_weights(chunk_df)
chunk_df = chunk_df.repartition(1)
chunk_df_count = chunk_df.count()
print(chunk_df_count)

class_counts = dict(chunk_df.groupBy("binds").count().collect())
scale_pos_weight = class_counts[0]/class_counts[1]
print(scale_pos_weight, class_counts)

# --- Getting Dataset ---------------------------------
ids, features, labels, weights = make_dataset2(chunk_df)
# dtrain = xgb.DMatrix(data=features, label=labels, weight=weights, nthread=25)

                                                                                

1476728
186.8549802824068 {0: 1468867, 1: 7861}


                                                                                

In [8]:
STUDY_NAME = f"XGB_HPC"
STUDY_NAME

'XGB_HPC'

In [9]:
!rm -vrf "/home/23m1521/ashish/kaggle/db_XGB_HPC.sqlite3"

removed '/home/23m1521/ashish/kaggle/db_XGB_HPC.sqlite3'


In [10]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score #use average precision score
import xgboost as xgb
import numpy as np

def objective(trial):
    # Define hyperparameter search space for Optuna using the same values as before
    max_depth = trial.suggest_int("max_depth", 3, 19, step=2)  # Corrected range for step=2
    eta = trial.suggest_categorical("eta", [0.01, 0.05, 0.1, 0.2, 0.3, 0.5])
    subsample = trial.suggest_categorical("subsample", [0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    colsample_bytree = trial.suggest_categorical("colsample_bytree", [0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    gamma = trial.suggest_categorical("gamma", [0, 0.1, 0.2, 0.5, 1.0])
    min_child_weight = trial.suggest_categorical("min_child_weight", [1, 3, 5, 7, 10])
    lambda_val = trial.suggest_categorical("lambda", [0, 1, 5, 10])
    alpha = trial.suggest_categorical("alpha", [0, 1, 5, 10])
    n_estimators = trial.suggest_categorical("n_estimators", [50, 100, 500, 1000, 3000])


    # Stratified K-Fold for Cross-Validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Evaluate the model using cross-validation using average precision and DMatrix
    aucpr_score = 0
    for train_index, test_index in cv.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # Create DMatrices
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)

        # XGBoost parameters (using trial suggestions)
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'aucpr',
            'seed': 42,
            'scale_pos_weight': scale_pos_weight,
            'tree_method': 'hist',  # Ensures GPU acceleration
            'nthread': -1,
            'max_depth': max_depth,
            'eta': eta,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'gamma': gamma,
            'min_child_weight': min_child_weight,
            'lambda': lambda_val,
            'alpha': alpha,
            'device': 'cuda:1'
        }

        evals_result = {'test': {'aucpr': []}}
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=n_estimators,
            evals=[(dtest, 'test')],
            evals_result=evals_result,
            verbose_eval=False
        )

        aucpr_score += np.mean(evals_result['test']['aucpr'])
        
        del X_train, X_test, y_train, y_test, dtrain, dtest, bst, evals_result

    aucpr_score /= cv.n_splits

    # Return the negative average precision for minimization (minimize error)
    return -aucpr_score


study = optuna.create_study(storage=f"sqlite:///db_{STUDY_NAME}.sqlite3",
                            sampler=optuna.samplers.RandomSampler(seed=42),
                            # pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
                            study_name=STUDY_NAME,
                            load_if_exists=True,
                            direction='minimize')
study.optimize(objective, 
               n_trials=100,
               timeout=None,
               n_jobs = 10,
               gc_after_trial=True,
               show_progress_bar = True)

# Print the best parameters and score
print("Best Parameters:", study.best_params)
print("Best Score (Negative Average Precision):", study.best_value)

[I 2025-01-03 06:00:20,387] A new study created in RDB with name: XGB_HPC


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-01-03 06:09:11,185] Trial 5 finished with value: -0.26471143980647943 and parameters: {'max_depth': 5, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.9, 'gamma': 1.0, 'min_child_weight': 5, 'lambda': 5, 'alpha': 5, 'n_estimators': 500}. Best is trial 5 with value: -0.26471143980647943.
[I 2025-01-03 06:11:07,183] Trial 8 finished with value: -0.35034939755464173 and parameters: {'max_depth': 13, 'eta': 0.1, 'subsample': 0.9, 'colsample_bytree': 0.6, 'gamma': 0, 'min_child_weight': 5, 'lambda': 10, 'alpha': 1, 'n_estimators': 100}. Best is trial 8 with value: -0.35034939755464173.
[I 2025-01-03 06:17:13,610] Trial 1 finished with value: -0.33134229471177273 and parameters: {'max_depth': 5, 'eta': 0.5, 'subsample': 0.9, 'colsample_bytree': 1.0, 'gamma': 1.0, 'min_child_weight': 1, 'lambda': 0, 'alpha': 10, 'n_estimators': 1000}. Best is trial 8 with value: -0.35034939755464173.
[I 2025-01-03 06:17:48,087] Trial 9 finished with value: -0.3145836479253691 and parameters: {'ma

In [None]:
# Get the best model
best_params = study.best_params
best_xgb_clf = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='aucpr',
        seed=42,
        scale_pos_weight=scale_pos_weight,
        device='cuda',
        tree_method='hist',  # Ensures GPU acceleration
        use_label_encoder=False,
        n_jobs=10,
        **best_params
)
best_xgb_clf.fit(features, labels)

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10, 15, 20],
    'eta': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5, 1.0],
    'min_child_weight': [1, 3, 5, 7, 10],
    'lambda': [0, 1, 5, 10],
    'alpha': [0, 1, 5, 10],
    'n_estimators': [50, 100, 500, 1000, 3000],
}

# XGBClassifier with GPU-specific settings
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='prauc',
    seed=42,
    scale_pos_weight=scale_pos_weight,
    device='cuda',
    tree_method='hist',  # Ensures GPU acceleration
    # predictor='gpu_predictor',  # GPU prediction
    n_jobs=10
)

# RandomizedSearchCV setup
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    scoring='average_precision',
    n_iter=100,  # Number of parameter settings sampled
    cv=5,       # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=10
)

# Fit the model
random_search.fit(features, labels)

# Print the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

## Model Evaluation

In [6]:
# full_feat_tok_df_vectors = spark.read.format('parquet').load(chunks_path[1])
# print(full_feat_tok_df_vectors.count())
# full_feat_tok_df_vectors.show()

In [8]:
params = joblib.load("Incrementally_train_XGB2_ckpt/Final_ckpt_params.joblib")['params']
model = xgb.Booster(params=params)
model.load_model("Incrementally_train_XGB2_ckpt/Final_ckpt.json")

In [9]:
full_preds = incrementally_inference(model, chunks_path, save_dir="Incrementally_Inference3_Preds", test=False)

Parameters: { "num_boost_round", "rate_drop", "skip_drop" } are not used.

Chunk: 200: 100%|██████████| 200/200 [7:26:50<00:00, 134.05s/it, 295246830 (100.00%) samples used,0 (0.00%) samples remaining]          


: 

In [54]:
chunk_pred_df = pd.DataFrame(chunk_preds)
chunk_pred_df

Unnamed: 0,id,binds
0,65.0,0.152128
1,191.0,0.017978
2,418.0,0.010289
3,541.0,0.069637
4,558.0,0.064629
...,...,...
1476723,295246296.0,0.370344
1476724,295246331.0,0.045317
1476725,295246454.0,0.609166
1476726,295246584.0,0.015393


In [57]:
chunk_pred_df['id'] = chunk_pred_df['id'].astype(np.int64)
chunk_pred_df = chunk_pred_df.sort_values(by=['id'])
chunk_pred_df

Unnamed: 0,id,binds
0,65,0.152128
1,191,0.017978
2,418,0.010289
3,541,0.069637
4,558,0.064629
...,...,...
1476723,295246296,0.370344
1476724,295246331,0.045317
1476725,295246454,0.609166
1476726,295246584,0.015393


## Test Inference

In [5]:
test_feat_tok_df_vectors = spark.read.format('parquet').load("test_feat_tok_df_vectors.parquet")
print(test_feat_tok_df_vectors.count())
test_feat_tok_df_vectors.show()

1674896


                                                                                

+---------+--------------------+
|       id|             vectors|
+---------+--------------------+
|295246852|(190,[0,1,2,3,4,5...|
|295246961|(190,[0,1,2,3,4,5...|
|295247142|(190,[0,1,2,3,4,5...|
|295247169|(190,[0,1,2,3,4,5...|
|295247204|(190,[0,1,2,3,4,5...|
|295247213|(190,[0,1,2,3,4,5...|
|295247329|(190,[0,1,2,3,4,5...|
|295247347|(190,[0,1,2,3,4,5...|
|295247378|(190,[0,1,2,3,4,5...|
|295247397|(190,[0,1,2,3,4,5...|
|295247414|(190,[0,1,2,3,4,5...|
|295247424|(190,[0,1,2,3,4,5...|
|295247425|(190,[0,1,2,3,4,5...|
|295247435|(190,[0,1,2,3,4,5...|
|295247608|(190,[0,1,2,3,4,5...|
|295247672|(190,[0,1,2,3,4,5...|
|295247725|(190,[0,1,2,3,4,5...|
|295247799|(190,[0,1,2,3,4,5...|
|295247807|(190,[0,1,2,3,4,5...|
|295247924|(190,[0,1,2,3,4,5...|
+---------+--------------------+
only showing top 20 rows



In [6]:
params = joblib.load("Incrementally_train_XGB2_ckpt/Final_ckpt_params.joblib")['params']
model = xgb.Booster(params=params)
model.load_model("Incrementally_train_XGB2_ckpt/Final_ckpt.json")

In [17]:
full_preds = incrementally_inference(model, ["test_feat_tok_df_vectors.parquet"], save_dir="Incrementally_Inference_Test_Preds", test=True)

Chunk: 1:   0%|          | 1/200 [02:23<7:55:40, 143.42s/it, 1674896 (0.57%) samples used,293571934 (99.43%) samples remaining]


In [19]:
full_preds['id'][0], full_preds['binds'][0]

(295246852.0, 3.3264886071388000133e-12)

In [20]:
chunk_pred_df = pd.DataFrame(full_preds)
chunk_pred_df

Unnamed: 0,id,binds
0,295246852.0,3.326489e-12
1,295246961.0,6.416114e-18
2,295247142.0,1.403722e-10
3,295247169.0,9.945667e-10
4,295247204.0,1.951050e-11
...,...,...
1674891,296921421.0,3.620435e-23
1674892,296921567.0,1.376455e-07
1674893,296921570.0,4.055630e-04
1674894,296921682.0,4.409989e-07


In [21]:
chunk_pred_df['id'] = chunk_pred_df['id'].astype(np.int64)
chunk_pred_df = chunk_pred_df.sort_values(by=['id'])
chunk_pred_df

Unnamed: 0,id,binds
1532514,295246830,2.180438e-04
200651,295246831,1.023393e-05
852914,295246832,1.461992e-09
401038,295246833,6.264513e-13
852915,295246834,4.745824e-17
...,...,...
1129744,296921721,7.499293e-15
150626,296921722,2.210206e-18
325936,296921723,6.274705e-12
526651,296921724,5.774928e-13


In [22]:
def getPredictions(df, model):
    predictions = model.transform(df).select("id", "prediction", "probability").orderBy('id')
    predictions.show(truncate=False)
    return predictions

def makeSubmission(
    test_prob, 
    file_name,
    message
):
    import subprocess, os    
    os.makedirs("submission_csv", exist_ok=True)

    sub_df = pd.read_csv('sample_submission.csv.zip')
    sub_df.binds = test_prob
    sub_df.to_csv(file_name, index=False)
    display(pd.read_csv(file_name))
    
    command = [
        "kaggle", "competitions", "submit",
        "-c", "leash-BELKA",
        "-f", file_name,
        "-m", message
    ]
    
    subprocess.run(command)

In [23]:
makeSubmission(
    test_prob=chunk_pred_df.binds.values,
    file_name= f"submission_csv/_6_sub_XGBoost_Incrementally-HPS.csv",
    message = f"XGBoost Incrementally GPU with HPS"
)

Unnamed: 0,id,binds
0,295246830,2.180438e-04
1,295246831,1.023393e-05
2,295246832,1.461992e-09
3,295246833,6.264513e-13
4,295246834,4.745824e-17
...,...,...
1674891,296921721,7.499293e-15
1674892,296921722,2.210206e-18
1674893,296921723,6.274705e-12
1674894,296921724,5.774928e-13


100%|██████████| 55.8M/55.8M [00:16<00:00, 3.62MB/s]   


Successfully submitted to NeurIPS 2024 - Predict New Medicines with BELKA