# PyTorch Test Inference

In [None]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Show the CPU count reported by the OS next to the selected device.
print(os.cpu_count(), device)

In [34]:
class BinaryClassifier(nn.Module):
    """Feed-forward binary classifier ending in a sigmoid.

    Architecture: ``input_dim -> 300 -> 64 -> 1``. Each hidden ``Linear`` is
    followed by ``ReLU`` and ``Dropout(0.3)``; the final ``Linear`` is followed
    by ``Sigmoid``. All layers are held in ``self.model`` (an
    ``nn.Sequential``), so parameters are named ``model.<index>.weight/bias``.
    """

    def __init__(self, input_dim):
        super().__init__()

        hidden_sizes = (300, 64)
        layers = []
        in_features = input_dim
        # Hidden stack: Linear -> ReLU -> Dropout for every hidden width.
        for out_features in hidden_sizes:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.3))
            in_features = out_features

        # Output head: a single unit squashed through a sigmoid.
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return its output."""
        return self.model(x)
        
# 99 inputs = 3 one-hot protein columns + 96 token columns (a1..d24).
input_dim = 99
model = BinaryClassifier(input_dim).to(device)

checkpoint_path = "checkpoints/_2_PyTorch.pth"
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. PyTorch warns when `weights_only` is not passed; consider
# weights_only=True once the checkpoint contents are confirmed to be compatible.
checkpoint = torch.load(checkpoint_path, map_location=device)

state_dict = checkpoint['model_state_dict']
# Strip only a *leading* "module." (presumably added by a DataParallel-style
# wrapper during training). A plain str.replace would also rewrite keys that
# merely contain "module." somewhere in the middle.
wrapper_prefix = "module."
new_state_dict = {
    (key[len(wrapper_prefix):] if key.startswith(wrapper_prefix) else key): value
    for key, value in state_dict.items()
}

model.load_state_dict(new_state_dict)
# Switch to evaluation mode (disables dropout); the returned module is displayed.
model.eval()

  checkpoint = torch.load(checkpoint_path, map_location=device)


BinaryClassifier(
  (model): Sequential(
    (0): Linear(in_features=99, out_features=300, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=300, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=64, out_features=1, bias=True)
    (7): Sigmoid()
  )
)

In [14]:
# Load the test feature table from Parquet and display it.
test_df = pd.read_parquet('test_features.parquet')
test_df

Unnamed: 0,id,protein,a1,a2,a3,a4,a5,a6,a7,a8,...,d16,d17,d18,d19,d20,d21,d22,d23,d24,y
0,295246830,1,10,2,2,11,1,0,0,1,...,0,0,0,0,0,0,0,0,0,2
1,295246831,2,10,2,2,11,1,0,0,1,...,0,0,0,0,0,0,0,0,0,2
2,295246832,3,10,2,2,11,1,0,0,1,...,0,0,0,0,0,0,0,0,0,2
3,295246833,1,10,2,3,12,1,0,0,1,...,0,0,0,0,0,0,0,0,0,2
4,295246834,2,10,2,3,12,1,0,0,1,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674891,296921721,2,8,0,9,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1674892,296921722,3,8,0,9,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1674893,296921723,1,8,0,1,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1674894,296921724,2,8,0,1,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [15]:
# One-hot encode `protein`; get_dummies appends the protein_* indicator columns.
test_df = pd.get_dummies(test_df, columns=['protein'], dtype=int)

# Reorder to: id, protein_* indicators, then every remaining column in its
# existing order. Columns are chosen by name rather than by position so the
# layout does not silently break if the number of protein categories changes
# or `id` is not the first column.
protein_cols = [name for name in test_df.columns if name.startswith('protein_')]
other_cols = [name for name in test_df.columns if name != 'id' and name not in protein_cols]
test_df = test_df[['id', *protein_cols, *other_cols]]
test_df

Unnamed: 0,id,protein_1,protein_2,protein_3,a1,a2,a3,a4,a5,a6,...,d16,d17,d18,d19,d20,d21,d22,d23,d24,y
0,295246830,1,0,0,10,2,2,11,1,0,...,0,0,0,0,0,0,0,0,0,2
1,295246831,0,1,0,10,2,2,11,1,0,...,0,0,0,0,0,0,0,0,0,2
2,295246832,0,0,1,10,2,2,11,1,0,...,0,0,0,0,0,0,0,0,0,2
3,295246833,1,0,0,10,2,3,12,1,0,...,0,0,0,0,0,0,0,0,0,2
4,295246834,0,1,0,10,2,3,12,1,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674891,296921721,0,1,0,8,0,9,14,0,0,...,0,0,0,0,0,0,0,0,0,2
1674892,296921722,0,0,1,8,0,9,14,0,0,...,0,0,0,0,0,0,0,0,0,2
1674893,296921723,1,0,0,8,0,1,6,0,0,...,0,0,0,0,0,0,0,0,0,2
1674894,296921724,0,1,0,8,0,1,6,0,0,...,0,0,0,0,0,0,0,0,0,2


In [16]:
# Keep every column except the last one (`y`): the id plus the model inputs.
X_test = test_df.iloc[:, :-1]
X_test

Unnamed: 0,id,protein_1,protein_2,protein_3,a1,a2,a3,a4,a5,a6,...,d15,d16,d17,d18,d19,d20,d21,d22,d23,d24
0,295246830,1,0,0,10,2,2,11,1,0,...,0,0,0,0,0,0,0,0,0,0
1,295246831,0,1,0,10,2,2,11,1,0,...,0,0,0,0,0,0,0,0,0,0
2,295246832,0,0,1,10,2,2,11,1,0,...,0,0,0,0,0,0,0,0,0,0
3,295246833,1,0,0,10,2,3,12,1,0,...,0,0,0,0,0,0,0,0,0,0
4,295246834,0,1,0,10,2,3,12,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674891,296921721,0,1,0,8,0,9,14,0,0,...,0,0,0,0,0,0,0,0,0,0
1674892,296921722,0,0,1,8,0,9,14,0,0,...,0,0,0,0,0,0,0,0,0,0
1674893,296921723,1,0,0,8,0,1,6,0,0,...,0,0,0,0,0,0,0,0,0,0
1674894,296921724,0,1,0,8,0,1,6,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Identifiers go into a long tensor; every column after the first (id) becomes
# the float32 feature matrix fed to the model.
ids = torch.tensor(X_test["id"].to_numpy(), dtype=torch.long)
X = torch.tensor(X_test.to_numpy()[:, 1:], dtype=torch.float32)

# Pair each id with its feature row so both travel through the loader together.
dataset = TensorDataset(ids, X)

# Peek at the first sample and the length of its feature vector.
first_id, first_features = dataset[0]
(first_id, first_features), first_features.shape

((tensor(295246830),
  tensor([ 1.,  0.,  0., 10.,  2.,  2., 11.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,
           1.,  1.,  0.,  0.,  1.,  2.,  1.,  1.,  3.,  4.,  0.,  0.,  1.,  1.,
           1.,  1.,  4., 12.,  6.,  6., 15.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  3.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.])),
 torch.Size([99]))

In [18]:
# Iterate the test set in its original order, in batches of 1024 rows.
dataloader = DataLoader(
    dataset,
    batch_size=1024,           # Rows per batch
    shuffle=False,            # Keep the original row order (no shuffling)
    num_workers=os.cpu_count(),           # One loading subprocess per CPU reported by the OS
    pin_memory=True,         # Pin host memory for faster transfer to GPU
    drop_last=False,          # Keep the final, smaller batch
    prefetch_factor=2,       # Batches prefetched by each worker
    persistent_workers=True  # Keep workers alive after the loader is exhausted
)

In [54]:
# Gather per-batch outputs in lists and concatenate once at the end; growing a
# tensor with concatenate on every iteration re-copies all earlier results.
id_chunks = []
prob_chunks = []

with torch.inference_mode():
    for ids_batch, X_batch in tqdm(dataloader, total=len(dataloader)):
        X_batch = X_batch.to(device)
        # The model emits one column per row. reshape(-1) flattens it and keeps
        # a one-row batch 1-D (squeeze() would produce a 0-d tensor), and
        # .cpu() brings the result back to the host so it can be combined with
        # the CPU-resident ids when `device` is 'cuda'.
        probs_batch = model(X_batch).reshape(-1).cpu()

        id_chunks.append(ids_batch)
        prob_chunks.append(probs_batch)

    # Same structure as before: a dict of two 1-D CPU tensors.
    results = {
        'id': torch.cat(id_chunks) if id_chunks else torch.tensor([], dtype=torch.long),
        'probs': torch.cat(prob_chunks) if prob_chunks else torch.tensor([]),
    }

100%|█████████████████████████████████████████████| 1636/1636 [00:04<00:00, 361.87it/s]


In [69]:
# Turn the collected ids/probabilities into a table; `probs` is renamed to
# `binds` to match the column name used by the sample submission.
inference_df = pd.DataFrame(results).rename(columns={'probs': 'binds'})
inference_df

Unnamed: 0,id,binds
0,295246830,0.001360
1,295246831,0.001941
2,295246832,0.000182
3,295246833,0.001513
4,295246834,0.002237
...,...,...
1674891,296921721,0.000326
1674892,296921722,0.000165
1674893,296921723,0.000977
1674894,296921724,0.001435


In [68]:
# Load the sample submission (columns: id, binds) for reference.
sub_df = pd.read_csv('sample_submission.csv.zip')
sub_df

Unnamed: 0,id,binds
0,295246830,0.5
1,295246831,0.5
2,295246832,0.5
3,295246833,0.5
4,295246834,0.5
...,...,...
1674891,296921721,0.5
1674892,296921722,0.5
1674893,296921723,0.5
1674894,296921724,0.5


In [70]:
import os
import subprocess

# NOTE(review): the file name says "4.32" but the message says "4.34" —
# confirm which value is correct; both strings are kept as they were.
file_name = "submission_csv/_3_submission_PyTorch_4.32.csv"
message = "PyTorch Epoch: 4.34"
# Create the directory that file_name actually points into, rather than a
# separately hard-coded folder name that could drift out of sync.
os.makedirs(os.path.dirname(file_name) or ".", exist_ok=True)

inference_df.to_csv(file_name, index=False)
# Read the file back so the displayed table is exactly what gets uploaded.
display(pd.read_csv(file_name))

command = [
    "kaggle", "competitions", "submit",
    "-c", "leash-BELKA",
    "-f", file_name,
    "-m", message
]

# check=True raises CalledProcessError when the kaggle CLI exits non-zero, so a
# failed upload cannot pass unnoticed.
subprocess.run(command, check=True)

Unnamed: 0,id,binds
0,295246830,0.001360
1,295246831,0.001941
2,295246832,0.000182
3,295246833,0.001513
4,295246834,0.002237
...,...,...
1674891,296921721,0.000326
1674892,296921722,0.000165
1674893,296921723,0.000977
1674894,296921724,0.001435


100%|██████████| 36.7M/36.7M [00:09<00:00, 3.91MB/s]  


Successfully submitted to NeurIPS 2024 - Predict New Medicines with BELKA

CompletedProcess(args=['kaggle', 'competitions', 'submit', '-c', 'leash-BELKA', '-f', 'submission_csv/_3_submission_PyTorch_4.32.csv', '-m', 'PyTorch Epoch: 4.34'], returncode=0)

# PySpark Test Inference

In [1]:
# Dataset-level constants.
# Consistency check: one_len + zero_len == train_len
# (1,589,906 + 293,656,924 = 295,246,830).
# NOTE(review): presumably train_len is the training row count and
# one_len / zero_len are the positive / negative label counts — confirm.
train_len = 295246830
one_len = 1589906
zero_len = 293656924
# Integer code assigned to each protein name.
protein_map = {'BRD4': 1, 'HSA': 2, 'sEH': 3}
# Count per character. NOTE(review): presumably SMILES character frequencies
# over the training set — confirm. 'D' and 'y' both equal train_len.
vocab = {'C': 6825082866, '#': 81527490, '@': 511451694, 'H': 456489972, '=': 1406606874, 'O': 2554179786,
         'N': 2469595230, 'c': 12257477022, '-': 438483636, '.': 216945504, 'l': 491088828, 'B': 123330132,
         'r': 121915914, 'n': 1997759694, 'D': 295246830, 'y': 295246830, 'o': 67918650, 's': 156618468,
         'S': 90662574, 'F': 492710238, '+': 65206260, 'i': 1414026, '/': 11547096, 'I': 23972994}

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import (LongType, IntegerType, 
StructType, StructField, ArrayType, DoubleType, StringType)
from pyspark.ml.linalg import SparseVector, DenseVector

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StringIndexerModel, OneHotEncoderModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel

import pandas as pd
import numpy as np
import joblib

from xgboost.spark import SparkXGBClassifier

from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, rdmolops, AllChem, rdchem, rdEHTTools, rdMolDescriptors
from tqdm.auto import tqdm
from padelpy import from_smiles
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
# # for 256 Gb and 64 Cores
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "48g")  # Increased driver memory
#     .config("spark.executor.memory", "48g")  # Increased executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     # .config("spark.local.dir", "/scratch/23m1521/temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx48g")  # JVM heap size for executors
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark
# Active Spark session, running in local mode.
# NOTE(review): with a local[*] master the work runs inside the driver JVM, so
# the spark.executor.* settings below likely have no effect — confirm against
# the Spark configuration documentation.
spark = (
    SparkSession
    .builder
    .appName("leash belka343455")
    .config("spark.driver.memory", "64g")  # Driver JVM memory
    .config("spark.executor.memory", "64g")  # Executor memory
    .config("spark.executor.instances", "32")  # 32 executors
    .config("spark.executor.cores", "2")  # 2 cores per executor
    .config("spark.driver.maxResultSize", "8g")  # Driver result size limit
    .config("spark.local.dir", "temp")  # Scratch directory (relative path "temp")
    .config("spark.shuffle.file.buffer", "1024k")  # Shuffle file buffer size
    .config("spark.memory.fraction", "0.85")  # Heap fraction for execution and storage
    .config("spark.shuffle.memoryFraction", "0.7")  # NOTE(review): legacy memory setting; probably ignored by current Spark — confirm
    .config("spark.executor.javaOptions", "-Xmx64g")  # NOTE(review): not a documented key (documented one is spark.executor.extraJavaOptions, which may not set -Xmx) — confirm or remove
    .master("local[*]")  # Local mode with as many worker threads as logical cores
    .getOrCreate()
)
spark

# SparkSession for 128 GB RAM and 64 cores
# spark = (
#     SparkSession
#     .builder
#     .appName("Optimized Spark for 128GB RAM and 64 Cores")
#     .config("spark.driver.memory", "64g")  # 64GB for driver memory
#     .config("spark.executor.memory", "64g")  # 64GB for executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor (total = 64 cores)
#     .config("spark.driver.maxResultSize", "8g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Temp directory with enough space
#     .config("spark.shuffle.file.buffer", "512k")  # Increased shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

# SynapseML 
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "48g")  # Increased driver memory
#     .config("spark.executor.memory", "48g")  # Increased executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx48g")  # JVM heap size for executors
#     .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.8")
#     .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "64g")  # Increased driver memory
#     .config("spark.executor.memory", "64g")  # Increased executor memory
#     .config("spark.executor.instances", "8")  # Reduced number of executors
#     .config("spark.executor.cores", "8")  # Increased cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.7")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
#     .config("spark.sql.shuffle.partitions", "1000")  # Increase shuffle partitions
#     .config("spark.ui.enabled", "true")  # Enable Spark UI
#     .master("local[8]")  # Reduced number of cores for local mode
#     .getOrCreate()
# )

# spark


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/02 22:49:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/02 22:49:22 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/01/02 22:49:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Descriptors Dataset

In [3]:
# Load the descriptor features and drop the raw SMILES string columns.
test_df_features = spark.read.format('parquet').load('test_descriptors.parquet').drop(
    'bb1_smiles', 'bb2_smiles', 'bb3_smiles', 'molecule_smiles'
)
# Print the first rows of the loaded table.
test_df_features.show()

25/01/02 22:49:25 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---------+------------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+------------------+-----------------+--------------------+------------------+---------------------+--------------+-

## Tokens Dataset

In [4]:
# Load the token-count features (id, protein, a1..d24, y) and print the first rows.
test_tokens_df = spark.read.format('parquet').load('test_features.parquet')
test_tokens_df.show()



+---------+-------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|       id|protein| a1| a2| a3| a4| a5| a6| a7| a8| a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|a19|a20|a21|a22|a23|a24| b1| b2| b3| b4| b5| b6| b7| b8| b9|b10|b11|b12|b13|b14|b15|b16|b17|b18|b19|b20|b21|b22|b23|b24| c1| c2| c3| c4| c5| c6| c7| c8| c9|c10|c11|c12|c13|c14|c15|c16|c17|c18|c19|c20|c21|c22|c23|c24| d1| d2| d3| d4| d5| d6| d7| d8| d9|d10|d11|d12|d13|d14|d15|d16|d17|d18|d19|d20|d21|d22|d23|d24|  y|
+---------+-------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-

                                                                                

## Joining Both Datasets

In [5]:
# Match descriptor rows to token rows that carry the same id.
same_id = test_df_features.id == test_tokens_df.id
joined = test_df_features.alias("feat").join(
    test_tokens_df.alias("tok"), on=same_id, how="inner"
)

# Remove the token table's id, y and protein columns from the joined frame.
test_feat_tok_df = (
    joined
    .drop(test_tokens_df.id, test_tokens_df.y)
    .drop(test_tokens_df.protein)
)

test_feat_tok_df.show()

[Stage 10:>                                                         (0 + 1) / 1]

+---------+------------------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+------------------+------------------+------------------+--------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+------------------+------------------+-------------------+------------------+---------------------+--------------+----

                                                                                

## Assembling Columns

In [6]:
# Input columns for the VectorAssembler, generated instead of hand-typed.
# The order is significant and matches the previous literal list exactly:
#   1. 23 descriptors for each of bb1, bb2, bb3, molecule (92 names)
#   2. 'protein_onehot'
#   3. token columns a1..a24, b1..b24, c1..c24, d1..d24 (96 names)
# (The earlier `test_feat_tok_df.columns[2:]` assignment was immediately
# overwritten, so it has been removed.)
DESCRIPTOR_NAMES = [
    'MolWt', 'ExactMolWt', 'MolLogP', 'TPSA', 'NumRotatableBonds',
    'NumHDonors', 'NumHAcceptors', 'FormalCharge', 'FractionCSP3',
    'NumHeavyAtoms', 'NumAromaticRings', 'NumAliphaticRings',
    'NumSaturatedRings', 'NumHeteroatoms', 'NumRings', 'MolVolume',
    'RadiusOfGyration', 'InertialShapeFactor', 'AromaticProportion',
    'HBondPotential', 'Lipophilicity', 'ChargeDistribution',
    'ElectroNegativity',
]
DESCRIPTOR_PREFIXES = ['bb1', 'bb2', 'bb3', 'molecule']
TOKEN_GROUPS = ['a', 'b', 'c', 'd']
TOKENS_PER_GROUP = 24

features_cols = (
    [f'{prefix}_{name}' for prefix in DESCRIPTOR_PREFIXES for name in DESCRIPTOR_NAMES]
    + ['protein_onehot']
    + [f'{group}{index}' for group in TOKEN_GROUPS for index in range(1, TOKENS_PER_GROUP + 1)]
)
print(features_cols)

['bb1_MolWt', 'bb1_ExactMolWt', 'bb1_MolLogP', 'bb1_TPSA', 'bb1_NumRotatableBonds', 'bb1_NumHDonors', 'bb1_NumHAcceptors', 'bb1_FormalCharge', 'bb1_FractionCSP3', 'bb1_NumHeavyAtoms', 'bb1_NumAromaticRings', 'bb1_NumAliphaticRings', 'bb1_NumSaturatedRings', 'bb1_NumHeteroatoms', 'bb1_NumRings', 'bb1_MolVolume', 'bb1_RadiusOfGyration', 'bb1_InertialShapeFactor', 'bb1_AromaticProportion', 'bb1_HBondPotential', 'bb1_Lipophilicity', 'bb1_ChargeDistribution', 'bb1_ElectroNegativity', 'bb2_MolWt', 'bb2_ExactMolWt', 'bb2_MolLogP', 'bb2_TPSA', 'bb2_NumRotatableBonds', 'bb2_NumHDonors', 'bb2_NumHAcceptors', 'bb2_FormalCharge', 'bb2_FractionCSP3', 'bb2_NumHeavyAtoms', 'bb2_NumAromaticRings', 'bb2_NumAliphaticRings', 'bb2_NumSaturatedRings', 'bb2_NumHeteroatoms', 'bb2_NumRings', 'bb2_MolVolume', 'bb2_RadiusOfGyration', 'bb2_InertialShapeFactor', 'bb2_AromaticProportion', 'bb2_HBondPotential', 'bb2_Lipophilicity', 'bb2_ChargeDistribution', 'bb2_ElectroNegativity', 'bb3_MolWt', 'bb3_ExactMolWt', 'b

In [7]:
# Pack every feature column into a single `vectors` column, then keep only
# the id and that assembled vector.
vectorAssembler = VectorAssembler(inputCols=features_cols, outputCol='vectors')
test_feat_tok_df_vectors = vectorAssembler.transform(test_feat_tok_df).select('id', 'vectors')
test_feat_tok_df_vectors.show()



+---------+--------------------+
|       id|             vectors|
+---------+--------------------+
|295246852|(190,[0,1,2,3,4,5...|
|295246961|(190,[0,1,2,3,4,5...|
|295247142|(190,[0,1,2,3,4,5...|
|295247169|(190,[0,1,2,3,4,5...|
|295247204|(190,[0,1,2,3,4,5...|
|295247213|(190,[0,1,2,3,4,5...|
|295247329|(190,[0,1,2,3,4,5...|
|295247347|(190,[0,1,2,3,4,5...|
|295247378|(190,[0,1,2,3,4,5...|
|295247397|(190,[0,1,2,3,4,5...|
|295247414|(190,[0,1,2,3,4,5...|
|295247424|(190,[0,1,2,3,4,5...|
|295247425|(190,[0,1,2,3,4,5...|
|295247435|(190,[0,1,2,3,4,5...|
|295247608|(190,[0,1,2,3,4,5...|
|295247672|(190,[0,1,2,3,4,5...|
|295247725|(190,[0,1,2,3,4,5...|
|295247799|(190,[0,1,2,3,4,5...|
|295247807|(190,[0,1,2,3,4,5...|
|295247924|(190,[0,1,2,3,4,5...|
+---------+--------------------+
only showing top 20 rows



                                                                                

In [8]:
# Collapse the frame to a single partition.
# NOTE(review): this same frame is later passed to model.transform; with one
# partition that scoring probably runs as a single task — confirm this is
# wanted beyond producing a single output file.
test_feat_tok_df_vectors = test_feat_tok_df_vectors.repartition(1)

In [9]:
# Persist the assembled vectors as Parquet, replacing any existing output at this path.
test_feat_tok_df_vectors.write.mode('overwrite').format('parquet').save('test_feat_tok_df_vectors.parquet')

                                                                                

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [14]:
def getPredictions(df, model):
    """Score ``df`` with ``model`` and return the result ordered by id.

    Args:
        df: Frame handed to ``model.transform``.
        model: Fitted model exposing ``transform``.

    Returns:
        The transformed frame reduced to the ``id``, ``prediction`` and
        ``probability`` columns and ordered by ``id``. The frame is also
        printed with ``show(truncate=False)`` before it is returned.
    """
    scored = model.transform(df)
    kept_columns = ("id", "prediction", "probability")
    predictions = scored.select(*kept_columns).orderBy('id')
    predictions.show(truncate=False)
    return predictions

def makeSubmission(
    test_prob, 
    file_name,
    message
):
    """Write a submission CSV and upload it with the kaggle CLI.

    The sample submission is loaded as a template, its ``binds`` column is
    replaced by ``test_prob`` (assigned by position), the result is written to
    ``file_name``, displayed, and submitted to the ``leash-BELKA`` competition.

    Args:
        test_prob: One probability per row of ``sample_submission.csv.zip``,
            in the same row order as that file.
        file_name: Destination CSV path; its parent directory is created.
        message: Submission message passed to ``kaggle ... -m``.

    Raises:
        ValueError: If ``test_prob`` has a length that differs from the number
            of rows in the sample submission.
        subprocess.CalledProcessError: If the kaggle CLI exits non-zero.
    """
    import os
    import subprocess

    # Create the directory file_name actually lives in, instead of assuming
    # it is always "submission_csv".
    os.makedirs(os.path.dirname(file_name) or ".", exist_ok=True)

    sub_df = pd.read_csv('sample_submission.csv.zip')

    # Assignment is positional, so a length mismatch means the predictions do
    # not line up with the template rows; fail with a clear message.
    if hasattr(test_prob, "__len__") and len(test_prob) != len(sub_df):
        raise ValueError(
            f"test_prob has {len(test_prob)} values but the sample submission "
            f"has {len(sub_df)} rows"
        )

    sub_df["binds"] = test_prob
    sub_df.to_csv(file_name, index=False)
    # Read the file back so the displayed table is exactly what gets uploaded.
    display(pd.read_csv(file_name))

    command = [
        "kaggle", "competitions", "submit",
        "-c", "leash-BELKA",
        "-f", file_name,
        "-m", message
    ]

    # check=True surfaces a failed upload instead of silently ignoring it.
    subprocess.run(command, check=True)

In [116]:


# Load the saved GBT model from 'gbt_model' and score the test vectors.
# NOTE(review): `test_df_vectors` is not defined in any visible cell of this
# notebook (the assembled frame is named `test_feat_tok_df_vectors`) —
# presumably left over from an earlier session; confirm before re-running.
model = GBTClassificationModel.load('gbt_model')
predictions = getPredictions(test_df_vectors, model)



+---------+----------+----------------------------------------+
|id       |prediction|probability                             |
+---------+----------+----------------------------------------+
|295246830|0.0       |[0.6279070816672211,0.3720929183327789] |
|295246831|0.0       |[0.857039752726353,0.14296024727364698] |
|295246832|0.0       |[0.7983977194008832,0.20160228059911678]|
|295246833|1.0       |[0.32030740604133623,0.6796925939586638]|
|295246834|0.0       |[0.6116999197233459,0.3883000802766541] |
|295246835|1.0       |[0.4082044165247357,0.5917955834752643] |
|295246836|1.0       |[0.4059364842617785,0.5940635157382215] |
|295246837|0.0       |[0.7554311340070863,0.24456886599291372]|
|295246838|0.0       |[0.7821210770164672,0.21787892298353284]|
|295246839|1.0       |[0.47334086476176074,0.5266591352382393]|
|295246840|0.0       |[0.7280127895323997,0.2719872104676003] |
|295246841|0.0       |[0.6121819184610782,0.38781808153892183]|
|295246842|1.0       |[0.409665396767145

                                                                                

In [117]:
# Collect the probability column to the driver, flatten it to rows of two
# values, and keep the second value of each pair (presumably P(label = 1) —
# confirm). Row order follows `predictions`.
test_prob = np.array(predictions.select('probability').collect()).reshape(-1,2)[:,1]
print(test_prob)

                                                                                

array([0.37209292, 0.14296025, 0.20160228, ..., 0.24799954, 0.2237163 ,
       0.35596561])

In [123]:
# Write the current `test_prob` values to a CSV and submit them as "PySpark GBT 1".
makeSubmission(
    test_prob,
    file_name= f"submission_csv/_4_sub_PySparkGBT-1.csv",
    message = f"PySpark GBT 1"
)

Unnamed: 0,id,binds
0,295246830,0.372093
1,295246831,0.142960
2,295246832,0.201602
3,295246833,0.679693
4,295246834,0.388300
...,...,...
1674891,296921721,0.178444
1674892,296921722,0.144306
1674893,296921723,0.248000
1674894,296921724,0.223716


100%|██████████| 47.3M/47.3M [00:10<00:00, 4.58MB/s]  


Successfully submitted to NeurIPS 2024 - Predict New Medicines with BELKA

In [124]:
# Load the saved GBT model from 'gbt_model2' and score the test vectors.
# NOTE(review): `test_df_vectors` is not defined in any visible cell of this
# notebook — presumably left over from an earlier session; confirm before re-running.
model = GBTClassificationModel.load('gbt_model2')
predictions = getPredictions(test_df_vectors, model)



+---------+----------+----------------------------------------+
|id       |prediction|probability                             |
+---------+----------+----------------------------------------+
|295246830|1.0       |[0.4585862909949032,0.5414137090050968] |
|295246831|0.0       |[0.742702144679811,0.257297855320189]   |
|295246832|0.0       |[0.7226297657331567,0.27737023426684326]|
|295246833|1.0       |[0.4728372954085174,0.5271627045914826] |
|295246834|0.0       |[0.6472599375989134,0.3527400624010866] |
|295246835|1.0       |[0.46793535367718403,0.532064646322816] |
|295246836|0.0       |[0.5197481764067174,0.48025182359328256]|
|295246837|0.0       |[0.8088469120902414,0.1911530879097586] |
|295246838|0.0       |[0.8736886154044494,0.12631138459555058]|
|295246839|1.0       |[0.3332616452486099,0.6667383547513901] |
|295246840|0.0       |[0.65013556920855,0.34986443079145]     |
|295246841|0.0       |[0.5394379658418148,0.4605620341581852] |
|295246842|1.0       |[0.197227960819393

                                                                                

In [125]:
# Collect the probability column to the driver and keep the second value of
# each pair (presumably P(label = 1) — confirm), then write and submit it.
test_prob = np.array(predictions.select('probability').collect()).reshape(-1,2)[:,1]
print(test_prob)
makeSubmission(
    test_prob,
    file_name= f"submission_csv/_4_sub_PySparkGBT-2.csv",
    message = f"PySpark GBT 2"
)

                                                                                

[0.54141371 0.25729786 0.27737023 ... 0.25566305 0.14927391 0.17841661]


Unnamed: 0,id,binds
0,295246830,0.541414
1,295246831,0.257298
2,295246832,0.277370
3,295246833,0.527163
4,295246834,0.352740
...,...,...
1674891,296921721,0.131502
1674892,296921722,0.080195
1674893,296921723,0.255663
1674894,296921724,0.149274


100%|██████████| 47.4M/47.4M [00:16<00:00, 3.10MB/s]  


Successfully submitted to NeurIPS 2024 - Predict New Medicines with BELKA

In [126]:
# Load the saved GBT model from 'gbt_model3' and score the test vectors.
# NOTE(review): `test_df_vectors` is not defined in any visible cell of this
# notebook — presumably left over from an earlier session; confirm before re-running.
model = GBTClassificationModel.load('gbt_model3')
predictions = getPredictions(test_df_vectors, model)



+---------+----------+----------------------------------------+
|id       |prediction|probability                             |
+---------+----------+----------------------------------------+
|295246830|0.0       |[0.5900204416732234,0.40997955832677657]|
|295246831|0.0       |[0.7001056902622177,0.2998943097377823] |
|295246832|0.0       |[0.7501166094704026,0.2498833905295974] |
|295246833|1.0       |[0.3765842249855886,0.6234157750144114] |
|295246834|0.0       |[0.5036735162774153,0.4963264837225847] |
|295246835|1.0       |[0.44970792837918494,0.5502920716208151]|
|295246836|1.0       |[0.4894195254714481,0.5105804745285518] |
|295246837|0.0       |[0.6727157024516384,0.32728429754836164]|
|295246838|0.0       |[0.8504474147352973,0.14955258526470272]|
|295246839|1.0       |[0.19255817231722247,0.8074418276827775]|
|295246840|1.0       |[0.3148166001025829,0.6851833998974171] |
|295246841|1.0       |[0.2643104082473145,0.7356895917526856] |
|295246842|1.0       |[0.255865085767325

                                                                                

In [127]:
# Collect the probability column to the driver and keep the second value of
# each pair (presumably P(label = 1) — confirm), then write and submit it.
test_prob = np.array(predictions.select('probability').collect()).reshape(-1,2)[:,1]
print(test_prob)
makeSubmission(
    test_prob,
    file_name= f"submission_csv/_4_sub_PySparkGBT-3.csv",
    message = f"PySpark GBT 3"
)

                                                                                

[0.40997956 0.29989431 0.24988339 ... 0.19473581 0.16310869 0.19648997]


Unnamed: 0,id,binds
0,295246830,0.409980
1,295246831,0.299894
2,295246832,0.249883
3,295246833,0.623416
4,295246834,0.496326
...,...,...
1674891,296921721,0.159567
1674892,296921722,0.106927
1674893,296921723,0.194736
1674894,296921724,0.163109


100%|██████████| 47.4M/47.4M [00:06<00:00, 8.00MB/s]


Successfully submitted to NeurIPS 2024 - Predict New Medicines with BELKA

24/12/31 21:25:57 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /home/23m1521/ashish/kaggle/temp/blockmgr-78ce9315-7f6c-4391-975d-9c9209746808. Falling back to Java IO way
java.io.IOException: Failed to delete: /home/23m1521/ashish/kaggle/temp/blockmgr-78ce9315-7f6c-4391-975d-9c9209746808
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:174)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:109)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:90)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively(SparkFileUtils.scala:121)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively$(SparkFileUtils.scala:120)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1126)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockMa

In [15]:
# Load the saved GBT classification model stored under 'gbt_model4' and score
# the test feature vectors with it.
# getPredictions is defined elsewhere in this notebook; the following cell reads
# a 'probability' column from its result.
# NOTE(review): presumably getPredictions also prints the id/prediction/probability
# preview that appears in this cell's output — confirm against its definition.
# This rebinds the notebook-level names `model` and `predictions`, so any later
# cell uses whichever checkpoint was loaded most recently.
model = GBTClassificationModel.load('gbt_model4')
predictions = getPredictions(test_feat_tok_df_vectors, model)

25/01/01 21:23:27 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS

+---------+----------+----------------------------------------+
|id       |prediction|probability                             |
+---------+----------+----------------------------------------+
|295246830|0.0       |[0.5629965630236656,0.43700343697633437]|
|295246831|0.0       |[0.7034383538893488,0.29656164611065117]|
|295246832|0.0       |[0.8139723993373991,0.18602760066260093]|
|295246833|0.0       |[0.7044323427298744,0.29556765727012557]|
|295246834|0.0       |[0.7777997226404001,0.2222002773595999] |
|295246835|0.0       |[0.6624043249139959,0.3375956750860041] |
|295246836|0.0       |[0.8109765641540253,0.1890234358459747] |
|295246837|0.0       |[0.8721802581614457,0.12781974183855427]|
|295246838|0.0       |[0.9496884925526174,0.05031150744738255]|
|295246839|0.0       |[0.5286943114054934,0.4713056885945066] |
|295246840|0.0       |[0.7022887680879931,0.2977112319120069] |
|295246841|0.0       |[0.6890660588023854,0.31093394119761464]|
|295246842|0.0       |[0.605320857276452

                                                                                

In [16]:
# Collect the 'probability' column to the driver, flatten it to (n_rows, 2),
# and keep column 1 of every pair.
# NOTE(review): index 1 is taken to be the positive-class probability — confirm label order.
test_prob = (
    np.array(predictions.select('probability').collect())
    .reshape(-1, 2)[:, 1]
)
print(test_prob)

# Hand the scores to the notebook's makeSubmission helper (defined elsewhere).
makeSubmission(
    test_prob,
    file_name="submission_csv/_4_sub_PySparkGBT-4.csv",
    message="PySpark GBT 4",
)

                                                                                

[0.43700344 0.29656165 0.1860276  ... 0.27712234 0.19011934 0.21539578]


Unnamed: 0,id,binds
0,295246830,0.437003
1,295246831,0.296562
2,295246832,0.186028
3,295246833,0.295568
4,295246834,0.222200
...,...,...
1674891,296921721,0.175954
1674892,296921722,0.093898
1674893,296921723,0.277122
1674894,296921724,0.190119


100%|██████████| 47.5M/47.5M [00:10<00:00, 4.60MB/s]  


Successfully submitted to NeurIPS 2024 - Predict New Medicines with BELKA

In [17]:
# Load the saved GBT classification model stored under 'gbt_model5' and score
# the test feature vectors with it.
# getPredictions is defined elsewhere in this notebook; the following cell reads
# a 'probability' column from its result.
# NOTE(review): presumably getPredictions also prints the id/prediction/probability
# preview that appears in this cell's output — confirm against its definition.
# This rebinds the notebook-level names `model` and `predictions`, so any later
# cell uses whichever checkpoint was loaded most recently.
model = GBTClassificationModel.load('gbt_model5')
predictions = getPredictions(test_feat_tok_df_vectors, model)



+---------+----------+----------------------------------------+
|id       |prediction|probability                             |
+---------+----------+----------------------------------------+
|295246830|0.0       |[0.6763672987443041,0.3236327012556959] |
|295246831|0.0       |[0.7286670420696082,0.2713329579303918] |
|295246832|0.0       |[0.7829745283887525,0.21702547161124752]|
|295246833|0.0       |[0.6712709108099004,0.3287290891900996] |
|295246834|0.0       |[0.7199586631698638,0.2800413368301362] |
|295246835|0.0       |[0.6810077227294942,0.31899227727050583]|
|295246836|0.0       |[0.7895721211179306,0.21042787888206937]|
|295246837|0.0       |[0.826435657038017,0.17356434296198298] |
|295246838|0.0       |[0.9062373796012668,0.09376262039873318]|
|295246839|0.0       |[0.5452910470759237,0.4547089529240763] |
|295246840|0.0       |[0.6376848543140979,0.36231514568590206]|
|295246841|0.0       |[0.6402749552077872,0.3597250447922128] |
|295246842|0.0       |[0.672967504383571

                                                                                

In [18]:
# Collect the 'probability' column to the driver, flatten it to (n_rows, 2),
# and keep column 1 of every pair.
# NOTE(review): index 1 is taken to be the positive-class probability — confirm label order.
test_prob = (
    np.array(predictions.select('probability').collect())
    .reshape(-1, 2)[:, 1]
)
print(test_prob)

# Hand the scores to the notebook's makeSubmission helper (defined elsewhere).
makeSubmission(
    test_prob,
    file_name="submission_csv/_4_sub_PySparkGBT-5.csv",
    message="PySpark GBT 5",
)

                                                                                

[0.3236327  0.27133296 0.21702547 ... 0.31020636 0.27066002 0.26179085]


Unnamed: 0,id,binds
0,295246830,0.323633
1,295246831,0.271333
2,295246832,0.217025
3,295246833,0.328729
4,295246834,0.280041
...,...,...
1674891,296921721,0.266249
1674892,296921722,0.173100
1674893,296921723,0.310206
1674894,296921724,0.270660


100%|██████████| 47.3M/47.3M [00:15<00:00, 3.12MB/s]  


Successfully submitted to NeurIPS 2024 - Predict New Medicines with BELKA