# Calculating Smiles Descriptors

In [1]:
train_len = 295246830
one_len = 1589906
zero_len = 293656924
protein_map = {'BRD4': 1, 'HSA': 2, 'sEH': 3}
vocab = {'C': 6825082866, '#': 81527490, '@': 511451694, 'H': 456489972, '=': 1406606874, 'O': 2554179786,
         'N': 2469595230, 'c': 12257477022, '-': 438483636, '.': 216945504, 'l': 491088828, 'B': 123330132,
         'r': 121915914, 'n': 1997759694, 'D': 295246830, 'y': 295246830, 'o': 67918650, 's': 156618468,
         'S': 90662574, 'F': 492710238, '+': 65206260, 'i': 1414026, '/': 11547096, 'I': 23972994}

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, IntegerType, StructType, StructField, ArrayType, DoubleType, StringType

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StringIndexerModel, OneHotEncoderModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

import pandas as pd
import numpy as np

from xgboost.spark import SparkXGBClassifier

from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, rdmolops, AllChem, rdchem, rdEHTTools, rdMolDescriptors
from tqdm.auto import tqdm
from padelpy import from_smiles
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
# for 256 Gb and 64 Cores
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "48g")  # Increased driver memory
#     .config("spark.executor.memory", "48g")  # Increased executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     # .config("spark.local.dir", "/scratch/23m1521/temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx48g")  # JVM heap size for executors
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

spark = (
    SparkSession
    .builder
    .appName("leash belka3")
    .config("spark.driver.memory", "64g")  # Increased driver memory for large jobs
    .config("spark.executor.memory", "64g")  # Increased executor memory
    .config("spark.executor.instances", "32")  # 32 executors
    .config("spark.executor.cores", "2")  # 2 cores per executor
    .config("spark.driver.maxResultSize", "8g")  # Driver result size limit
    .config("spark.local.dir", "temp")  # Ensure high-speed storage
    .config("spark.shuffle.file.buffer", "1024k")  # Larger shuffle buffer for better IO
    .config("spark.memory.fraction", "0.85")  # Increased memory for tasks
    .config("spark.shuffle.memoryFraction", "0.7")  # Increased shuffle memory
    .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
    .master("local[*]")  # Use all 64 cores on the machine
    .getOrCreate()
)
spark

# SparkSession for 128 GB RAM and 64 cores
# spark = (
#     SparkSession
#     .builder
#     .appName("Optimized Spark for 128GB RAM and 64 Cores")
#     .config("spark.driver.memory", "64g")  # 64GB for driver memory
#     .config("spark.executor.memory", "64g")  # 64GB for executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor (total = 64 cores)
#     .config("spark.driver.maxResultSize", "8g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Temp directory with enough space
#     .config("spark.shuffle.file.buffer", "512k")  # Increased shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

# SynapseML 
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "48g")  # Increased driver memory
#     .config("spark.executor.memory", "48g")  # Increased executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx48g")  # JVM heap size for executors
#     .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.8")
#     .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "64g")  # Increased driver memory
#     .config("spark.executor.memory", "64g")  # Increased executor memory
#     .config("spark.executor.instances", "8")  # Reduced number of executors
#     .config("spark.executor.cores", "8")  # Increased cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.7")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
#     .config("spark.sql.shuffle.partitions", "1000")  # Increase shuffle partitions
#     .config("spark.ui.enabled", "true")  # Enable Spark UI
#     .master("local[8]")  # Reduced number of cores for local mode
#     .getOrCreate()
# )

# spark


24/12/31 01:59:57 WARN Utils: Your hostname, kanjur resolves to a loopback address: 127.0.1.1; using 10.119.2.14 instead (on interface eno3)
24/12/31 01:59:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/31 01:59:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/31 01:59:58 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
24/12/31 01:59:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 57138)
Traceback (most recent call last):
  File "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
  File "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/socketserver.py", line 362, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/socketserver.py", line 761, in __init__
    self.handle()
  File "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/home/23m1521/.conda/envs/cuda_env2/lib/python3.12/site-packages/pyspark/accumulators.py", line 26

In [3]:
def rename_Cols(df):
    df = (
        df
        .withColumnRenamed('buildingblock1_smiles', 'bb1_smiles')
        .withColumnRenamed('buildingblock2_smiles', 'bb2_smiles')
        .withColumnRenamed('buildingblock3_smiles', 'bb3_smiles')
    )
    return df

In [4]:
full_df = rename_Cols(spark.read.format('parquet').load('train.parquet'))

                                                                                

In [5]:
full_df.show()

                                                                                

+---+--------------------+--------------------+--------------------+--------------------+------------+-----+
| id|          bb1_smiles|          bb2_smiles|          bb3_smiles|     molecule_smiles|protein_name|binds|
+---+--------------------+--------------------+--------------------+--------------------+------------+-----+
|  0|C#CC[C@@H](CC(=O)...|C#CCOc1ccc(CN)cc1.Cl|Br.Br.NCC1CCCN1c1...|C#CCOc1ccc(CNc2nc...|        BRD4|    0|
|  1|C#CC[C@@H](CC(=O)...|C#CCOc1ccc(CN)cc1.Cl|Br.Br.NCC1CCCN1c1...|C#CCOc1ccc(CNc2nc...|         HSA|    0|
|  2|C#CC[C@@H](CC(=O)...|C#CCOc1ccc(CN)cc1.Cl|Br.Br.NCC1CCCN1c1...|C#CCOc1ccc(CNc2nc...|         sEH|    0|
|  3|C#CC[C@@H](CC(=O)...|C#CCOc1ccc(CN)cc1.Cl|   Br.NCc1cccc(Br)n1|C#CCOc1ccc(CNc2nc...|        BRD4|    0|
|  4|C#CC[C@@H](CC(=O)...|C#CCOc1ccc(CN)cc1.Cl|   Br.NCc1cccc(Br)n1|C#CCOc1ccc(CNc2nc...|         HSA|    0|
|  5|C#CC[C@@H](CC(=O)...|C#CCOc1ccc(CN)cc1.Cl|   Br.NCc1cccc(Br)n1|C#CCOc1ccc(CNc2nc...|         sEH|    0|
|  6|C#CC[C@@H](CC(

In [6]:
df = full_df.sample(fraction=0.0001, seed=42)
df.count()

                                                                                

29617

In [81]:
df = rename_Cols(spark.read.format('parquet').load('test.parquet'))
print(df.count())
df.show()

1674896
+---------+--------------------+--------------+--------------------+--------------------+------------+
|       id|          bb1_smiles|    bb2_smiles|          bb3_smiles|     molecule_smiles|protein_name|
+---------+--------------------+--------------+--------------------+--------------------+------------+
|295246830|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|        BRD4|
|295246831|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|         HSA|
|295246832|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|         sEH|
|295246833|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|        BRD4|
|295246834|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|         HSA|
|295246835|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|         sEH|
|295246836|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|    CC1(C)CCCC1(O)

In [None]:
# print(df0_features.rdd.getNumPartitions())
# print(full_df.count())
# df0_features.printSchema()

# binds = df.select('binds').rdd.flatMap(lambda x: x).collect()
# binds = full_df.select('binds').rdd.flatMap(lambda x: x).collect()
# binds = np.array(binds)

# sample_df = full_df.sample(fraction=0.00001)

## Computing Descriptors

In [4]:
def compute_descriptors(smiles_chunk):
    results = []
    smiles = smiles_chunk
    # for smiles in smiles_chunk:
    mol = Chem.MolFromSmiles(smiles)
    
    # General Chemical Descriptors
    desc_MolWt = Descriptors.MolWt(mol)  # Molecular Weight (MW)
    desc_ExactMolWt = rdMolDescriptors.CalcExactMolWt(mol)  # Exact Molecular Weight
    desc_MolLogP = Descriptors.MolLogP(mol)  # LogP (Hydrophobicity)
    desc_TPSA = rdMolDescriptors.CalcTPSA(mol)  # Topological Polar Surface Area (TPSA)
    desc_NumRotatableBonds = rdMolDescriptors.CalcNumRotatableBonds(mol)  # Number of Rotatable Bonds
    desc_NumHDonors = Lipinski.NumHDonors(mol)  # Number of H-Bond Donors
    desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)  # Number of H-Bond Acceptors
    desc_FormalCharge = rdmolops.GetFormalCharge(mol)  # Formal Charge
    desc_FractionCSP3 = rdMolDescriptors.CalcFractionCSP3(mol)  # Fraction of Csp3 Carbons
    # -------------------------------------------------------
    
    desc_NumHeavyAtoms = mol.GetNumHeavyAtoms()  # Number of Heavy Atoms
    
    # Functional Group Descriptors
    desc_NumAromaticRings = rdMolDescriptors.CalcNumAromaticRings(mol)  # Number of Aromatic Rings
    desc_NumAliphaticRings = rdMolDescriptors.CalcNumAliphaticRings(mol)  # Number of Aliphatic Rings
    desc_NumSaturatedRings = rdMolDescriptors.CalcNumSaturatedRings(mol)  # Number of Saturated Rings
    desc_NumHeteroatoms = rdMolDescriptors.CalcNumHeteroatoms(mol)  # Number of Heteroatoms
    desc_NumRings = rdMolDescriptors.CalcNumRings(mol)  # Total Number of Rings
    # -------------------------------------------------------
    
    # Shape and Geometry Descriptors
    if mol.GetNumConformers() >= 1:
        desc_MolVolume = AllChem.ComputeMolVolume(mol)  # Molecular Volume
        desc_RadiusOfGyration = rdMolDescriptors.CalcRadiusOfGyration(mol)  # Radius of Gyration
        desc_InertialShapeFactor = rdMolDescriptors.CalcInertialShapeFactor(mol)  # Inertial Shape Factor
    else:
        desc_MolVolume = desc_RadiusOfGyration = desc_InertialShapeFactor = 0
    # -------------------------------------------------------
    
    # Binding-Specific Descriptors
    desc_AromaticProportion = (
        rdMolDescriptors.CalcNumAromaticRings(mol) / mol.GetNumHeavyAtoms()
        if mol.GetNumHeavyAtoms() > 0 else 0
    )  # Aromatic Proportion
    desc_HBondPotential = Lipinski.NumHDonors(mol) + Lipinski.NumHAcceptors(mol)  # Hydrogen Bonding Potential
    desc_Lipophilicity = Descriptors.MolLogP(mol)  # Lipophilicity
    desc_ChargeDistribution = rdmolops.GetFormalCharge(mol)  # Charge Distribution
    # -------------------------------------------------------
    
    # Electrostatic Properties
    desc_ElectroNegativity = Descriptors.MolMR(mol)  # Molecular Refractivity (Electronegativity)
    # -------------------------------------------------------

    
    results.append({
        # "smiles": smiles,
        
        "MolWt": desc_MolWt,
        "ExactMolWt": desc_ExactMolWt,
        "MolLogP": desc_MolLogP,
        "TPSA": desc_TPSA,
        "NumRotatableBonds": desc_NumRotatableBonds,
        "NumHDonors": desc_NumHDonors,
        "NumHAcceptors": desc_NumHAcceptors,
        "FormalCharge": desc_FormalCharge,
        "FractionCSP3": desc_FractionCSP3,
        
        "NumHeavyAtoms": desc_NumHeavyAtoms,
        
        "NumAromaticRings": desc_NumAromaticRings,
        "NumAliphaticRings": desc_NumAliphaticRings,
        "NumSaturatedRings": desc_NumSaturatedRings,
        "NumHeteroatoms": desc_NumHeteroatoms,
        "NumRings": desc_NumRings,
        
        "MolVolume": desc_MolVolume,
        "RadiusOfGyration": desc_RadiusOfGyration,
        "InertialShapeFactor": desc_InertialShapeFactor,
        
        "AromaticProportion": desc_AromaticProportion,
        "HBondPotential": desc_HBondPotential,
        "Lipophilicity": desc_Lipophilicity,
        "ChargeDistribution": desc_ChargeDistribution,
        
        "ElectroNegativity": desc_ElectroNegativity
    })
    
    return results

### 1. Register the UDF

In [5]:
def infer_data_type(value):
    if isinstance(value, str):
        return StringType()
    elif isinstance(value, float):
        return DoubleType()
    elif isinstance(value, int):
        return IntegerType()
    else:
        return StringType()

In [None]:
sample_smiles = "C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O"
descriptor_sample = compute_descriptors(sample_smiles)[0]
descriptor_names = list(descriptor_sample.keys())

In [45]:
descriptor_schema = StructType([
    StructField(name, infer_data_type(value), True)
    for name, value in descriptor_sample.items()
])

descriptor_udf = udf(
    lambda smiles: compute_descriptors(smiles)[0],
    descriptor_schema
)
descriptor_schema

StructType([StructField('smiles', StringType(), True), StructField('MolWt', DoubleType(), True), StructField('ExactMolWt', DoubleType(), True), StructField('MolLogP', DoubleType(), True), StructField('TPSA', DoubleType(), True), StructField('NumRotatableBonds', IntegerType(), True), StructField('NumHDonors', IntegerType(), True), StructField('NumHAcceptors', IntegerType(), True), StructField('FormalCharge', IntegerType(), True), StructField('FractionCSP3', DoubleType(), True), StructField('NumHeavyAtoms', IntegerType(), True), StructField('NumAromaticRings', IntegerType(), True), StructField('NumAliphaticRings', IntegerType(), True), StructField('NumSaturatedRings', IntegerType(), True), StructField('NumHeteroatoms', IntegerType(), True), StructField('NumRings', IntegerType(), True), StructField('MolVolume', IntegerType(), True), StructField('RadiusOfGyration', IntegerType(), True), StructField('InertialShapeFactor', IntegerType(), True), StructField('AromaticProportion', DoubleType(),

### 2. Apply the UDF to PySpark DataFrame

In [63]:
df = df.withColumn("bb1_descriptors", descriptor_udf("bb1_smiles"))

for field in descriptor_schema.fields:
    df = df.withColumn(
        f"bb1_{field.name}",
        df["bb1_descriptors"][field.name]
    )

df.show(truncate=False)

+---------+-----------------------------------------------+---------------------+-----------------------------------+----------------------------------------------------------------------------------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------+------------------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+---------+
|id       |buildingblock1_smiles                        

In [62]:
protein_Indexer = StringIndexer(inputCol="protein_name", outputCol="protein_i", stringOrderType="frequencyDesc")
protein_Indexer = protein_Indexer.fit(df)

df = protein_Indexer.transform(df)
df.show()



+---------+---------------------+---------------------+---------------------+--------------------+------------+--------------------+--------------------+------------------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+---------+
|       id|buildingblock1_smiles|buildingblock2_smiles|buildingblock3_smiles|     molecule_smiles|protein_name|     bb1_descriptors|          bb1_smiles|         bb1_MolWt|bb1_ExactMolWt|       bb1_MolLogP|bb1_TPSA|bb1_NumRotatableBonds|bb1_NumHDonors|bb1_NumHAcceptors|bb1_FormalCharge|  bb1_FractionCSP3|bb1_NumHeavyAtoms|bb1_NumAromaticRings|bb1_NumAliphaticRings|bb1_NumSaturatedRings|bb1_NumHeteroa

                                                                                

In [64]:
protein_ohe = OneHotEncoder(inputCol="protein_i", outputCol="protein_onehot")
protein_ohe = protein_ohe.fit(df)

df = protein_ohe.transform(df)

In [65]:
df.show()

                                                                                

+---------+---------------------+---------------------+---------------------+--------------------+------------+--------------------+--------------------+------------------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+---------+--------------+
|       id|buildingblock1_smiles|buildingblock2_smiles|buildingblock3_smiles|     molecule_smiles|protein_name|     bb1_descriptors|          bb1_smiles|         bb1_MolWt|bb1_ExactMolWt|       bb1_MolLogP|bb1_TPSA|bb1_NumRotatableBonds|bb1_NumHDonors|bb1_NumHAcceptors|bb1_FormalCharge|  bb1_FractionCSP3|bb1_NumHeavyAtoms|bb1_NumAromaticRings|bb1_NumAliphaticRings|bb1_NumSaturatedRings

In [83]:
df.drop(col('bb1_descriptors'))

DataFrame[id: bigint, bb1_smiles: string, bb2_smiles: string, bb3_smiles: string, molecule_smiles: string, protein_name: string]

In [84]:
df.show()

+---------+--------------------+--------------+--------------------+--------------------+------------+
|       id|          bb1_smiles|    bb2_smiles|          bb3_smiles|     molecule_smiles|protein_name|
+---------+--------------------+--------------+--------------------+--------------------+------------+
|295246830|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|        BRD4|
|295246831|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|         HSA|
|295246832|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|         sEH|
|295246833|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|        BRD4|
|295246834|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|         HSA|
|295246835|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|         sEH|
|295246836|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|    CC1(C)CCCC1(O)CN|C#CCC

# Get DataFrame Features

In [6]:
def make_features_df(df, load_encoder=False):
    
    ### Add Smiles Descriptors Columns --------------
    smiles_cols = ["bb1_smiles", "bb2_smiles", "bb3_smiles", "molecule_smiles"]
    sample_smiles = "C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O"
    descriptor_sample = compute_descriptors(sample_smiles)[0]
    descriptor_names = list(descriptor_sample.keys())

    descriptor_schema = StructType([
        StructField(name, infer_data_type(value), True) 
        for name, value in descriptor_sample.items()
    ])
    descriptor_udf = udf(lambda smiles: compute_descriptors(smiles)[0], descriptor_schema)

    for smiles_col in smiles_cols:
        cols_name = smiles_col.split("_")[0]
        
        df = df.withColumn(f"{cols_name}_descriptors", descriptor_udf(smiles_col))

        for field in descriptor_schema.fields:
            df = df.withColumn(f"{cols_name}_{field.name}", df[f"{cols_name}_descriptors"][field.name])


        df = df.drop(f"{cols_name}_descriptors")
    
    ### Protein One Hot Encoding --------------
    protein_Indexer_path = "./protein_Indexer"
    protein_ohe_path = "./protein_ohe"
    
    if load_encoder == True:
        protein_Indexer = StringIndexerModel.load(protein_Indexer_path)
        protein_ohe = OneHotEncoderModel.load(protein_ohe_path)
        print('Encoders Loaded Succesfully...')
        
    protein_Indexer = StringIndexer(inputCol="protein_name", outputCol="protein_i", stringOrderType="frequencyDesc")
    protein_Indexer = protein_Indexer.fit(df)
    df = protein_Indexer.transform(df)
    
    protein_ohe = OneHotEncoder(inputCol="protein_i", outputCol="protein_onehot")
    protein_ohe = protein_ohe.fit(df)
    df = protein_ohe.transform(df)

    if load_encoder == False:
        protein_Indexer.write().overwrite().save(protein_Indexer_path)
        protein_ohe.write().overwrite().save(protein_ohe_path)
    
    df = df.drop("protein_i").drop("protein_name")
    # df = df.drop("protein_name")
    
    return df

In [13]:
full_df_features = make_features_df(full_df)

                                                                                

In [14]:
features_cols = [full_df_features.columns[0]] + full_df_features.columns[5:]
full_df_features = full_df_features.select(features_cols)

In [15]:
full_df_features.write.format('parquet').save('train_descriptors.parquet')

24/12/30 03:40:44 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [16]:
full_df_features.show()

[Stage 14:>                                                         (0 + 1) / 1]

+---+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+---------+--------------+-----------+--------+---------------------+--------------+-----------------+----------------+----------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+------------------+-------------------+------------------+---------------------+--------------+-----------------+---

                                                                                

In [18]:
features_cols.pop(1)

'binds'

In [20]:
vectorAssembler = VectorAssembler(inputCols=features_cols, outputCol='vectors')
full_df_vectors = vectorAssembler.transform(full_df_features).select('vectors')
full_df_vectors.show()

[Stage 15:>                                                         (0 + 1) / 1]

+--------------------+
|             vectors|
+--------------------+
|[0.0,349.38600000...|
|[1.0,349.38600000...|
|[2.0,349.38600000...|
|[3.0,349.38600000...|
|[4.0,349.38600000...|
|[5.0,349.38600000...|
|[6.0,349.38600000...|
|[7.0,349.38600000...|
|[8.0,349.38600000...|
|[9.0,349.38600000...|
|[10.0,349.3860000...|
|[11.0,349.3860000...|
|[12.0,349.3860000...|
|[13.0,349.3860000...|
|[14.0,349.3860000...|
|[15.0,349.3860000...|
|[16.0,349.3860000...|
|[17.0,349.3860000...|
|[18.0,349.3860000...|
|[19.0,349.3860000...|
+--------------------+
only showing top 20 rows



24/12/30 08:28:12 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /home/23m1521/ashish/kaggle/temp/blockmgr-d54504c8-85c5-4dae-9603-7d0e383d8a82. Falling back to Java IO way
java.io.IOException: Failed to delete: /home/23m1521/ashish/kaggle/temp/blockmgr-d54504c8-85c5-4dae-9603-7d0e383d8a82
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:174)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:109)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:90)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively(SparkFileUtils.scala:121)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively$(SparkFileUtils.scala:120)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1126)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockMa

## Transforming Test Data

In [7]:
test_df = rename_Cols(spark.read.format('parquet').load('test.parquet'))
print(test_df.count())
test_df.show()

                                                                                

1674896
+---------+--------------------+--------------+--------------------+--------------------+------------+
|       id|          bb1_smiles|    bb2_smiles|          bb3_smiles|     molecule_smiles|protein_name|
+---------+--------------------+--------------+--------------------+--------------------+------------+
|295246830|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|        BRD4|
|295246831|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|         HSA|
|295246832|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|      C=Cc1ccc(N)cc1|C#CCCC[C@H](Nc1nc...|         sEH|
|295246833|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|        BRD4|
|295246834|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|         HSA|
|295246835|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|CC(O)Cn1cnc2c(N)n...|C#CCCC[C@H](Nc1nc...|         sEH|
|295246836|C#CCCC[C@H](NC(=O...|C=Cc1ccc(N)cc1|    CC1(C)CCCC1(O)

In [8]:
test_df_features = make_features_df(test_df, load_encoder=True)

Encoders Loaded Succesfully...


                                                                                

In [9]:
test_df_features.show()

24/12/31 02:00:17 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+---------+--------------------+--------------+--------------------+--------------------+------------------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+------------------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+----------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+------------------+--------------+-------------

                                                                                

In [10]:
test_df_features.rdd.getNumPartitions()

8

In [14]:
test_df_features.write.mode('overwrite').format('parquet').save('test_descriptors.parquet')

                                                                                