<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/drug_activity_prediction_belka.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or attach to one that is already running.
spark = (
    SparkSession.builder
    .appName("HugeParquetProcessor")
    .getOrCreate()
)

In [3]:
# Define filename
dataset_url = "https://huggingface.co/datasets/HoangHa/belka-smiles-train-raw/resolve/main/data/train.parquet"
filename = "train.parquet"
!wget -O $filename $dataset_url


--2025-04-24 19:45:03--  https://huggingface.co/datasets/HoangHa/belka-smiles-train-raw/resolve/main/data/train.parquet
Resolving huggingface.co (huggingface.co)... 18.160.143.76, 18.160.143.75, 18.160.143.32, ...
Connecting to huggingface.co (huggingface.co)|18.160.143.76|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/66269bbd891c75742bd03007/98dac475ec4dd4235359dea8448ff43a92673010cf97d7a63bcb903d6862963a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250424%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250424T194503Z&X-Amz-Expires=3600&X-Amz-Signature=d225672787b16bea1c6dc954b3a39d40f97d16dcbda14c72be84f5a7c726199b&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27train.parquet%3B+filename%3D%22train.parquet%22%3B&x-id=GetObject&Expires=1745527503&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVz

In [4]:
# Read the downloaded parquet file; `filename` is the local path set in the
# download cell, reused here so the two cells cannot drift apart.
df = spark.read.parquet(filename)
# Print column names and types to confirm the layout before transforming.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- buildingblock1_smiles: string (nullable = true)
 |-- buildingblock2_smiles: string (nullable = true)
 |-- buildingblock3_smiles: string (nullable = true)
 |-- molecule_smiles: string (nullable = true)
 |-- protein_name: string (nullable = true)
 |-- binds: long (nullable = true)



In [5]:
import gc

# Discard the row id and the per-building-block SMILES columns; the remaining
# cells only use molecule_smiles, protein_name and binds.
df = df.drop(
    "id",
    "buildingblock1_smiles",
    "buildingblock2_smiles",
    "buildingblock3_smiles",
)
# Run a Python garbage-collection pass; the return value is shown as the cell output.
gc.collect()

130

In [6]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [7]:
from pyspark.ml.linalg import Vectors, VectorUDT
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType

In [8]:
# Function to convert SMILES to DenseVector
def smiles_to_dense_fp(smiles, n_bits=128, radius=2):
    """Convert a SMILES string into a Morgan fingerprint stored as a DenseVector.

    Args:
        smiles: SMILES string of the molecule. ``None`` or an empty string is
            treated as an invalid molecule.
        n_bits: Length of the fingerprint bit vector. The same length is used
            for the fallback vector so every row has identical dimensionality.
        radius: Morgan fingerprint radius.

    Returns:
        A DenseVector of length ``n_bits`` holding 0.0/1.0 values. If the
        SMILES is missing or cannot be parsed, an all-zero vector of the same
        length is returned.
    """
    # MolFromSmiles returns None for unparsable input; skip the call entirely
    # for missing values so a null cell does not raise inside the UDF.
    mol = Chem.MolFromSmiles(smiles) if smiles else None
    if mol is None:
        # The fallback must match the fingerprint length, otherwise rows end up
        # with vectors of different sizes.
        return Vectors.dense([0.0] * n_bits)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    return Vectors.dense([float(bit) for bit in fp])  # Convert to DenseVector

# Wrap the Python function as a Spark UDF; declaring VectorUDT as the return
# type lets Spark serialize the DenseVector result.
fp_udf = udf(smiles_to_dense_fp, returnType=VectorUDT())

# Add a "Fingerprint" column computed from each row's molecule SMILES.
df = df.withColumn(
    "Fingerprint",
    fp_udf(col("molecule_smiles")),
)


In [9]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Encode the protein name as a numeric index column ("Protein_Index").
indexer = StringIndexer(
    inputCol="protein_name",
    outputCol="Protein_Index",
)
# Fit the indexer on the data, then apply it to the same DataFrame.
df = indexer.fit(df).transform(df)

In [10]:
# Preview the first rows with the new Fingerprint and Protein_Index columns.
df.show(n=5)

+--------------------+------------+-----+--------------------+-------------+
|     molecule_smiles|protein_name|binds|         Fingerprint|Protein_Index|
+--------------------+------------+-----+--------------------+-------------+
|C#CCOc1ccc(CNc2nc...|        BRD4|    0|[0.0,1.0,0.0,0.0,...|          0.0|
|C#CCOc1ccc(CNc2nc...|         HSA|    0|[0.0,1.0,0.0,0.0,...|          1.0|
|C#CCOc1ccc(CNc2nc...|         sEH|    0|[0.0,1.0,0.0,0.0,...|          2.0|
|C#CCOc1ccc(CNc2nc...|        BRD4|    0|[0.0,1.0,0.0,0.0,...|          0.0|
|C#CCOc1ccc(CNc2nc...|         HSA|    0|[0.0,1.0,0.0,0.0,...|          1.0|
+--------------------+------------+-----+--------------------+-------------+
only showing top 5 rows



In [11]:
# Concatenate the fingerprint vector and the protein index into one "features"
# vector, and expose the binding flag under the name "label".
# NOTE(review): Protein_Index enters the model as a plain number, which implies
# an ordering between proteins — confirm this is intended.
assembler = VectorAssembler(
    inputCols=["Fingerprint", "Protein_Index"],
    outputCol="features",
)
df = assembler.transform(df).select(
    "features",
    col("binds").alias("label"),
)
# Run a Python garbage-collection pass; the return value is shown as the cell output.
gc.collect()

342

In [12]:
# Preview the assembled feature vectors alongside their labels.
df.show(n=5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(257,[1,4,10,15,1...|    0|
|(257,[1,4,10,15,1...|    0|
|(257,[1,4,10,15,1...|    0|
|(257,[1,4,10,15,2...|    0|
|(257,[1,4,10,15,2...|    0|
+--------------------+-----+
only showing top 5 rows



In [13]:
from pyspark.ml.classification import LogisticRegression

# Randomly partition the rows into roughly 80% training and 20% test data;
# the fixed seed keeps the split repeatable.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [14]:
# Train a logistic regression classifier on the training split.
# NOTE(review): maxIter=1 allows only a single optimizer iteration, so the
# fitted model is a rough baseline at best; raise it once a full fit is affordable.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=1)
model = lr.fit(train_df)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [15]:
# Score the held-out test split with the fitted model.
predictions = model.transform(test_df)

# Show a few rows of true label, predicted class and class probabilities.
predictions.select(["label", "prediction", "probability"]).show(n=5)


NameError: name 'model' is not defined

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Measure ranking quality on the test predictions as area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print(f"Test AUC: {auc:.3f}")


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Build the 2x2 confusion matrix from the Spark predictions. The counts are
# aggregated inside Spark so only a handful of (label, prediction) rows are
# brought back to the driver instead of every test row.
pair_counts = predictions.groupBy("label", "prediction").count().collect()
cm = np.zeros((2, 2), dtype=int)
for row in pair_counts:
    # Rows index the actual class, columns the predicted class.
    cm[int(row["label"]), int(row["prediction"])] = row["count"]

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()
