<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/drug_activity_prediction_belka.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
!pip install pyspark



In [55]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already
# running in this kernel (getOrCreate).
spark = (
    SparkSession.builder
    .appName("HugeParquetProcessor")
    .getOrCreate()
)

In [56]:
# Define filename
dataset_url = "https://huggingface.co/datasets/HoangHa/belka-smiles-train-raw/resolve/main/data/train.parquet"
filename = "train.parquet"
!wget -O $filename $dataset_url


--2025-03-17 23:18:36--  https://huggingface.co/datasets/HoangHa/belka-smiles-train-raw/resolve/main/data/train.parquet
Resolving huggingface.co (huggingface.co)... 18.239.50.16, 18.239.50.103, 18.239.50.49, ...
Connecting to huggingface.co (huggingface.co)|18.239.50.16|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/4f/88/4f882ffde40c5b68f15d4d499c1455831a17d74d14834e270757fbff6f6e08f5/3330782a1855d4d18467fc84e4f2248992d5362fced0f1a2e483d545c642355d?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27train.parquet%3B+filename%3D%22train.parquet%22%3B&Expires=1742257116&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MjI1NzExNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzRmLzg4LzRmODgyZmZkZTQwYzViNjhmMTVkNGQ0OTljMTQ1NTgzMWExN2Q3NGQxNDgzNGUyNzA3NTdmYmZmNmY2ZTA4ZjUvMzMzMDc4MmExODU1ZDRkMTg0NjdmYzg0ZTRmMjI0ODk5MmQ1MzYyZmNlZDBmMWEyZTQ4M2Q1NDVjNjQyMzU1ZD9yZX

In [None]:
# Read the file saved by the download cell. Using `filename` (instead of
# repeating the "train.parquet" literal) keeps the download target and the
# read path from drifting apart.
df = spark.read.parquet(filename)

# Print the column names and types Spark read from the Parquet metadata.
df.printSchema()

In [None]:
import gc

# Discard the row id and the three per-building-block SMILES columns; the
# cells below only reference molecule_smiles, protein_name and binds.
df = df.drop(
    "id",
    "buildingblock1_smiles",
    "buildingblock2_smiles",
    "buildingblock3_smiles",
)

# Run a Python-side garbage collection pass (its return value is the cell output).
gc.collect()

In [None]:
!pip install rdkit

In [None]:
from pyspark.ml.linalg import Vectors, VectorUDT
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType

In [None]:
# Function to convert SMILES to DenseVector
def smiles_to_dense_fp(smiles, radius=2, n_bits=512):
    """Turn a SMILES string into a Morgan-fingerprint vector.

    Args:
        smiles: SMILES string of the molecule. Anything that is not a ``str``
            (for example ``None``, which is how a null column value reaches a
            Python UDF) is treated like an unparseable molecule.
        radius: Morgan fingerprint radius passed to RDKit. Defaults to 2.
        n_bits: Length of the fingerprint bit vector, and therefore of the
            returned vector. Defaults to 512.

    Returns:
        The value of ``Vectors.dense`` built from ``n_bits`` floats: the
        fingerprint bits as 0.0/1.0 when the SMILES parses, otherwise all
        zeros.
    """
    # Guard before calling RDKit: handing it a non-string would raise and
    # abort the whole Spark job instead of yielding the fallback vector.
    if isinstance(smiles, str):
        mol = Chem.MolFromSmiles(smiles)
        # An explicit None check states the intent (parse failure) rather
        # than relying on the truthiness of the molecule object.
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
            return Vectors.dense([float(bit) for bit in fp])  # Convert to DenseVector
    # Fallback for missing or invalid SMILES: a zero vector of the same length.
    return Vectors.dense([0.0] * n_bits)

# Wrap the fingerprint function as a Spark UDF; declaring VectorUDT as the
# return type lets Spark serialize the vectors the function returns.
fp_udf = udf(smiles_to_dense_fp, returnType=VectorUDT())

# Add a "Fingerprint" column computed from each row's molecule_smiles value.
df = df.withColumn(
    "Fingerprint",
    fp_udf(col("molecule_smiles")),
)


In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
# Encode the protein_name strings as numeric indices in a new Protein_Index
# column: fit the indexer on the data, then apply the fitted model to it.
indexer = StringIndexer(
    inputCol="protein_name",
    outputCol="Protein_Index",
)
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)

In [None]:
# Preview the first five rows after the fingerprint and index columns were added.
df.show(n=5)

In [None]:
from pyspark.ml.feature import OneHotEncoder

# Protein_Index is an arbitrary category id from StringIndexer. Feeding it to
# a linear model as one numeric feature would imply an ordering between
# proteins, so expand it into indicator columns first.
encoder = OneHotEncoder(inputCols=["Protein_Index"], outputCols=["Protein_OneHot"])
df = encoder.fit(df).transform(df)

# Concatenate the fingerprint and the encoded protein into one "features"
# vector, and keep only what the classifier needs: features + label (binds).
assembler = VectorAssembler(inputCols=["Fingerprint", "Protein_OneHot"], outputCol="features")
df = assembler.transform(df).select("features", col("binds").alias("label"))
gc.collect()

In [None]:
# Preview the first five rows of the assembled (features, label) frame.
df.show(n=5)

In [None]:
from pyspark.ml.classification import LogisticRegression
# Randomly partition the rows with weights 0.8 / 0.2 into a training frame and
# a test frame; the sampler is seeded with 42.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Train a logistic regression classifier (the estimator used here is
# LogisticRegression, not a random forest).
# NOTE(review): maxIter=1 stops the optimizer after a single iteration, so the
# fitted model is barely trained. Presumably this is a runtime compromise for
# the dataset size; confirm, and raise it before trusting the reported AUC.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=1)
model = lr.fit(train_df)

In [None]:
# Score the held-out rows with the fitted model.
predictions = model.transform(test_df)

# Show the true label next to the model's prediction and probability for a
# handful of rows.
(
    predictions
    .select("label", "prediction", "probability")
    .show(n=5)
)


In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Compute the area under the ROC curve for the test-set predictions and
# report it to three decimal places.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="label",
)
auc = evaluator.evaluate(predictions)
print(f"Test AUC: {auc:.3f}")
