In [None]:
print('good to go!')

In [2]:
!pip install --upgrade pip
!pip install fsspec
!pip install pytz

Collecting pip
  Using cached pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.0-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.2
    Uninstalling pip-23.3.2:
      Successfully uninstalled pip-23.3.2
Successfully installed pip-24.0


In [3]:
import pandas as pd

def read_parquet_to_pandas_dataframe(file_path):
    """
    Reads a Parquet file into a Pandas DataFrame.
    
    Parameters:
    file_path (str): The path to the Parquet file.
    
    Returns:
    pandas.DataFrame: The loaded Pandas DataFrame.
    """
    df = pd.read_parquet(file_path, engine='pyarrow')
    return df



In [4]:
df_sEH = read_parquet_to_pandas_dataframe('df_sEH.parquet')


In [5]:
print(f"Dataframe size: {df_sEH.shape}")

Dataframe size: (2898129, 3)


In [9]:
# # Assuming df_sEH is your DataFrame
# df_sEH = df_sEH.sample(frac=0.2, random_state=42)  # 20% random sample

# print(f"New DataFrame size: {df_sEH.shape}")

New DataFrame size: (579626, 3)


## Running Deep Chem

In [6]:
!pip install deepchem

Collecting deepchem
  Using cached deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem)
  Using cached rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Using cached deepchem-2.8.0-py3-none-any.whl (1.0 MB)
Using cached rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
Installing collected packages: rdkit, deepchem
Successfully installed deepchem-2.8.0 rdkit-2023.9.6


In [7]:
import deepchem as dc
from deepchem.models import GraphConvModel
import numpy as np

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-06-16 19:47:41.682708: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/opt/conda/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'haiku'


In [8]:
# Featurize SMILES strings
featurizer = dc.feat.ConvMolFeaturizer()

In [9]:
X = featurizer.featurize(df_sEH['molecule_smiles'])
y = np.array(df_sEH['binds'])

In [10]:
# Create DeepChem dataset
dataset = dc.data.NumpyDataset(X, y)

In [11]:
# Split the dataset into training and validation sets
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset = splitter.train_test_split(dataset, frac_train=0.8)

In [12]:
# Define a Graph Convolutional Network (GCN) model
model = GraphConvModel(n_tasks=1, mode='classification', dropout=0.2, learning_rate=0.001)


2024-06-16 21:49:17.291864: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20763 MB memory:  -> device: 0, name: NVIDIA L4, pci bus id: 0000:4d:00.0, compute capability: 8.9


In [17]:

# Fit the model
# Custom training loop with verbosity
nb_epoch = 30
for epoch in range(nb_epoch):
    loss = model.fit(train_dataset, nb_epoch=1)
    train_score = model.evaluate(train_dataset, [dc.metrics.roc_auc_score])
    valid_score = model.evaluate(valid_dataset, [dc.metrics.roc_auc_score])
    print(f"Epoch {epoch+1}/{nb_epoch}")
    print(f"  Training Loss: {loss}")
    print(f"  Train ROC-AUC Score: {train_score['metric-1']}")
    print(f"  Valid ROC-AUC Score: {valid_score['metric-1']}")



Epoch 1/30
  Training Loss: 0.06269567779132298
  Train ROC-AUC Score: 0.9958847802384991
  Valid ROC-AUC Score: 0.9955397338701848
Epoch 2/30
  Training Loss: 0.07415872812271118
  Train ROC-AUC Score: 0.9957245235254026
  Valid ROC-AUC Score: 0.9953940258017295
Epoch 3/30
  Training Loss: 0.07397176265716553
  Train ROC-AUC Score: 0.9959354700224914
  Valid ROC-AUC Score: 0.9955977702436463
Epoch 4/30
  Training Loss: 0.07255287503087243
  Train ROC-AUC Score: 0.9960353532536597
  Valid ROC-AUC Score: 0.9957248907523502
Epoch 5/30
  Training Loss: 0.08305994007322523
  Train ROC-AUC Score: 0.9959455633810921
  Valid ROC-AUC Score: 0.9956213206355176
Epoch 6/30
  Training Loss: 0.07306069341199152
  Train ROC-AUC Score: 0.9960936526777013
  Valid ROC-AUC Score: 0.995764955270541
Epoch 7/30
  Training Loss: 0.06694177605889061
  Train ROC-AUC Score: 0.9961269939827713
  Valid ROC-AUC Score: 0.9957940954828748
Epoch 8/30
  Training Loss: 0.07615394592285156
  Train ROC-AUC Score: 0.9962

KeyboardInterrupt: 

In [14]:
# Evaluate the model
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
train_score = model.evaluate(train_dataset, [metric])
valid_score = model.evaluate(valid_dataset, [metric])
print(f"Train ROC-AUC Score: {train_score['roc_auc_score']}")
print(f"Valid ROC-AUC Score: {valid_score['roc_auc_score']}")

Train ROC-AUC Score: 0.995704347514857
Valid ROC-AUC Score: 0.9953787798164365


In [18]:
import os

# Directory to save the model
save_dir = 'deepchem_model_sEH_30E'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Save the model
model.save_checkpoint(model_dir=save_dir)


In [19]:
## Testing

In [20]:
# Specify your S3 Bucket and file key
bucket = 'kaggle-leash-bio'
test_parquet_key = 'test.parquet'
test_parquet_location = f's3://{bucket}/{test_parquet_key}'

In [21]:
# Open the Parquet file
df = pd.read_parquet(test_parquet_location, engine='pyarrow')

In [22]:
# Filter for molecules binding with the sEH protein
df_sEH_test = df[df['protein_name'] == 'sEH']


In [23]:
# # Assuming df_sEH is your DataFrame
# df_sEH_test = df_sEH_test.sample(frac=0.001, random_state=42)  # 20% random sample


In [24]:
X_test = featurizer.featurize(df_sEH_test['molecule_smiles'].tolist())

# Create DeepChem dataset
dataset = dc.data.NumpyDataset(X_test)

In [None]:
#In case of loading a good past model:
model.restore(model_dir='deepchem_model_sEH_10E')

In [33]:
# Predict bindings
predictions = model.predict(dataset)

# Extract the probability of the positive class (binding)
probabilities = predictions[:, 0, 1]  # Assuming the second column corresponds to the positive class

In [34]:
# Create resulting DataFrame with 'id' and 'binds' columns
result_df = pd.DataFrame({
    'id': df_sEH_test['id'],
    'binds': probabilities
})
# Display the resulting DataFrame
result_df

Unnamed: 0,id,binds
2,295246832,0.000009
5,295246835,0.000025
8,295246838,0.000025
11,295246841,0.002058
14,295246844,0.019811
...,...,...
1674883,296921713,0.008101
1674886,296921716,0.025863
1674889,296921719,0.000003
1674892,296921722,0.000517


In [35]:
df_sEH_test

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name
2,295246832,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,sEH
5,295246835,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,sEH
8,295246838,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC1(C)CCCC1(O)CN,C#CCCC[C@H](Nc1nc(NCC2(O)CCCC2(C)C)nc(Nc2ccc(C...,sEH
11,295246841,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,COC(=O)c1cc(Cl)sc1N,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2sc(Cl)c...,sEH
14,295246844,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CSC1CCC(CN)CC1,C#CCCC[C@H](Nc1nc(NCC2CCC(SC)CC2)nc(Nc2ccc(C=C...,sEH
...,...,...,...,...,...,...
1674883,296921713,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1nncs1,Cn1ncc2cc(N)ccc21,Cn1ncc2cc(Nc3nc(Nc4nncs4)nc(N[C@@H](CCCN=[N+]=...,sEH
1674886,296921716,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1nncs1,NCC1CCC2CC2C1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCC2CCC3CC3C2)nc(Nc2...,sEH
1674889,296921719,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,COC(=O)c1ccnc(N)c1,COC(=O)c1ccnc(Nc2nc(Nc3noc4ccc(F)cc34)nc(N[C@@...,sEH
1674892,296921722,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,COC1CCC(CCN)CC1,COC1CCC(CCNc2nc(Nc3noc4ccc(F)cc34)nc(N[C@@H](C...,sEH


In [36]:
# Optionally, save the resulting DataFrame to a CSV file
result_df.to_csv('sEH_predictions_10E.csv', index=False)

In [37]:
# Load the prediction CSV files
she_predictions = pd.read_csv('sEH_predictions_10E.csv')
hsa_predictions = pd.read_csv('HSA_predictions_20.csv')
brd4_predictions = pd.read_csv('BRD4_predictions_20.csv')

# Concatenate the DataFrames
all_predictions = pd.concat([she_predictions, hsa_predictions, brd4_predictions])

# Sort by the 'id' column
all_predictions_sorted = all_predictions.sort_values(by='id')

# Save to a new CSV file
all_predictions_sorted.to_csv('final_submission_sEH_10E.csv', index=False)

print("final_submission.csv created successfully.")

final_submission.csv created successfully.


In [38]:
all_predictions_sorted

Unnamed: 0,id,binds
0,295246830,0.000256
0,295246831,0.004696
0,295246832,0.000009
1,295246833,0.002095
1,295246834,0.021178
...,...,...
557893,296921721,0.017274
558140,296921722,0.000517
558858,296921723,0.005677
557894,296921724,0.021877
