In [1]:
!pip install deepchem
!pip install torch



In [2]:
import torch
import deepchem as dc
from deepchem.models.torch_models import MATModel
import os

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-06-19 06:38:59.751635: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/opt/conda/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'haiku'


In [24]:
def featurize_data(df, n_jobs=-1):
    smiles = df['molecule_smiles'].tolist()
    X = featurizer.featurize(smiles)
    y = df['binds'].tolist()
    dataset = dc.data.NumpyDataset(X, y)
    return dataset

## Quick Setup


In [4]:
# Featurize SMILES strings using ConvMolFeaturizer for MAT
featurizer = dc.feat.MATFeaturizer()

In [15]:
import pandas as pd

def read_parquet_to_pandas_dataframe(file_path):
    """
    Reads a Parquet file into a Pandas DataFrame.
    
    Parameters:
    file_path (str): The path to the Parquet file.
    
    Returns:
    pandas.DataFrame: The loaded Pandas DataFrame.
    """
    df = pd.read_parquet(file_path, engine='pyarrow')
    return df


In [16]:
df_HSA = read_parquet_to_pandas_dataframe('df_HSA.parquet')

In [32]:
# # Assuming df_HSA is your DataFrame
df_HSA = df_HSA.sample(frac=0.2, random_state=42)  # 20% random sample

print(f"New DataFrame size: {df_HSA.shape}")

New DataFrame size: (26, 3)


In [30]:
from rdkit import RDLogger

# Suppress RDKit warnings
RDLogger.DisableLog('rdApp.*')


In [31]:
train_dataset_HSA = featurize_data(df_HSA)

In [33]:
splitter = dc.splits.RandomSplitter()

train_dataset_HSA, valid_dataset_HSA = splitter.train_test_split(train_dataset_HSA, frac_train=0.8)

## Training

In [34]:
def train_and_evaluate(dataset_train, dataset_valid, save_dir):
    model = MATModel(
        mode='classification',
        n_tasks=1,
        number_of_layers=6,  # Number of Transformer layers
        d_model=256,         # Dimensionality of model embeddings
        num_heads=8,         # Number of attention heads
        dropout=0.1,
        learning_rate=0.001
    )

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Training with early stopping and learning rate scheduler
    best_valid_score = 0
    patience = 5
    patience_counter = 0
    initial_lr = 0.001

    nb_epoch = 50
    for epoch in range(nb_epoch):
        loss = model.fit(dataset_train, nb_epoch=1)
        
        # Update learning rate
        new_lr = initial_lr * (0.9 ** (epoch // 10))  # Reduce learning rate every 10 epochs
        model.optimizer.learning_rate = new_lr
        
        train_score = model.evaluate(dataset_train, [dc.metrics.roc_auc_score])
        valid_score = model.evaluate(dataset_valid, [dc.metrics.roc_auc_score])
        print(f"Epoch {epoch+1}/{nb_epoch}")
        print(f"  Training Loss: {loss}")
        print(f"  Train ROC-AUC Score: {train_score['mean-roc_auc_score']}")
        print(f"  Valid ROC-AUC Score: {valid_score['mean-roc_auc_score']}")

        # Early stopping
        if valid_score['mean-roc_auc_score'] > best_valid_score:
            best_valid_score = valid_score['mean-roc_auc_score']
            model.save_checkpoint(model_dir=save_dir)
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    print("Training completed.")
    model.restore(model_dir=save_dir)
    print("Best model restored.")
    
    return model

In [36]:
model_HSA = train_and_evaluate(train_dataset_HSA, valid_dataset_HSA, 'mat_model_HSA')

RuntimeError: Found dtype Long but expected Float

## Predicting

In [None]:
# Specify your S3 Bucket and file key
bucket = 'kaggle-leash-bio'
test_parquet_key = 'test.parquet'
test_parquet_location = f's3://{bucket}/{test_parquet_key}'

In [None]:
# Open the Parquet file
df = pd.read_parquet(test_parquet_location, engine='pyarrow')

In [None]:
# Filter for molecules binding with the HSA protein
df_HSA_test = df[df['protein_name'] == 'HSA']


In [None]:
# Reduce for quick view of the test data
df_HSA_test = df_HSA_test.sample(frac=0.001, random_state=42)  # 20% random sample
print(f"New DataFrame size: {df_HSA_test.shape}")

In [None]:
X_test = featurizer.featurize(df_HSA_test['molecule_smiles'].tolist())

# Create DeepChem dataset
dataset = dc.data.NumpyDataset(X_test)

In [None]:
# Predict bindings
predictions = model.predict(dataset)

# Extract the probability of the positive class (binding)
probabilities = predictions[:, 0, 1]  # Assuming the second column corresponds to the positive class

In [18]:
# Create resulting DataFrame with 'id' and 'binds' columns
result_df = pd.DataFrame({
    'id': df_HSA_test['id'],
    'binds': probabilities
})
# Display the resulting DataFrame
result_df

In [None]:
# Optionally, save the resulting DataFrame to a CSV file
result_df.to_csv('MAT_HSA_predictions_50E.csv', index=False)