In [4]:
from transformers import BertModel, BertTokenizer, RobertaTokenizer,AutoTokenizer, AutoModelForMaskedLM,AutoModel 
import torch

model_name = 'DeepChem/ChemBERTa-77M-MLM'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

def get_bert_embeddings(smiles_strings):
    encoded_input = tokenizer(smiles_strings, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**encoded_input)
    embeddings = outputs.last_hidden_state[:, 0, :]  # Using the [CLS] token embedding from last hidden state
    return embeddings


Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
smiles_strings = ["C1CCCCC1", "C1=CC=CC=C1"]
embeddings = get_bert_embeddings(smiles_strings)

In [7]:
import pandas as pd
train_data = pd.read_csv('/home/parsa/smiles_classification/training_w_features.csv').sample(frac=1)
# val_data = pd.read_csv('/home/parsa/smiles_classification/validation_w_features.csv').sample(frac=1)
val_data = pd.read_csv('/home/parsa/smiles_classification/Features+ SMILES.csv').rename({'RESULTS':'RESULT'},axis=1) #pd.read_csv('/home/parsa/smiles_classification/data_validation.csv')


In [13]:
train_data.groupby('Results').count()

Unnamed: 0_level_0,SMILES,Molecular Weight,LogP,Number of Atoms,Number of Bonds,Number of Rings,Rotatable Bonds Count,Hydrogen Bond Donors,Hydrogen Bond Acceptors,Number of Stereocenters,Topological Polar Surface Area (TPSA)
Results,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,203,203,203,203,203,203,203,203,203,203,203
1,203,203,203,203,203,203,203,203,203,203,203


In [16]:
val_data.shape

(50, 12)

In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = SVC()

embeddings = get_bert_embeddings(train_data.SMILES.tolist()) 


clf.fit(embeddings.numpy() , train_data.Results)

val_embeddings = get_bert_embeddings(val_data.SMILES.tolist())

y_pred = clf.predict(val_embeddings.numpy())
print(classification_report(val_data.RESULT, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.72      0.68        25
           1       0.68      0.60      0.64        25

    accuracy                           0.66        50
   macro avg       0.66      0.66      0.66        50
weighted avg       0.66      0.66      0.66        50



In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


def process_X_data(smiles, features):
    scaler = StandardScaler()

    embeddings = get_bert_embeddings(smiles).numpy()
    combined_features = np.concatenate((embeddings, features), axis=1)
    return scaler.fit_transform(combined_features)

In [8]:
clf = SVC()
feature_columns = ['Molecular Weight', 'LogP', 'Number of Atoms',
       'Number of Bonds', 'Number of Rings', 'Rotatable Bonds Count',
       'Hydrogen Bond Donors', 'Hydrogen Bond Acceptors',
       'Number of Stereocenters', 'Topological Polar Surface Area (TPSA)'] # Add all your feature column names


train_x = process_X_data(train_data.SMILES.tolist(), train_data[feature_columns].values ) 
clf.fit(train_x , train_data.Results)

val_x = process_X_data(val_data.SMILES.tolist(), val_data[feature_columns].values ) 
y_pred = clf.predict(val_x)

print(classification_report(val_data.RESULT, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.25      0.40        12

    accuracy                           0.25        12
   macro avg       0.50      0.12      0.20        12
weighted avg       1.00      0.25      0.40        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB  # Gaussian for continuous features, Multinomial for discrete
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 


# Define a pipeline if scaling is needed (e.g., for SVM or KNN)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])

# Define parameter grids for different classifiers
param_grid = [
    {'classifier': [SVC()],
     'scaler': [StandardScaler()],
     'classifier__C': [0.1, 1, 10],
     'classifier__kernel': ['linear', 'rbf', 'poly', 'rbf', 'sigmoid', 'precomputed']},
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [10, 50, 100],
     'classifier__max_features': ['sqrt', 'log2']},
    {'classifier': [GradientBoostingClassifier()],
     'classifier__n_estimators': [50, 100, 150],
     'classifier__learning_rate': [0.01, 0.1, 0.2]},
    {'classifier': [LogisticRegression()],
     'scaler': [StandardScaler()],
     'classifier__C': [0.1, 1, 10]},
    {'classifier': [KNeighborsClassifier()],
     'scaler': [StandardScaler()],
     'classifier__n_neighbors': [3, 5, 7, 9, 11, 13]},
     {
        'classifier': [DecisionTreeClassifier()],
        'classifier__criterion': ['gini', 'entropy'],  # 'entropy' is used for the Information Gain in C4.5
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 10]
    },
    {
        'classifier': [GaussianNB()]  # No hyperparameters for tuning typically
    },
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.1, 1.0, 10.0]  # Smoothing parameter
    }
]
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score)  # Default is binary; adjust as needed
}
# Create a GridSearchCV object
grid_search = GridSearchCV(
    pipe, 
    param_grid, 
    scoring=scoring, 
    refit='precision',  # Choose one metric to use for refitting the best model
    cv=10, 
    return_train_score=True,
    n_jobs=-1
)

# Assuming X_train, y_train are your data prepared earlier
grid_search.fit(train_x,  train_data.Results)

# Best model after grid search
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Best parameters: {'classifier': GradientBoostingClassifier(), 'classifier__learning_rate': 0.01, 'classifier__n_estimators': 150}
Best cross-validated score: 0.7147808916229967


In [10]:
results = grid_search.cv_results_

import pandas as pd

# Convert results to a DataFrame for easier handling and visualization
df_results = pd.DataFrame(results)
# selected_columns = [col for col in df_results.columns if 'param_' in col or 'test_accuracy' in col or 'test_precision' in col]
# df_results = df_results[selected_columns]
display(df_results[['param_classifier','mean_test_precision',	'mean_test_accuracy','mean_test_recall',	'mean_test_f1',	'rank_test_precision']].sort_values('rank_test_precision').drop_duplicates('param_classifier'))


Unnamed: 0,param_classifier,mean_test_precision,mean_test_accuracy,mean_test_recall,mean_test_f1,rank_test_precision
26,GradientBoostingClassifier(),0.714781,0.70689,0.705238,0.704958,1
51,DecisionTreeClassifier(),0.660704,0.647317,0.625714,0.637652,10
19,RandomForestClassifier(),0.650057,0.620793,0.561429,0.594561,11
13,SVC(),0.643752,0.638171,0.635714,0.636585,12
33,LogisticRegression(),0.630598,0.62561,0.640714,0.628411,25
36,KNeighborsClassifier(),0.5847,0.593232,0.625238,0.599511,44
54,GaussianNB(),0.551349,0.558963,0.590238,0.563946,50
55,MultinomialNB(),,,,,53


In [6]:
results = grid_search.cv_results_

import pandas as pd

# Convert results to a DataFrame for easier handling and visualization
df_results = pd.DataFrame(results)
# selected_columns = [col for col in df_results.columns if 'param_' in col or 'test_accuracy' in col or 'test_precision' in col]
# df_results = df_results[selected_columns]
display(df_results[['param_classifier','mean_test_precision',	'mean_test_accuracy','mean_test_recall',	'mean_test_f1',	'rank_test_precision']].sort_values('rank_test_precision').drop_duplicates('param_classifier'))


Unnamed: 0,param_classifier,mean_test_precision,mean_test_accuracy,mean_test_recall,mean_test_f1,rank_test_precision
24,GradientBoostingClassifier(),0.715444,0.714268,0.710714,0.709894,1
33,LogisticRegression(),0.653751,0.652439,0.666429,0.653442,9
10,SVC(),0.651269,0.6625,0.700476,0.670096,11
50,DecisionTreeClassifier(),0.650541,0.653171,0.660476,0.653065,12
23,RandomForestClassifier(),0.645208,0.630793,0.588333,0.607606,13
38,KNeighborsClassifier(),0.593355,0.603598,0.636905,0.608922,37
54,GaussianNB(),0.581815,0.58622,0.651429,0.60979,40
55,MultinomialNB(),,,,,53


In [31]:
results = grid_search.cv_results_

import pandas as pd

# Convert results to a DataFrame for easier handling and visualization
df_results = pd.DataFrame(results)
selected_columns = [col for col in df_results.columns if 'param_' in col or 'test_accuracy' in col or 'test_precision' in col]
df_results = df_results[selected_columns]
display(df_results[['param_classifier','mean_test_precision',	'mean_test_accuracy',	'rank_test_precision']].sort_values('rank_test_precision'))


Unnamed: 0,param_classifier,mean_test_precision,mean_test_accuracy,rank_test_precision
25,GradientBoostingClassifier(),0.720489,0.707012,1
24,GradientBoostingClassifier(),0.711206,0.704512,2
26,GradientBoostingClassifier(),0.705187,0.697134,3
27,GradientBoostingClassifier(),0.672727,0.672683,4
10,SVC(),0.670787,0.672378,5
28,GradientBoostingClassifier(),0.665368,0.655366,6
29,GradientBoostingClassifier(),0.658973,0.655427,7
18,RandomForestClassifier(),0.655579,0.631098,8
32,GradientBoostingClassifier(),0.652968,0.645488,9
42,DecisionTreeClassifier(),0.650698,0.645183,10


In [22]:
# the results for only smile embeddings
results = grid_search.cv_results_

import pandas as pd

# Convert results to a DataFrame for easier handling and visualization
df_results = pd.DataFrame(results)
selected_columns = [col for col in df_results.columns if 'param_' in col or 'test_accuracy' in col or 'test_precision' in col]
df_results = df_results[selected_columns]
display(df_results[['param_classifier','mean_test_precision',	'std_test_precision',	'rank_test_precision']].sort_values('rank_test_precision'))


Unnamed: 0,param_classifier,mean_test_precision,std_test_precision,rank_test_precision
33,LogisticRegression(),0.633377,0.072645,1
18,RandomForestClassifier(),0.619565,0.083826,2
0,SVC(),0.61665,0.056337,3
8,SVC(),0.616093,0.070229,4
14,SVC(),0.614168,0.046784,5
15,SVC(),0.612085,0.073829,6
13,SVC(),0.612085,0.073829,6
32,GradientBoostingClassifier(),0.609768,0.057518,8
4,SVC(),0.609615,0.196425,9
10,SVC(),0.605718,0.052138,10


In [32]:
from sklearn.metrics import classification_report

# Predict on the test data
y_pred = grid_search.predict(val_x) 
# Print classification report
print(classification_report(val_data.RESULT, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.68      0.68        25
           1       0.68      0.68      0.68        25

    accuracy                           0.68        50
   macro avg       0.68      0.68      0.68        50
weighted avg       0.68      0.68      0.68        50



In [1]:
import boto3
AWS_ACCESS_KEY_ID = "AKIA5KLWPDHXUMLAB6EO"
AWS_SECRET_ACCESS_KEY= "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"

# model.trunk.set_chunk_size(64)
s3 = boto3.client(
    's3',
                     aws_access_key_id=AWS_ACCESS_KEY_ID, 
                      aws_secret_access_key=AWS_SECRET_ACCESS_KEY, 
                      
)
def upload_pdb_file(bucket, key, upload_path):
    s3.upload_file(upload_path, bucket, key)

In [2]:
bucket_name = 'model-server-data'
upload_pdb_file(bucket_name, 'albert_pdb/vae_train.zip', '/data/parsa/AFDB/train.zip')
