# In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from sklearn.ensemble import RandomForestClassifier
import joblib

# Read the CSV of generated SMILES strings into a DataFrame; the code below
# expects it to provide a 'canonical_smiles' column.
new_data = pd.read_csv(filepath_or_buffer='test_data_QSAR.csv')

# Calculate Lipinski descriptors for the new data
def calculate_lipinski_descriptors(smiles):
    """Compute four Lipinski-style descriptors for one SMILES string.

    Args:
        smiles: SMILES representation of a molecule. Non-string values
            (e.g. ``None`` or the float NaN that stands in for a missing
            cell) and blank strings are treated as invalid instead of being
            handed to the RDKit parser.

    Returns:
        A four-element list ``[MolWt, MolLogP, NumHDonors, NumHAcceptors]``.
        When the input is invalid or RDKit cannot parse it, the list is
        ``[None, None, None, None]`` so callers always get the same shape.
    """
    # Reject anything that is not a non-blank string up front: the parser
    # expects text, and one bad cell should not abort a whole column apply.
    if not isinstance(smiles, str) or not smiles.strip():
        return [None, None, None, None]

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals an unparsable SMILES by returning None.
    if mol is None:
        return [None, None, None, None]

    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Lipinski.NumHDonors(mol),
        Lipinski.NumHAcceptors(mol),
    ]

# Descriptor columns, in the order calculate_lipinski_descriptors returns them;
# the same columns are used as the model's input features.
feature_columns = ['mol_weight', 'mol_logP', 'num_H_donors', 'num_H_acceptors']

# Build the descriptor table from a list of rows in one step. This avoids
# creating one Series per molecule (`.apply(pd.Series)`), also works when the
# input has no rows, and keeps the original index so rows stay aligned.
# dtype=float turns the None placeholders for unparsable SMILES into NaN.
descriptor_rows = [
    calculate_lipinski_descriptors(smiles)
    for smiles in new_data['canonical_smiles']
]
new_data[feature_columns] = pd.DataFrame(
    descriptor_rows,
    columns=feature_columns,
    index=new_data.index,
    dtype=float,
)

# Select features for prediction
X_new = new_data[feature_columns]

# Rows whose SMILES could not be parsed have NaN descriptors; they must not be
# scored, since NaN features make predict fail or return meaningless labels
# (depending on the estimator / scikit-learn version).
valid_rows = X_new.notna().all(axis=1)

# Load the trained model
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code - only load model files that come from a trusted source.
loaded_model = joblib.load('qsar_model.joblib')

# Use the loaded model to make predictions on the new data
if len(X_new) and valid_rows.all():
    # Clean input: identical to predicting on the full feature table.
    predictions_new = loaded_model.predict(X_new)
else:
    # Score only the fully-featurised rows; the rest stay missing (pd.NA).
    masked_predictions = pd.Series(pd.NA, index=new_data.index, dtype=object)
    if valid_rows.any():
        masked_predictions.loc[valid_rows] = loaded_model.predict(X_new[valid_rows])
    predictions_new = masked_predictions.to_numpy()

# Add the predictions to the new dataset
new_data['predicted_medical_relevance'] = predictions_new

# Save the updated dataset with predictions (missing predictions are written
# as empty cells)
new_data.to_csv('predicted_results.csv', index=False)

