# In [None]:
"""
This script requires an input Excel file named 'input_file.xlsx' (located in
the current working directory) with at least the following columns:
- 'ID': A unique identifier for each molecule.
- 'Smiles': The SMILES string representing the molecule.

The script calculates the Synthetic Accessibility Score (SA_Score), 
a measure of how easy or difficult it is to synthesize a molecule. 
This index is reported in: https://doi.org/10.1186/1758-2946-1-8

It can also be used to update an existing database that already contains 
precomputed values in the 'SA_Score' column. The code automatically detects 
cells with empty values in 'SA_Score' and calculates results only for those, 
leaving previously computed values unchanged. Otherwise, if no values exist in 
'SA_Score', the script will calculate results for the entire database.

Example of an input file structure for updating a database:

ID        Smiles        SA_Score
CMPD001   CCO           2.45
CMPD002   C1CCO1        3.12
CMPD003   CCN(CC)C      2.98
CMPD004   COC           <empty>
CMPD005   CC(=O)O       <empty>

The results are saved in a new Excel file named 'results.xlsx' 
(or 'results_updated.xlsx' if the input already contained values).

Note: When running this script, you may see a warning like:
"[timestamp] DEPRECATION WARNING: please use MorganGenerator".
This warning does not come from this script. It originates from how 
the RDKit `sascorer` module is implemented internally, since it relies 
on a deprecated API for Morgan fingerprints. The script itself is 
working correctly and can be safely used despite this message.
"""

import pandas as pd
from rdkit import Chem
from rdkit.Chem import RDConfig
import os
import sys

# Add the SA_Score path to sys.path. sascorer is loaded from the 'SA_Score'
# folder under RDKit's contrib directory, so that folder must be importable.
# The membership check keeps repeated runs in the same interpreter session
# (e.g. re-executing a notebook cell) from stacking duplicate entries.
_sa_score_dir = os.path.join(RDConfig.RDContribDir, 'SA_Score')
if _sa_score_dir not in sys.path:
    sys.path.append(_sa_score_dir)
import sascorer

# Load the Excel file (the path is relative to the current working directory)
df = pd.read_excel('input_file.xlsx')

# Fail fast with a readable message when a column the script reads is absent,
# instead of a bare KeyError part-way through processing.
missing_columns = [col for col in ('ID', 'Smiles') if col not in df.columns]
if missing_columns:
    raise ValueError(
        "input_file.xlsx is missing required column(s): "
        + ", ".join(missing_columns)
    )

# Ensure the 'SA_Score' column exists; a file without it is treated as having
# every score missing.
if 'SA_Score' not in df.columns:
    df['SA_Score'] = None

# Function to calculate SA_Score
def calculate_sa_score(smiles):
    """Return the Synthetic Accessibility score for a SMILES string.

    Args:
        smiles: SMILES string of the molecule. Missing or non-text values
            (for example ``None`` or the NaN pandas uses for an empty cell)
            are accepted and treated as unscorable.

    Returns:
        The value of ``sascorer.calculateScore`` rounded to two decimals, or
        ``None`` when the input is not a non-blank string, RDKit cannot parse
        it, the parsed molecule has no atoms, or the scorer yields no value.
    """
    # Guard before touching RDKit: MolFromSmiles only accepts text, so a blank
    # cell (NaN/None) would otherwise raise and abort the whole run.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    # Unparsable SMILES gives None; an atom-less molecule has nothing to score.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    score = sascorer.calculateScore(mol)
    # Defensive: never hand None to round().
    if score is None:
        return None
    return round(score, 2)

# Check if there are already computed values. This has to be evaluated before
# the loop fills anything in, otherwise a fresh run would look like an update.
already_has_values = df['SA_Score'].notna().any()

# IDs whose SMILES could not be scored; reported once processing finishes
failed_ids = []

# Calculate only for rows where 'SA_Score' is missing (NaN or None)
for idx, row in df[df['SA_Score'].isna()].iterrows():
    score = calculate_sa_score(row['Smiles'])
    if score is None:
        # The cell is already empty, so leave it as is and do not report this
        # ID as successfully processed.
        failed_ids.append(row['ID'])
        continue
    df.at[idx, 'SA_Score'] = score
    # Update the last successfully processed ID in real time (overwriting in the same line)
    sys.stdout.write(f"\rLast successful ID processed: {row['ID']}")
    sys.stdout.flush()

# Choose output filename depending on whether there were preexisting values
output_file = "results_updated.xlsx" if already_has_values else "results.xlsx"
df.to_excel(output_file, index=False)

print(f"\nProcessing completed. Results saved to '{output_file}'.")

# Summarise rows that stayed empty so they are not silently overlooked
if failed_ids:
    shown = ", ".join(str(failed_id) for failed_id in failed_ids[:20])
    suffix = ", ..." if len(failed_ids) > 20 else ""
    print(
        f"SA_Score could not be calculated for {len(failed_ids)} row(s): "
        f"{shown}{suffix}"
    )
