In [None]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, and pin the version that the recorded output shows was used.
%pip install rdkit==2024.9.5

Collecting rdkit
  Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.5


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

def generate_smiles_variants(smiles):
    """Build several alternative SMILES spellings for one input molecule.

    Args:
        smiles: SMILES string to parse with ``Chem.MolFromSmiles``.

    Returns:
        A dict with the eight keys listed in ``variant_names``. On success each
        value is the string produced by the corresponding ``Chem.MolToSmiles``
        call. If anything fails (including an unparsable SMILES), the error is
        printed and every value is ``None``.
    """
    variant_names = (
        'Canonical_SMILES',
        'Random_SMILES_1',
        'Random_SMILES_2',
        'NonIsomeric_SMILES',
        'Kekule_SMILES',
        'SMILES_with_Hs',
        'Branchless_SMILES',
        'SMILES_2D',
    )

    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            raise ValueError("Invalid SMILES")

        def to_smiles(**options):
            # Thin wrapper so each variant only spells out its own flags.
            return Chem.MolToSmiles(molecule, **options)

        result = {}
        result['Canonical_SMILES'] = to_smiles(canonical=True)
        # Two independent draws with the same flags.
        result['Random_SMILES_1'] = to_smiles(doRandom=True, canonical=False)
        result['Random_SMILES_2'] = to_smiles(doRandom=True, canonical=False)
        result['NonIsomeric_SMILES'] = to_smiles(isomericSmiles=False)
        result['Kekule_SMILES'] = to_smiles(kekuleSmiles=True)
        result['SMILES_with_Hs'] = to_smiles(allHsExplicit=True)

        # Pure string edit: all parentheses are removed from the SMILES text.
        # NOTE(review): the result may no longer describe the same molecule
        # (or be valid SMILES at all) - confirm this is the intended perturbation.
        with_branches = to_smiles(canonical=False, doRandom=False)
        result['Branchless_SMILES'] = with_branches.replace("(", "").replace(")", "")

        # SMILES written after 2D coordinates have been attached to the molecule.
        AllChem.Compute2DCoords(molecule)
        result['SMILES_2D'] = Chem.MolToSmiles(molecule)

        return result

    except Exception as err:
        print(f"Error processing {smiles}: {str(err)}")
        return dict.fromkeys(variant_names)

# NOTE(review): absolute Colab-style path; adjust when running elsewhere.
input_file = "/content/250k_rndm_zinc_drugs_clean_3.csv"
output_file = "enhanced_molecules_top1000.csv"
N_MOLECULES = 1000  # number of leading rows to process

# Read only the rows that are needed instead of loading the whole file and
# discarding everything after the first N_MOLECULES rows.
df = pd.read_csv(input_file, nrows=N_MOLECULES)

# Build the variant table once from a list of dicts rather than constructing
# a pd.Series per row inside apply(); columns and index are unchanged.
variants_df = pd.DataFrame(
    df["smiles"].map(generate_smiles_variants).tolist(),
    index=df.index,
)

enhanced_df = pd.concat([df, variants_df], axis=1)
enhanced_df.to_csv(output_file, index=False)
print(f"Processed data is saved at {output_file}")

前1000行数据处理完成，结果已保存至 enhanced_molecules_top1000.csv


In [None]:
# %pip installs into the running kernel's environment. The code below uses the
# `OpenAI` client class, so require the 1.x API; the specifier is quoted so
# the shell does not interpret `>` as a redirection.
%pip install "openai>=1.0.0"
%pip install python-dotenv==1.0.1

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:

# The requirement must be quoted: unquoted, the shell parses `>=1.0.0` as an
# output redirection to a file named `=1.0.0`, so pip receives no version
# constraint and its output disappears from the cell.
%pip uninstall -y openai
%pip install "openai>=1.0.0"

Found existing installation: openai 1.61.1
Uninstalling openai-1.61.1:
  Successfully uninstalled openai-1.61.1


In [None]:
# Quoted so the shell does not treat `>=1.0.0` as a redirection to a file.
%pip install "openai>=1.0.0"

In [None]:
import os
import time
import json
import logging
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
from typing import Dict, Optional


# Send log records both to a file on disk and to the notebook output stream.
_log_handlers = [
    logging.FileHandler("api_experiment.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)

# Load variables from a .env file (if any) before reading the API key from
# the environment, so the key never has to appear in the notebook itself.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Central experiment settings read by the functions below.
CONFIG = dict(
    # CSV read by main() and CSV it writes the predictions to.
    input_file="enhanced_molecules_top1000.csv",
    output_file="api_predictions.csv",
    # Input columns whose SMILES strings are each sent to the model.
    smiles_columns=[
        'smiles',
        'Canonical_SMILES',
        'Random_SMILES_1',
        'Random_SMILES_2',
        'NonIsomeric_SMILES',
        'Kekule_SMILES',
        'SMILES_with_Hs',
        'Branchless_SMILES',
        'SMILES_2D',
    ],
    # Keys expected in the model's JSON reply; also the ground-truth columns.
    properties=['logP', 'qed', 'SAS'],
    # Chat-completion request parameters.
    model="gpt-4-turbo",
    max_retries=3,
    request_timeout=30,
    temperature=0.3,
    # NOTE(review): batch_size is not referenced by the code visible here.
    batch_size=10,
)

def build_prompt(smiles: str) -> str:
    """Return the user prompt asking for logP, qed and SAS of one molecule.

    Args:
        smiles: Molecule text inserted verbatim at the end of the first line.

    Returns:
        The multi-line prompt string (no trailing newline).
    """
    prompt_lines = [
        f"As a computational chemistry assistant, predict the following properties for the molecule: {smiles}",
        "",
        "Return ONLY a properly formatted JSON object with the following structure:",
        "{",
        '    "logP": <float_value>,    // Predicted octanol-water partition coefficient (3 decimal places)',
        '    "qed": <float_value>,     // Quantitative Estimate of Drug-likeness (0-1, 3 decimals)',
        '    "SAS": <float_value>      // Synthetic Accessibility Score (1-10, 3 decimals)',
        "}",
        "",
        "Important:",
        "1. Do not include any explanatory text",
        "2. Ensure proper JSON syntax",
        "3. Maintain exact key names",
        "4. Values must be numeric",
    ]
    return "\n".join(prompt_lines)

def parse_response(content: str, required_keys: Optional[list] = None) -> Optional[Dict]:
    """Extract and validate the property predictions from a model reply.

    The text between the first ``{`` and the last ``}`` is parsed as JSON.
    Strict parsing is tried first; only if that fails are single quotes and
    Python-style ``True``/``False`` rewritten before a second attempt, so a
    valid JSON reply is never altered.

    Args:
        content: Raw reply text. Non-string values (e.g. ``None``) are
            rejected instead of raising an uncaught ``AttributeError``.
        required_keys: Keys that must be present with numeric values.
            Defaults to ``CONFIG["properties"]``.

    Returns:
        A dict holding exactly the required keys and their numeric values, or
        ``None`` if the reply cannot be parsed or fails validation (the
        reason is logged).
    """
    try:
        if not isinstance(content, str):
            raise TypeError(f"Expected text content, got {type(content)}")

        start = content.find('{')
        end = content.rfind('}') + 1
        if start == -1 or end <= start:
            raise ValueError("No JSON object found in response")
        json_str = content[start:end]

        try:
            parsed = json.loads(json_str)
        except json.JSONDecodeError:
            # Fallback for Python-literal style replies.
            repaired = json_str.replace("'", '"')
            repaired = repaired.replace("True", "true").replace("False", "false")
            parsed = json.loads(repaired)

        if required_keys is None:
            required_keys = CONFIG["properties"]

        if not all(key in parsed for key in required_keys):
            raise ValueError("Missing required keys")

        # Validate only the keys that are consumed downstream, so an extra
        # non-numeric field does not discard an otherwise usable reply.
        # bool is a subclass of int and must be rejected explicitly.
        for key in required_keys:
            value = parsed[key]
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"Invalid type for {key}: {type(value)}")

        return {key: parsed[key] for key in required_keys}

    except (json.JSONDecodeError, ValueError, TypeError, KeyError) as e:
        logging.error(f"error: {str(e)}")
        logging.debug(f"bug: {content}")
        return None

def query_api(smiles: str) -> Optional[Dict]:
    """Ask the chat model for property predictions of one SMILES string.

    Makes up to ``CONFIG["max_retries"] + 1`` attempts. Every failed attempt
    (request error, empty reply, or a reply that ``parse_response`` rejects)
    is logged and followed by an exponential backoff with random jitter.

    Args:
        smiles: SMILES text inserted into the prompt by ``build_prompt``.

    Returns:
        The dict returned by ``parse_response``, or ``None`` once all
        attempts have failed.
    """
    for attempt in range(CONFIG["max_retries"] + 1):
        try:
            response = client.chat.completions.create(
                model=CONFIG["model"],
                messages=[
                    {"role": "system", "content": "You are a helper that only returns valid JSON, and a drug expert"},
                    {"role": "user", "content": build_prompt(smiles)}
                ],
                temperature=CONFIG["temperature"],
                timeout=CONFIG["request_timeout"]
            )

            if not response.choices:
                raise ValueError("Empty API response")

            content = response.choices[0].message.content
            parsed = parse_response(content)

            if parsed:
                return parsed

            # Route an unusable reply through the same retry path as request
            # errors; previously it retried immediately with no backoff and
            # the final failure was never logged.
            raise ValueError("Unparseable API response")

        except Exception as e:
            if attempt < CONFIG["max_retries"]:
                # Exponential backoff plus up to one second of jitter.
                sleep_time = 2 ** attempt + np.random.uniform(0, 1)
                logging.warning(f"Try {attempt+1}/{CONFIG['max_retries']} 失败: {str(e)} - 等待 {sleep_time:.1f}s")
                time.sleep(sleep_time)
            else:
                logging.error(f"all retry fail: {smiles}")
                return None

    # Only reached when the loop body never ran (negative max_retries).
    return None

def process_batch(df: pd.DataFrame) -> pd.DataFrame:
    """Query the model for every SMILES column of every row.

    Args:
        df: Frame containing all columns named in ``CONFIG["smiles_columns"]``.

    Returns:
        A float frame with the same index as ``df`` and MultiIndex columns
        ``(SMILES Type, Properties)``. Cells stay NaN where the SMILES value
        is missing or no usable prediction was obtained.
    """
    # Start from float NaN (not an empty object-dtype frame) so the stored
    # predictions are numeric for later arithmetic.
    results = pd.DataFrame(
        np.nan,
        index=df.index,
        columns=pd.MultiIndex.from_product(
            [CONFIG["smiles_columns"], CONFIG["properties"]],
            names=['SMILES Type', 'Properties']
        ),
        dtype=float
    )

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Handle Molecules"):
        for col in CONFIG["smiles_columns"]:
            smiles = row[col]

            if pd.isna(smiles):
                # idx is the row label and col the column name; the original
                # message had the two words swapped.
                logging.warning(f"Row {idx} column {col} contains empty value")
                continue

            predictions = query_api(smiles)

            for prop in CONFIG["properties"]:
                results.loc[idx, (col, prop)] = predictions.get(prop, np.nan) if predictions else np.nan

    return results

def analyze_results(predictions: pd.DataFrame, true_values: pd.DataFrame,
                    smiles_columns: Optional[list] = None,
                    properties: Optional[list] = None) -> pd.DataFrame:
    """Compare predicted and reference property values per SMILES variant.

    Args:
        predictions: Frame with ``(smiles_column, property)`` columns.
        true_values: Frame with one reference column per property, sharing
            the index of ``predictions``.
        smiles_columns: Variants to evaluate; defaults to
            ``CONFIG["smiles_columns"]``.
        properties: Properties to evaluate; defaults to
            ``CONFIG["properties"]``.

    Returns:
        One row per (variant, property) pair that has at least five complete
        prediction/reference pairs, with MAE, RMSE, R², the paired t-test
        p-value and the sample count. R² is NaN when the reference values
        have zero variance.
    """
    if smiles_columns is None:
        smiles_columns = CONFIG["smiles_columns"]
    if properties is None:
        properties = CONFIG["properties"]

    analysis = []

    for col in smiles_columns:
        for prop in properties:
            # Coerce to numeric so object-dtype inputs are handled, then keep
            # only rows where both values are present.
            paired = pd.DataFrame({
                'true': pd.to_numeric(true_values[prop], errors='coerce'),
                'pred': pd.to_numeric(predictions[(col, prop)], errors='coerce'),
            }).dropna()

            if len(paired) < 5:
                continue

            errors = paired['pred'] - paired['true']
            squared_error_sum = float(np.sum(errors ** 2))

            mae = float(np.mean(np.abs(errors)))
            rmse = float(np.sqrt(np.mean(errors ** 2)))

            # R² is undefined when every reference value is identical.
            total_variation = float(np.sum((paired['true'] - paired['true'].mean()) ** 2))
            r2 = 1 - squared_error_sum / total_variation if total_variation > 0 else np.nan

            _, p_value = stats.ttest_rel(paired['pred'], paired['true'])

            analysis.append({
                'SMILES Type': col,
                # The original key was the literal '{Properties}' with stray braces.
                'Properties': prop,
                'MAE': mae,
                'RMSE': rmse,
                'R²': r2,
                'p value': p_value,
                '#samples': len(paired)
            })

    return pd.DataFrame(analysis)
def main():
    """Run the experiment: predict, save the predictions, print the metrics.

    Reads the first 500 rows of ``CONFIG["input_file"]``, queries the model
    through ``process_batch``, writes inputs plus predictions to
    ``CONFIG["output_file"]`` and prints the table from ``analyze_results``.
    Any exception is logged at CRITICAL level with its traceback rather than
    propagated.
    """
    try:
        df = pd.read_csv(CONFIG["input_file"]).head(500)
        logging.info(f"process {len(df)} lines data")

        predictions = process_batch(df)

        final_df = pd.concat([df, predictions], axis=1)
        final_df.to_csv(CONFIG["output_file"], index=False)
        logging.info(f"The outcome is saved at {CONFIG['output_file']}")

        # Analyse the results
        analysis_df = analyze_results(predictions, df[CONFIG["properties"]])
        # "\U" starts a \UXXXXXXXX escape, so the original "\Ultimate Result"
        # was a SyntaxError; a leading newline was intended.
        print("\nUltimate Result")
        print(analysis_df.round(3).to_string(index=False))

    except Exception as e:
        logging.critical(f"The main process fails: {str(e)}", exc_info=True)

if __name__ == "__main__":
    main()

处理分子: 100%|██████████| 500/500 [2:18:21<00:00, 16.60s/it]


=== 最终分析结果 ===
          SMILES类型   性质   MAE  RMSE      R²  p值  样本量
            smiles logP 0.843 1.128   0.365 0.0  500
            smiles  qed 0.199 0.226  -1.615 0.0  500
            smiles  SAS 2.261 2.493  -9.188 0.0  500
  Canonical_SMILES logP 0.840 1.123   0.370 0.0  500
  Canonical_SMILES  qed 0.199 0.226  -1.613 0.0  500
  Canonical_SMILES  SAS 2.189 2.413  -8.545 0.0  500
   Random_SMILES_1 logP 0.816 1.080   0.418 0.0  500
   Random_SMILES_1  qed 0.190 0.221  -1.498 0.0  500
   Random_SMILES_1  SAS 2.366 2.583  -9.941 0.0  500
   Random_SMILES_2 logP 0.836 1.105   0.390 0.0  500
   Random_SMILES_2  qed 0.196 0.226  -1.620 0.0  500
   Random_SMILES_2  SAS 2.389 2.605 -10.129 0.0  500
NonIsomeric_SMILES logP 0.889 1.184   0.300 0.0  500
NonIsomeric_SMILES  qed 0.220 0.248  -2.166 0.0  500
NonIsomeric_SMILES  SAS 2.156 2.393  -8.389 0.0  500
     Kekule_SMILES logP 0.787 1.055   0.444 0.0  500
     Kekule_SMILES  qed 0.192 0.219  -1.469 0.0  500
     Kekule_SMILES  SAS 2.266 


