In [1]:
!pip install -U deepeval

Collecting deepeval
  Downloading deepeval-3.7.4-py3-none-any.whl.metadata (18 kB)
Collecting anthropic (from deepeval)
  Downloading anthropic-0.75.0-py3-none-any.whl.metadata (28 kB)
Collecting click<8.3.0,>=8.0.0 (from deepeval)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting ollama (from deepeval)
  Downloading ollama-0.6.1-py3-none-any.whl.metadata (4.3 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc<2.0.0,>=1.24.0 (from deepeval)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.0-py3-none-any.whl.metadata (2.5 kB)
Collecting portalocker (from deepeval)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=5.4.0 (from deepeval)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting pyfiglet (from deepeval)
  Downloading pyfiglet-1.0.4-py3-none-any.whl.metadata (7.4 kB)
Collecting pytest-asyncio (from deepeval)
  Downloading pytest_asyncio-1.3.0-py3-none-any.whl.metadata (4.1 kB)
Colle

In [2]:
import asyncio
import json
import os

from deepeval.metrics import GEval
from deepeval.models import DeepEvalBaseLLM
from deepeval.models import GPTModel
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from openai import OpenAI

In [3]:
# The secret is read from the environment so it is never stored in the notebook file.
# API_KEY is None when VSEGPT_API_KEY is not set.
# NOTE(review): the key that used to be hardcoded here was committed in plain text —
# treat it as leaked and revoke/rotate it.
API_KEY = os.environ.get("VSEGPT_API_KEY")
# Same host the OpenAI client in this notebook is created with; the previous value
# pointed at a different domain (".ai") than the one actually being called.
API_URL = "https://api.vsegpt.ru/v1"
MODEL = "gpt-5-mini"

In [4]:
# Shared OpenAI client pointed at the VseGPT endpoint via base_url.
# The key comes from the environment instead of being hardcoded; a missing
# VSEGPT_API_KEY raises KeyError right here rather than on the first request.
client = OpenAI(
    api_key=os.environ["VSEGPT_API_KEY"],  # your VseGPT key, issued after registration
    base_url="https://api.vsegpt.ru/v1",
)

def call_vsegpt(prompt: str, model: str):
    """Send ``prompt`` as a single user message to ``model`` and return the reply text.

    Uses the module-level ``client``. Exactly one completion is requested
    (``n=1``) with temperature 0.7 and at most 3000 completion tokens.

    Args:
        prompt: Text placed in the only (user) chat message.
        model: Model identifier forwarded to the chat-completions call.

    Returns:
        The ``content`` of the first choice's message.
    """
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        n=1,
        temperature=0.7,
        max_tokens=3000,
        # Extra HTTP header sent with every request.
        extra_headers={"Topalova": "g_eval"},
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [5]:
class VseGPTModel(DeepEvalBaseLLM):
    """Custom deepeval model that forwards prompts to the API through ``call_vsegpt``.

    Args:
        model_name: Model identifier sent with every request.
    """

    def __init__(self, model_name="openai/gpt-4o-mini"):
        self.model_name = model_name

    def load_model(self):
        """No-op: the model is reached over an API, so there is nothing to load."""
        pass

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the configured model and return the reply text."""
        return call_vsegpt(prompt, model=self.model_name)

    def get_model_name(self):
        """Return the identifier of the model that is actually queried."""
        # Previously returned a hardcoded, unrelated name, which mislabelled the judge.
        return self.model_name

    async def a_generate(self, prompt: str, **kwargs) -> str:
        """Async version of ``generate`` (per the original note, the one deepeval calls).

        The blocking ``generate`` call runs in the default executor so the event
        loop is not blocked. Extra keyword arguments are accepted and ignored.
        """
        # get_running_loop() is the correct call from inside a coroutine;
        # get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.generate, prompt)

    @staticmethod
    def _parse_verdict(text):
        """Turn the model reply into a dict that has ``score`` and ``reason`` keys.

        The whole reply is tried as JSON first. If that fails, the span from the
        first ``{`` to the last ``}`` is tried, which covers replies wrapped in
        Markdown fences or surrounded by prose. Anything that is not a JSON
        object with both keys falls back to ``{"score": 0.0, "reason": <reply>}``.
        """
        if not isinstance(text, str):
            # e.g. a None reply: json.loads would raise TypeError on it.
            return {"score": 0.0, "reason": ""}

        candidates = [text]
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            candidates.append(text[start:end + 1])

        for candidate in candidates:
            try:
                data = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            # Must be a JSON object: a bare number would make the `in` check raise,
            # and a bare string would match keys by substring.
            if isinstance(data, dict) and "score" in data and "reason" in data:
                return data

        return {"score": 0.0, "reason": text}

    async def a_generate_raw_response(self, prompt: str, **kwargs):
        """Return a minimal chat-completion-shaped object plus a cost of 0.0.

        Per the original author's note, deepeval calls this method inside GEval
        and it is required. The returned object exposes
        ``.choices[0].message.content`` holding a JSON string with at least
        ``score`` and ``reason``.

        Returns:
            Tuple ``(response, 0.0)``.
            NOTE(review): the second element is presumably the call cost expected
            by deepeval; none is tracked here — confirm against the library.
        """
        text = await self.a_generate(prompt)
        json_text = json.dumps(self._parse_verdict(text))

        # Fake response for GEval
        class FakeChoice:
            def __init__(self, content):
                self.message = type("obj", (), {"content": content})

        class FakeResponse:
            def __init__(self, text):
                self.choices = [FakeChoice(text)]

        return FakeResponse(json_text), 0.0



In [6]:
# Judge model used by the metric below.
vsegpt_judge = VseGPTModel(model_name="openai/gpt-4o-mini")

# Create the "SyntheticFeasibility" metric
synthetic_eval = GEval(
    name="SyntheticFeasibility",
    criteria="Assess the synthetic feasibility, stability, approximate number of synthesis steps, rough cost and patent risk of a molecule given as SMILES from the perspective of a medicinal/ synthetic chemist.",
    # detailed steps the LLM should follow to reach its verdict
    evaluation_steps=[
        "Validate the SMILES: report if it's chemically invalid.",
        "Identify chemically unstable, highly reactive or protecting-group-requiring functional groups (e.g., peroxides, acyl azides, isocyanates, unstable hemiacetals).",
        "Assess whether the scaffold is common in medicinal chemistry (familiar scaffolds are easier).",
        "Estimate the approximate number of synthetic steps required from simple commercially available building blocks (very rough integer).",
        "Give a succinct judgment about likely cost/complexity: Low/Medium/High and short justification.",
        "Assess patent risk at high-level (Low/Medium/High) by checking for obvious matches to known privileged scaffolds (no external database calls — high-level judgement).",
        "Provide an overall synthetic-feasibility score in [0.0,1.0], where 1.0 = trivially synthesizable, 0.0 = essentially infeasible/unstable.",
        "Return the answer strictly in JSON following the specified schema; keep text concise."
    ],
    # Only the test-case input and actual output are handed to the metric;
    # the expected output is currently disabled.
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT
        # LLMTestCaseParams.EXPECTED_OUTPUT
    ],
    model = vsegpt_judge,
    strict_mode=False,
    verbose_mode=True
)

# Example: list of molecules (SMILES strings) to score
smiles_list = ["C1=CC2=C(C=C1)C(=O)C(=O)N2", "CC(=O)O", "C[Si](C)(C)C"]

# Score each molecule independently: build the instruction prompt for it,
# wrap it in a test case, run the metric and print the result.
for smi in smiles_list:
    # The doubled braces below are f-string escapes that produce literal { } in the JSON schema.
    prompt = f"""
You are an expert medicinal/synthetic chemist with decades of practical experience in multi-step synthesis and process chemistry.
Your task: evaluate the provided molecule (SMILES) strictly from the standpoint of synthetic feasibility, stability, approximate step count, rough cost and patent risk.

SMILES: {smi}

Use the following checklist when evaluating:
- Validate SMILES (is it chemically valid?)
- Identify any unstable/highly reactive functional groups that would complicate synthesis or storage
- Judge whether the scaffold is a common medicinal chemistry scaffold or a rare/challenging one
- Give a conservative estimate of the number of synthetic steps from simple commercial building blocks (integer)
- Provide a rough cost/complexity estimate: Low / Medium / High
- Provide a high-level patent risk assessment (Low/Medium/High) — do not invent patents, just note obvious privileged scaffolds if present
- Output overall synthetic feasibility score between 0.0 and 1.0 (1.0 = trivial synthesis, 0.0 = infeasible/unstable)
- Keep reasoning concise and factual, avoid hallucinated database claims

Respond STRICTLY in JSON, with this exact schema (no extra fields):

{{
  "score": <float between 0.0 and 1.0>,
  "validity": <true/false>,
  "unstable_groups": [ "<group1>", "<group2>", ... ],
  "estimated_steps": <integer>,
  "cost_estimate": "<Low|Medium|High>",
  "patent_risk": "<Low|Medium|High>",
  "reason": "<one or two concise sentences>"
}}

Do not output anything else. If SMILES is invalid, set "validity": false, "score": 0.0 and explain briefly in "reason".
"""
    # The full instruction prompt is the test-case input; the SMILES string itself
    # is passed as the actual output being judged.
    test_case = LLMTestCase(
        input=prompt,
        actual_output=smi,
        #expected_output="User requested molecules targeting DNA gyrase B of E. coli."
    )

    score = synthetic_eval.measure(test_case)  # returns just a number

    print(f"{smi}: {score:.2f}")

Output()

Output()

C1=CC2=C(C=C1)C(=O)C(=O)N2: 0.80


Output()

CC(=O)O: 1.00


C[Si](C)(C)C: 1.00
