In [None]:
!pip install nest-asyncio aiohttp

import nest_asyncio
nest_asyncio.apply()

In [None]:
import asyncio
import json
import os
import random
import time
from typing import Any, Dict, List, Optional, Tuple

import aiohttp

In [None]:

def preprocess_chembl_data(df) -> List[Dict[str, Any]]:
    """Convert ChEMBL activity rows into ``{"name", "smiles", "text"}`` records.

    For every row of ``df`` the non-missing columns are rendered as
    ``"<label>: <value>"`` fragments, grouped into titled sections. Fragments
    within a section are joined with ``"; "`` and sections with ``". "``.

    Args:
        df: DataFrame whose columns use the ChEMBL export names referenced
            below. Absent columns and missing values are simply skipped.

    Returns:
        One dict per row that produced a non-blank text. ``name`` falls back to
        ``"Compound_<ChEMBL ID>"`` when the molecule name is missing, and both
        ``name`` and ``smiles`` default to an empty string.
    """
    # (section heading, ((source column, label in the text), ...)), listed in
    # the exact order in which they are written to the text.
    sections = (
        ("Molecular properties", (
            ("Molecule ChEMBL ID", "ChEMBL ID"),
            ("Molecular Weight", "Molecular Weight"),
            ("#RO5 Violations", "RO5 Violations"),
            ("AlogP", "AlogP"),
        )),
        ("Experimental details", (
            ("Assay Description", "Assay"),
            ("Assay Type", "Type"),
            ("Assay Organism", "Organism"),
            ("Assay Tissue Name", "Tissue"),
        )),
        ("Activity/toxicity data", (
            ("Standard Type", "Measurement type"),
            ("Standard Relation", "Relation"),
            ("Standard Value", "Value"),
            ("Standard Units", "Units"),
            ("pChEMBL Value", "pChEMBL"),
            ("Standard Text Value", "Text value"),
            ("Value", "Additional value"),
            # Validity/comment/action fields are reported in the same section,
            # after the measurement fields.
            ("Data Validity Comment", "Validity"),
            ("Comment", "Comment"),
            ("Action Type", "Action"),
        )),
        ("Target information", (
            ("Target Name", "Target"),
            ("Target Organism", "Target organism"),
            ("Target Type", "Target type"),
        )),
        ("Ligand efficiency", (
            ("Ligand Efficiency BEI", "BEI"),
            ("Ligand Efficiency LE", "LE"),
            ("Ligand Efficiency LLE", "LLE"),
            ("Ligand Efficiency SEI", "SEI"),
        )),
    )

    records = []
    for _, row in df.iterrows():
        molecule_name = row.get('Molecule Name')
        chembl_id = row.get('Molecule ChEMBL ID')
        if pd.notna(molecule_name):
            name = str(molecule_name)
        elif pd.notna(chembl_id):
            name = f"Compound_{chembl_id}"
        else:
            name = ""

        smiles_value = row.get('Smiles')
        smiles = str(smiles_value) if pd.notna(smiles_value) else ""

        chunks = []
        for heading, fields in sections:
            entries = [
                f"{label}: {row[column]}"
                for column, label in fields
                if pd.notna(row.get(column))
            ]
            if entries:
                chunks.append(f"{heading}: " + "; ".join(entries))

        text = ". ".join(chunks)
        # Rows that yield no descriptive text are dropped entirely.
        if text.strip():
            records.append({"name": name, "smiles": smiles, "text": text})
    return records

In [None]:
# --- Configuration ---
# API keys are secrets and must not be stored in the notebook. Supply them via
# an environment variable as a comma-separated list, e.g.
#   MISTRAL_API_KEYS="key1,key2,key3"
# Any key that was previously committed in this file should be treated as
# compromised and rotated.
API_KEYS_ENV_VAR = "MISTRAL_API_KEYS"


def load_api_keys(env_var: str = API_KEYS_ENV_VAR) -> List[str]:
    """Read a comma-separated list of API keys from an environment variable.

    Args:
        env_var: Name of the environment variable to read.

    Returns:
        The keys with surrounding whitespace removed; empty entries are
        dropped. An unset or empty variable yields an empty list.
    """
    raw = os.environ.get(env_var, "")
    return [key.strip() for key in raw.split(",") if key.strip()]


API_KEYS = load_api_keys()
if not API_KEYS:
    print(f"WARNING: no API keys found in the {API_KEYS_ENV_VAR} environment variable.")

MODEL_NAME = "mistral-small-latest"
MAX_RETRIES = 3
BASE_DELAY = 5.0   # seconds; base of the exponential retry back-off
BATCH_PAUSE = 1.0  # seconds to sleep between batches of requests

# --- format prompt ---
def build_prompt(compound_name: str, smiles: str, text: str) -> Tuple[str, str]:
    """Build the system and user prompts for one toxicity-extraction request.

    Args:
        compound_name: Compound name inserted into the user prompt.
        smiles: SMILES string inserted into the user prompt.
        text: Free text from which toxicity metrics should be extracted.

    Returns:
        A ``(system_prompt, user_prompt)`` tuple. The system prompt is fixed;
        only the user prompt depends on the arguments.
    """
    system_prompt = """You are an expert toxicologist. Extract ALL toxicity-related measurements from the text.
CRITICAL: For each metric, SEPARATE the numerical value from the units.
- Extract numerical values as floats when possible
- Put units in separate field
- If no clear numerical value, use descriptive text
Examples:
- "LD50 of 500 mg/kg" → value: 500, units: "mg/kg"
- "IC50 = 250 μM" → value: 250, units: "μM" 
- "albumin level 1.45 ± 0.06 g per 100 mL" → value: 1.45, units: "g/100mL"
- "mild hepatotoxicity" → value: "mild", units: null
Toxicity levels (example for LD50):
- "none": No toxicity data
- "low": High LD50 (>1000 mg/kg), minimal effects
- "moderate": LD50 50-1000 mg/kg, dose-dependent  
- "high": LD50 10-50 mg/kg, significant toxicity
- "severe": LD50 <10 mg/kg, lethal damage
Respond ONLY in valid JSON matching this schema:
{
  "compound_name": "...",
  "smiles": "...",
  "toxicity_level": "...",
  "evidence": "...",
  "toxicity_metrics": { "...": { "value": ..., "units": "...", "description": "..." } },
  "confidence": "..."
}"""
    user_prompt = f"""Compound: {compound_name}
SMILES: {smiles}
Text:
{text}
Extract and structure all toxicity metrics:"""
    return system_prompt, user_prompt

# --- requests to Mistral API ---
async def call_mistral(
    session: aiohttp.ClientSession,
    api_key: str,
    compound_name: str,
    smiles: str,
    text: str,
    retry_count: int = 0
) -> Dict[str, Any]:
    """Send one toxicity-extraction request to the Mistral chat completions API.

    HTTP 429 responses and exceptions (network errors, malformed or
    unparsable responses) are retried with exponential back-off until
    ``MAX_RETRIES`` is reached. Any other non-200 status is returned as an
    error immediately, without retrying.

    Args:
        session: Open aiohttp session used for the POST request.
        api_key: Bearer token placed in the ``Authorization`` header.
        compound_name: Compound name passed to the prompt.
        smiles: SMILES string passed to the prompt.
        text: Source text passed to the prompt.
        retry_count: Number of retries already consumed; callers normally
            leave this at 0.

    Returns:
        The JSON object parsed from the model's reply on success. On failure,
        a dict with an ``"error"`` key plus ``"compound_name"`` and
        ``"smiles"`` so the failed input can be identified in the output,
        and ``"response"`` or ``"exception"`` with details where available.
    """
    system, user = build_prompt(compound_name, smiles, text)
    url = "https://api.mistral.ai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user}
        ],
        "temperature": 0.1,
        "max_tokens": 500,
        "response_format": {"type": "json_object"}
    }

    def failure(message: str, **details: Any) -> Dict[str, Any]:
        # Error records carry the compound identity so that failed inputs can
        # be traced (and re-run) from the output file.
        return {"error": message, "compound_name": compound_name, "smiles": smiles, **details}

    attempt = retry_count
    while True:
        try:
            async with session.post(url, headers=headers, json=payload) as resp:
                status = resp.status
                if status == 200:
                    data = await resp.json()
                    content = data["choices"][0]["message"]["content"]
                    return json.loads(content)
                if status != 429:
                    text_resp = await resp.text()
                    print(f"HTTP {status}: {text_resp}")
                    return failure(f"HTTP {status}", response=text_resp)

            # Rate limited (429). The response context has been exited at this
            # point, so the connection is not held open during the back-off.
            if attempt >= MAX_RETRIES:
                print("Max retries exceeded for 429 error.")
                return failure("429 after max retries")
            # Jitter spreads out the retries of requests that were throttled together.
            wait = BASE_DELAY * (2 ** attempt) + random.uniform(0, 1)
            print(f"429 error. Retrying in {wait:.2f}s (attempt {attempt + 1})")
        except Exception as e:
            print(f"Exception during API call: {e}")
            if attempt >= MAX_RETRIES:
                return failure("Exception after max retries", exception=str(e))
            wait = BASE_DELAY * (2 ** attempt)

        await asyncio.sleep(wait)
        attempt += 1


async def process_batch(
    test_cases: List[Dict[str, str]],
    output_file: str,
    batch_size: Optional[int] = None,
):
    """Send all test cases to the API in concurrent batches and append results to a JSONL file.

    Within a batch, request ``j`` uses ``API_KEYS[j % len(API_KEYS)]``. With
    the default batch size each key is therefore used once per batch.

    Args:
        test_cases: Dicts with ``"name"``, ``"smiles"`` and ``"text"`` keys.
        output_file: Path of the file the results are appended to, one JSON
            document per line, after each batch.
        batch_size: Number of concurrent requests per batch. Defaults to the
            number of configured API keys.

    Raises:
        ValueError: If there is work to do but ``API_KEYS`` is empty, or if
            ``batch_size`` is smaller than 1.
    """
    if not test_cases:
        return
    if not API_KEYS:
        raise ValueError("API_KEYS is empty; at least one API key is required.")
    if batch_size is None:
        batch_size = len(API_KEYS)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    total = len(test_cases)
    async with aiohttp.ClientSession() as session:
        for start in range(0, total, batch_size):
            batch = test_cases[start:start + batch_size]
            batch_number = start // batch_size + 1
            tasks = [
                call_mistral(
                    session,
                    API_KEYS[j % len(API_KEYS)],
                    case["name"],
                    case["smiles"],
                    case["text"]
                )
                for j, case in enumerate(batch)
            ]

            print(f"Sending batch {batch_number} with {len(tasks)} requests...")
            results = await asyncio.gather(*tasks)

            # Append this batch's results to the output file right away so
            # completed work is kept if a later batch fails.
            with open(output_file, 'a', encoding='utf-8') as f:
                for res in results:
                    f.write(json.dumps(res, ensure_ascii=False) + '\n')

            if start + batch_size < total:
                print(f"Batch {batch_number} completed. Pausing for {BATCH_PAUSE}s...")
                await asyncio.sleep(BATCH_PAUSE)
            else:
                # Nothing left to send, so there is no need to pause.
                print(f"Batch {batch_number} completed.")

In [None]:
import pandas as pd

# Load the semicolon-separated ChEMBL export and drop exact duplicate rows.
df = pd.read_csv("input/toxicity/chembl.csv", sep=";").drop_duplicates()

# Only the rows from position 40,000 onward are preprocessed in this run.
test_cases = preprocess_chembl_data(df.iloc[40_000:])

In [None]:
# Run the extraction for all prepared test cases; results are appended to the
# JSONL file below. asyncio.run() is called from a notebook cell here, which
# presumably relies on nest_asyncio.apply() from the first cell because the
# kernel already has a running event loop.
asyncio.run(process_batch(test_cases, "working/chembl_40000-48375.jsonl"))