In [4]:
import pandas as pd
import requests
import time

In [56]:
import pandas as pd
from collections import defaultdict

def _has_value(value):
    """Return True when ``value`` carries real data.

    Missing markers (None, NaN, pd.NA, NaT) and the empty string count as
    absent. Unlike a plain truthiness test, NaN is rejected (NaN is truthy in
    Python) and a numeric 0 is accepted as a legitimate measurement.
    """
    if value is None:
        return False
    if isinstance(value, str):
        return value != ''
    return bool(pd.notna(value))


def _join_present(*parts):
    """Join the parts that carry data with single spaces, skipping missing ones."""
    return " ".join(str(part) for part in parts if _has_value(part))


def _render_experiment(index, exp):
    """Return the text lines describing one collected experiment record."""
    lines = [
        f"Experiment {index}:",
        f"  Description: {exp['assay_description']}",
        f"  Organism: {exp['organism']}",
    ]
    # Tissue is optional; a NaN cell must not be rendered as "Tissue: nan".
    if _has_value(exp['tissue']):
        lines.append(f"  Tissue: {exp['tissue']}")
    lines.append(f"  Assay Type: {exp['assay_type']} ({exp['bao_format']})")

    tox_data = exp.get('hepatotoxicity')
    if tox_data:
        tox_type = tox_data['type']

        if tox_type == 'quantitative':
            lines.append("  Quantitative Data:")
            lines.append(f"    Measurement Type: {tox_data['standard_type']}")
            # Relation and units may be missing; only the value is guaranteed here.
            value_text = _join_present(
                tox_data['standard_relation'],
                tox_data['standard_value'],
                tox_data['standard_units'],
            )
            lines.append(f"    Value: {value_text}")
            if _has_value(tox_data['pchembl_value']):
                lines.append(f"    pChEMBL: {tox_data['pchembl_value']}")

        elif tox_type == 'qualitative':
            lines.append("  Qualitative Data:")
            lines.append(f"    Comment: {tox_data['comment']}")
            if _has_value(tox_data['standard_text_value']):
                lines.append(f"    Text Value: {tox_data['standard_text_value']}")
            if _has_value(tox_data['standard_value']):
                value_text = _join_present(
                    tox_data['standard_relation'],
                    tox_data['standard_value'],
                    tox_data.get('standard_units', ''),
                )
                lines.append(f"    Value: {value_text}")

        elif tox_type == 'context_only':
            lines.append("  Contextual Data:")
            lines.append(f"    Note: {tox_data['note']}")

        # Only quantitative records store 'data_comment'; .get covers the rest.
        if _has_value(tox_data.get('data_comment')):
            lines.append(f"    Data Comment: {tox_data['data_comment']}")

    # Blank line separates consecutive experiments.
    lines.append("")
    return lines


def preprocess_hepatotoxicity_data(df):
    """
    Convert a hepatotoxicity activity table into structured text, one string
    per molecule.
    Version 2: Combines experiment context with hepatotoxicity measurements in a single section.

    Rows are grouped by 'Molecule ChEMBL ID'. Molecule properties are taken
    from the first row seen for each ID. A row contributes an experiment when
    it has a quantitative measurement ('Standard Value' and 'Standard Type'
    both present), otherwise when it has a 'Comment', otherwise when its
    'Assay Description' contains the word 'Hepatotoxicity' (case-sensitive).
    Rows matching none of these are ignored.

    Optional fields (tissue, pChEMBL value, text value, relation, units, data
    validity comment) are emitted only when they hold real data; NaN/None/empty
    cells are skipped instead of being printed as "nan".

    Args:
        df (pandas.DataFrame): Activity table. Columns read with a hard lookup
            (KeyError if absent): 'Molecule ChEMBL ID', 'Molecule Name',
            'Molecule Max Phase', 'Molecular Weight', '#RO5 Violations',
            'AlogP', 'Smiles', 'Assay ChEMBL ID', 'Assay Description',
            'Assay Type', 'Assay Organism', 'BAO Label', 'Standard Value',
            'Standard Type', and (for quantitative rows) 'Standard Relation'
            and 'Standard Units'. Optional columns: 'Assay Tissue Name',
            'pChEMBL Value', 'Data Validity Comment', 'Comment',
            'Standard Text Value'.

    Returns:
        list: List of strings, where each string is a preprocessed description
        of one molecule, in order of first appearance in ``df``. An empty
        frame yields an empty list.
    """

    # Dictionary for grouping data by molecule ChEMBL ID
    molecules_dict = defaultdict(lambda: {
        'properties': {},
        'experiments': []  # Complete experiment descriptions
    })

    # Iterate through all dataframe rows
    for _, row in df.iterrows():
        chembl_id = row['Molecule ChEMBL ID']
        mol_data = molecules_dict[chembl_id]

        # Fill molecule properties (only once per molecule)
        if not mol_data['properties']:
            mol_data['properties'] = {
                'Molecule Name': row['Molecule Name'],
                'Molecule Max Phase': row['Molecule Max Phase'],
                'Molecular Weight': row['Molecular Weight'],
                '#RO5 Violations': row['#RO5 Violations'],
                'AlogP': row['AlogP'],
                'Smiles': row['Smiles']
            }

        # Create complete experiment description
        experiment_info = {
            'assay_id': row['Assay ChEMBL ID'],
            'assay_description': row['Assay Description'],
            'assay_type': row['Assay Type'],
            'organism': row['Assay Organism'],
            'tissue': row.get('Assay Tissue Name', ''),
            'bao_format': row['BAO Label']
        }

        comment = row.get('Comment')
        hepatotoxicity_data = {}

        # Quantitative measurements
        if pd.notna(row['Standard Value']) and pd.notna(row['Standard Type']):
            hepatotoxicity_data = {
                'type': 'quantitative',
                'standard_type': row['Standard Type'],
                'standard_relation': row['Standard Relation'],
                'standard_value': row['Standard Value'],
                'standard_units': row['Standard Units'],
                'pchembl_value': row.get('pChEMBL Value', ''),
                'data_comment': row.get('Data Validity Comment', '')
            }
        # Text descriptions and comments
        elif pd.notna(comment):
            hepatotoxicity_data = {
                'type': 'qualitative',
                'comment': comment,
                'standard_text_value': row.get('Standard Text Value', ''),
                'standard_relation': row.get('Standard Relation', ''),
                'standard_value': row.get('Standard Value', ''),
                'standard_units': row.get('Standard Units', '')
            }
        # The comment is known to be missing at this point, so only the assay
        # description can still mention hepatotoxicity.
        elif 'Hepatotoxicity' in str(row['Assay Description']):
            hepatotoxicity_data = {
                'type': 'context_only',
                'note': 'Hepatotoxicity mentioned in experiment context'
            }

        # Combine experiment info with hepatotoxicity data
        if hepatotoxicity_data:
            experiment_info['hepatotoxicity'] = hepatotoxicity_data
            mol_data['experiments'].append(experiment_info)

    # Generate text representation for each molecule
    processed_data = []

    for chembl_id, data in molecules_dict.items():
        props = data['properties']

        # Section 1: Basic molecule properties
        text_representation = [
            f"MOLECULE: {chembl_id}",
            f"Name: {props['Molecule Name']}",
            f"Max Phase: {props['Molecule Max Phase']}",
            f"Molecular Weight: {props['Molecular Weight']}",
            f"RO5 Violations: {props['#RO5 Violations']}",
            f"AlogP: {props['AlogP']}",
            f"SMILES: {props['Smiles']}",
            ""
        ]

        # Section 2: Experiments with hepatotoxicity data
        if data['experiments']:
            text_representation.append("EXPERIMENTS AND HEPATOTOXICITY MEASUREMENTS:")
            for i, exp in enumerate(data['experiments'], 1):
                text_representation.extend(_render_experiment(i, exp))
        else:
            text_representation.append("EXPERIMENTS: No hepatotoxicity experiment data available.")
            text_representation.append("")

        processed_data.append("\n".join(text_representation))

    return processed_data

In [60]:
# Example usage
if __name__ == "__main__":
    # preprocess_hepatotoxicity_data takes a DataFrame, not a file path.
    # `hepato_df` is not defined in this cell; it must already exist in the
    # kernel namespace (presumably a ChEMBL activity export - TODO confirm).
    # Alternative call restricted to a single molecule:
    # processed_molecules = preprocess_hepatotoxicity_data(hepato_df[hepato_df["Molecule ChEMBL ID"] == "CHEMBL95"])
    # Only the first rows returned by .head() are processed here.
    processed_molecules = preprocess_hepatotoxicity_data(hepato_df.head())

### 