# Generate Synthetic Data



In [2]:
import os, sys 

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import random
import asyncio
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from datetime import datetime
from typing import Dict, Optional, List

# Imports for Pydantic AI
from pydantic_ai import Agent
from pydantic import BaseModel, Field, field_validator
from pydantic_ai.models.ollama import OllamaModel

from environment.LLM.analyst_behavior import AnalystBehavior, AnalystBehaviorSimulator

# Depending on if you are working in the container or not.  
# data_path_base = "/home/asheller/cicero/datasets/" Would be outside the container
#data_path_base = "/app/datasets/"

# It is currently set to
data_path_base = "/app/datasets/"

model_name = "mistral:7b"  # Replace with your preferred model
#model_name = "llama3.2"  # Replace with your preferred model
#model_name = "deepseek-r1:7b"  # Replace with your preferred model

base_url = "http://olloma:11434/v1/"  # Ollama's default base URL

In [3]:
async def main(total_iterations = None):
    try:
        MIND_type = "MINDsmall"
        data_path = data_path_base + MIND_type + "/"
        history_file = data_path + "history.tsv"
        impressions_file = data_path + "impressions.tsv"
        behaviors_file = data_path + "train/behaviors.tsv"
        analysts_file = data_path_base + "synthetic_analysts.csv"
        news_file = data_path + "train/news.tsv"
        uid_to_names = data_path + "uid_to_name.tsv"
        news_behaviors_file = data_path + "new_behaviors.tsv"

        # Initialize the behavior simulator
        behavior_simulator = AnalystBehaviorSimulator(
            history_file=history_file,
            analysts_file=analysts_file,
            behaviors_file=behaviors_file,
            news_file=news_file,
            uid_to_names=uid_to_names
        )

        # Connect to Ollama server and create the agent
        behavior_simulator.connect_ollama(ollama_url=base_url, model_name=model_name)
        behavior_simulator.create_agent()

        behaviors = []
        if not total_iterations:
            total_iterations = len(behavior_simulator.uid_to_names) * 3
        with tqdm(total=total_iterations, desc="Processing analysts", unit="iteration") as pbar:
            for _ in range(total_iterations):
                try:
                    # Step 1: Pick a random analyst
                    analyst_uid = behavior_simulator.pick_random_user()

                    # Step 2: Select news articles for the analyst to review
                    session_articles = behavior_simulator.pick_random_news(num_articles=random.choice([1, 2, 3, 4, 5]))

                    # Step 3: Get the next impression ID
                    impression_id = behavior_simulator.current_impression_id

                    # Step 4: Generate behavior for each article with retry logic
                    impressions_this_session = []
                    history_this_session = []
                    for article in session_articles.to_dict(orient='records'):
                        retry_attempts = 0
                        max_retries = 5
                        success = False

                        while not success and retry_attempts < max_retries:
                            try:
                                result = await behavior_simulator.behavior_as_the_analyst(analyst_uid, impression_id, article)

                                # Validate the result
                                if result and hasattr(result, 'news_id') and hasattr(result, 'clicked'):
                                    item = result.news_id + '-' + result.clicked
                                    if int(result.clicked):
                                        history_this_session.append(result.news_id)
                                    impressions_this_session.append(item)
                                    success = True
                                else:
                                    raise ValueError("Incomplete or invalid result returned by the agent.")
                            except Exception as e:
                                retry_attempts += 1
                                #print(f"Error retrieving behavior for article {article['news_id']}, attempt {retry_attempts}: {e}")

                        if not success:
                            print(f"Failed to retrieve behavior for article {article['news_id']} after {max_retries} attempts. Skipping.")

                    # Skip if no valid impressions were generated
                    if not impressions_this_session:
                        print(f"No valid impressions for analyst {analyst_uid}, skipping.")
                        continue

                    # Step 5: Create a behavior record
                    current_timestamp = datetime.now().strftime("%m/%d/%Y %I:%M:%S %p")
                    prior_history = behavior_simulator.update_history(analyst_uid,history_this_session)
                    behavior = {
                        'impression_id': int(impression_id),
                        'user_id': analyst_uid,
                        'time': current_timestamp,
                        'history': prior_history,
                        'impressions': impressions_this_session
                    }
                    behaviors.append(behavior)
                    

                except Exception as e:
                    print(f"Error processing analyst behavior for UID {analyst_uid}: {e}")

                # Update the progress bar
                pbar.update(1)

                if len(behaviors) % 5 == 0 or len(behaviors) >= total_iterations:
                    # Define the file path for the CSV
                    new_behaviors_file = data_path + 'analyst_behavior.csv'

                    # Check if the file exists
                    if os.path.exists(new_behaviors_file):
                        try:
                            # Check if the file is not empty
                            if os.path.getsize(new_behaviors_file) > 0:
                                # Load the existing CSV into a DataFrame
                                existing_behaviors = pd.read_csv(new_behaviors_file)
                                #print(f"Loaded existing behaviors from {new_behaviors_file}")
                            else:
                                # File is empty, create an empty DataFrame
                                #print(f"{new_behaviors_file} is empty. Initializing a new DataFrame.")
                                existing_behaviors = pd.DataFrame()
                        except Exception as e:
                            print(f"Error reading {new_behaviors_file}: {e}")
                            existing_behaviors = pd.DataFrame()  # Create an empty DataFrame if reading fails
                    else:
                        # Create an empty DataFrame if the file doesn't exist
                        #print(f"{new_behaviors_file} does not exist. Creating a new one.")
                        existing_behaviors = pd.DataFrame()

                    # Convert current behaviors to a DataFrame
                    new_behaviors = pd.DataFrame(behaviors)

                    # Combine existing and new behaviors
                    combined_behaviors = behavior_simulator.format_behaviors(pd.concat([existing_behaviors, new_behaviors], ignore_index=True))

                    # Save the combined DataFrame back to the CSV
                    try:
                        combined_behaviors.to_csv(
                            news_behaviors_file,
                            sep="\t", 
                            index=False,
                            header=False,
                            quoting=3  # Ensure no unexpected quotes
                        )

                        # TODO Double check this is working well
                        behavior_simulator.save_history_to_tsv()
                        
                    except Exception as e:
                        print(f"Error saving to {new_behaviors_file}: {e}")

                    # Reset the behaviors list
                    behaviors = []
 
 
        print("Processing complete")

    except Exception as e:
        print(f"Error in main execution: {e}")


In [6]:
if __name__ == '__main__':
    # With no arguments it will run for 3x the number of analysts.
    await main()

/app/datasets/MINDsmall/history.tsv does not exist. Creating a new one.
Connected to Ollama model: mistral:7b
Agent successfully created.


Processing analysts:   0% 0/6000 [00:00<?, ?iteration/s]

Failed to retrieve behavior for article N6263 after 5 attempts. Skipping.
No valid impressions for analyst A6582, skipping.


Processing analysts:   0% 0/6000 [00:29<?, ?iteration/s]


CancelledError: 