In [1]:
import time

# Start the timer
start_time = time.time()


import modin.pandas as pd
from collections import defaultdict
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.models.ollama import OllamaModel
from typing import Dict
from typing import Optional
import uuid
import random
import asyncio
from tqdm.notebook import tqdm
import re

# Define the Ollama model running on Ollama
ollama_model = OllamaModel(
    model_name="llama3.2",  # Replace with your preferred model  Could be 'mistrel:7b', 'granite3.1-dense:latest', 'llama3.2', gemma2
    base_url="http://ollama:11434/v1/"  # Ollama's default base URL
)


MIND_type = 'MINDsmall'

data_path_base="/app/datasets/"
data_path = data_path_base + MIND_type +"/"


#behaviors_file = data_path + "train/behaviors.tsv"
#print(f"Behaviors File {behaviors_file}")

news_file = data_path + "train/news.tsv"
news_df = pd.read_csv(news_file, sep="\t", names=["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"])
#print(f"News file {news_file}")
# Load the behaviors data
columns = ["impression_id", "user_id", "time", "history", "impressions"]
#behaviors_df = pd.read_csv(behaviors_file, sep="\t", names=columns)

def print_elapsed_time(start_time):
    """
    Print the elapsed time since `start_time` in hours, minutes, and seconds.
    
    Args:
        start_time (float): The starting time, typically obtained from time.time().
    """
    elapsed_time = time.time() - start_time
    hours, remainder = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"Elapsed Time: {int(hours)}h {int(minutes)}m {int(seconds)}s")



2024-12-24 21:00:47,454	INFO worker.py:1821 -- Started a local Ray instance.


## Define the Analyst Profile

This is the data structure that will be returned

In [2]:
class AnalystProfile(BaseModel):
    """
    This is the structure the LLM will return.
    """
    user_id: str = Field(description = 'A random unique identifier consisting of 8 alphanumeric characters, e.g., "a1b2c3d4"')
    age: int = Field(description = 'Age of the analyst', ge=25, le = 65)
    gender: str = Field(description= 'Gender: Male, Female, or Non-binary. With a distribution identicial to the real world population')
    primary_news_interest: str = Field(description= 'Primary catagory of news Interest')
    secondary_news_interest: str = Field(description= 'Secondary catagory of news Interest')
    job: str = Field(description= 'Job title  e.g. Technology Analyst')
    description: str = Field(description='The background of the analyst in their field of expertise')

    def __str__(self):
        return (
            f"AnalystProfile:\n"
            f"  User ID: {self.user_id}\n"
            f"  Age: {self.age}\n"
            f"  Gender: {self.gender}\n"
            f"  Primary News Interest: {self.primary_news_interest}\n"
            f"  Secondary News Interest: {self.secondary_news_interest}\n"
            f"  Job: {self.job}\n"
            f"  Description: {self.description}\n"
        )
    
    def __repr__(self):
        return self.__str__()


# Create the agent
agent = Agent(model=ollama_model, result_type=AnalystProfile, retries=5)

## Analysts Area's of expertise.
These are a list of areas that an alayst might specialize in.  These can be adjusted to create synthetic users.  

In [3]:
ANALYST_AREAS = [
    "Global Politics",
    "Economics and Markets",
    "Science and Innovation",
    "Health and Medicine",
    "Climate and Environment",
    "Sports and Entertainment",
    "Technology and Startups",
    "Crime and Legal Affairs",
    "Business and Corporate Affairs",
    "Culture and Society",
    "Education and Academia",
    "Infrastructure and Urban Development",
    "Energy and Sustainability",
    "Defense and Security",
    "Art and Design",
    "Food and Agriculture",
    "Travel and Tourism",
    "Religion and Philosophy",
    "Consumer Trends and Retail",
    "Space Exploration",
    "Artificial Intelligence and Machine Learning",
    "Blockchain and Cryptocurrency",
    "Mental Health and Wellness",
    "Social Media and Digital Culture",
    "Activism and Social Justice",
    "Emerging Technologies",
    "Crisis Response"
]


## Define a Method to generate the synthetic analysts

In [4]:
async def generate_synthetic_analyst_with_llm(analyst_area: str) -> Optional[dict]:
    """
    Generate a synthetic analyst profile for a given area of expertise using LLM.

    Args:
        analyst_area (str): The primary area of expertise for the synthetic analyst.
    
    Returns:
        dict: A dictionary representing the synthetic analyst profile.
    """
    prompt = f"""
    Create a synthetic analyst profile. Ensure the description does not include any names or personal identifiers. Include: Include:
    - name:  Generate a unique user ID with a UUID string that is 8 characters long.
    - age: An integer between 25 and 65.
    - gender: Choose Male (50%), Female (45%), or Non-binary (5%).
    - primary_news_interest: '{analyst_area}'.
    - secondary_news_interest: A secondary area of interest related to '{analyst_area}'.
    - job: A realistic job title.
    - description: Write a detailed paragraph about the analyst's professional background, work habits, and how they consume news. Do not include names, personal identifiers, or any references to specific people.

    Format the response as valid JSON matching this structure:
    {{
        "name": "{str(uuid.uuid4())[:8]}",
        "age": int,
        "gender": "string",
        "primary_news_interest": "string",
        "secondary_news_interest": "string",
        "job": "string",
        "description": "string"
    }}
    """

    try:
        result = await agent.run(prompt)
        return result.data
    except Exception as e:
        print(f"Error generating profile for '{analyst_area}': {e}")
        return None


In [5]:
result = await generate_synthetic_analyst_with_llm(ANALYST_AREAS[0])
print_elapsed_time(start_time)

Elapsed Time: 0h 0m 10s


In [6]:
print(result)

AnalystProfile:
  User ID: 4fb9fbf3
  Age: 37
  Gender: Male
  Primary News Interest: Global Politics
  Secondary News Interest: International Relations and Global Governance
  Job: Data Analyst in the Financial Sector
  Description: It was noted that data analysts in various sectors, when not glued to spreadsheets, tend to delve into current events. The majority seem drawn to global politics due to its far-reaching implications on their line of work. In a quiet corner with their computer and newsfeed refreshed, they spend their time piecing together metrics.

Their favorite method of news consumption is through in-depth analysis websites, which not only cover broad topics but also allow for more specialized interests. As a result, their job title may reflect the intersection of data analysis and that specific interest area.



In [7]:
result2 = await generate_synthetic_analyst_with_llm(ANALYST_AREAS[1])
print_elapsed_time(start_time)

Elapsed Time: 0h 0m 12s


In [8]:
print(result2)

AnalystProfile:
  User ID: c54f6d7e
  Age: 47
  Gender: Male
  Primary News Interest: Economics and Markets
  Secondary News Interest: Monetary policy and interest rate analysis
  Job: Portfolio Manager
  Description: The analyst has a background in economics, with a specialization in finance and trading. They prefer to stay up-to-date on market trends and news through a combination of traditional sources and online platforms. In their free time, they enjoy reading books on economic theory and attending industry events. As a portfolio manager, the analyst is responsible for making data-driven investment decisions, analyzing financial statements, and developing strategies to minimize risk and maximize returns. They are highly detail-oriented and enjoy working with large datasets. The analyst also prioritizes staying current with emerging trends in monetary policy and interest rate analysis, recognizing its critical impact on global markets.



In [9]:


async def gen_analysts_profiles(num_profiles: int, analyst_areas: list, max_retries: int = 3) -> pd.DataFrame:
    """
    Generate synthetic analyst profiles with retry logic for errors.

    Args:
        num_profiles (int): Total number of profiles to generate.
        analyst_areas (list): List of primary news interests for analysts.
        max_retries (int): Maximum number of retries for a failed generation.

    Returns:
        pd.DataFrame: DataFrame containing all generated profiles.
    """
    profiles = []
    with tqdm(total=num_profiles, desc="Generating Synthetic Analysts") as pbar:
        for i in range(num_profiles):
            area = analyst_areas[i % len(analyst_areas)]
            retries = 0
            while retries <= max_retries:
                try:
                    # Attempt to generate the profile
                    profile = await generate_synthetic_analyst_with_llm(area)
                    if profile:
                        profiles.append(profile.model_dump())
                        break  # Exit retry loop on success
                except Exception as e:
                    retries += 1
                    if retries > max_retries:
                        print(f"Failed to generate profile for '{area}' after {max_retries} retries: {e}")
                        break
            pbar.update(1)
    df = pd.DataFrame(profiles)
    return df


In [10]:

df = await gen_analysts_profiles(1000,ANALYST_AREAS)
print_elapsed_time(start_time)

Generating Synthetic Analysts:   0%|          | 0/1000 [00:00<?, ?it/s]

Error generating profile for 'Travel and Tourism': Exceeded maximum retries (5) for result validation
Error generating profile for 'Religion and Philosophy': Exceeded maximum retries (5) for result validation
Error generating profile for 'Blockchain and Cryptocurrency': Exceeded maximum retries (5) for result validation
Error generating profile for 'Technology and Startups': Exceeded maximum retries (5) for result validation
Error generating profile for 'Technology and Startups': Exceeded maximum retries (5) for result validation
Error generating profile for 'Activism and Social Justice': Exceeded maximum retries (5) for result validation
Error generating profile for 'Space Exploration': Exceeded maximum retries (5) for result validation
Error generating profile for 'Activism and Social Justice': Exceeded maximum retries (5) for result validation
Error generating profile for 'Defense and Security': Exceeded maximum retries (5) for result validation
Error generating profile for 'Busines

In [11]:
display(df.head())
display(df.tail())

Unnamed: 0,user_id,age,gender,primary_news_interest,secondary_news_interest,job,description
0,"{""type"":""string"",""value"":""cbb3b8da-7ea1-4e11-b...",42,Male,Global Politics,International Relations,Global Economist,The analyst is experienced in understanding th...
1,8e9de907,35,Male,Economics and Markets,Financial Risk Management,Quantitative Analyst,This analyst has a strong foundation in econom...
2,6f9e5c71-2dbb-4a77-a5d1-e35cd9ea7a59,42,Male,Science and Innovation,Advances in AI and machine learning,Data Analyst,"As a seasoned data analyst, our professional e..."
3,da6e263e,53,Male,Health and Medicine,Medical Research and Innovation,Biotechnology Analyst,A seasoned biotechnologist with over a decade ...
4,3d973a88,37,Male,Climate and Environment,Renewable Energy,Sustainability Analyst,The sustainability analyst at a leading green ...


Unnamed: 0,user_id,age,gender,primary_news_interest,secondary_news_interest,job,description
995,3ba57866-4d24-41f8-bb15-e7eaeb0ac2c6,47,Male,Social Media and Digital Culture,Data Analysis and Insights,Quantitative Analyst,A seasoned professional with a strong backgrou...
996,e4138ec7-6d11-49ab-b1af-d3ff8f74da22,41,Male,Activism and Social Justice,Human Rights and Economic Equity,Business Analyst - Corporate Responsibility Pr...,This analyst possesses a detailed understandin...
997,29a067aa,51,Female,Emerging Technologies,Cybersecurity Trends,Senior Technology Analyst,As a seasoned technologist with over a decade ...
998,537bcc51,41,Male,Crisis Response,Disaster Management,Senior Technology Analyst,A seasoned expert with extensive experience in...
999,3de18f34-4d6b-44aa-a5e2-c87db8c3a0fa,51,Male,Global Politics,International Relations,Economic Analyst: International Market Specialist,As a seasoned economic analyst with extensive ...


In [12]:
len(df)

1000

In [13]:
df.to_csv(data_path_base + "synthetic_analysts.csv")