In [1]:
!pip install python-dotenv strands-agents[mistral] strands-agents-tools tqdm 

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting strands-agents-tools
  Downloading strands_agents_tools-0.2.6-py3-none-any.whl.metadata (43 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.7.1-py3-none-any.whl.metadata (12 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.13.1-py3-none-any.whl.metadata (74 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.57b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading o

In [2]:
!aws s3 cp s3://gdsc25test/ . --recursive --quiet

In [1]:
import json
import os
import boto3
import dotenv
import requests

from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Type, TypeVar
from tqdm import tqdm
from pydantic import BaseModel, Field
from strands.agent import Agent
from strands.models.mistral import MistralModel


T = TypeVar('T')
M = TypeVar('M', bound=BaseModel)

dotenv.load_dotenv("env")

True

# Utilities

In [2]:
def get_agent(system_prompt: str = "") -> Agent:
    model = MistralModel(
        api_key=os.environ["MISTRAL_API_KEY"],
        model_id="mistral-large-latest",
        stream=False
    )
    return Agent(model=model, system_prompt=system_prompt, callback_handler=None)

def load_file_content(path: str | Path) -> str:
    path = Path(path)
    with path.open('r', encoding='utf-8') as file:
        return file.read()

def get_job_paths() -> List[Path]:
    data_dir = Path('./sample/jobs') #Path('./Data/jobs')--------------------------------------------------------------------
    paths = []
    for file in data_dir.iterdir():
        if file.name.startswith('job_') and file.suffix in ('.md'):
            paths.append(file)
    return paths

def get_training_paths() -> List[Path]:
    data_dir = Path('./sample/trainings') #Path('./Data/trainings') -------------------------------------------------------------------------
    paths = []
    for file in data_dir.iterdir():
        if file.name.startswith('tr_') and file.suffix in ('.md',):
            paths.append(file)
    return paths

# Data structures

In [12]:
class JobInfo(BaseModel):
    domain: str = Field(default="", description="Field or industry of the job")
    required_skills: List[str] = Field(default_factory=list, description="List of required skills for the job.")
    location: str = Field(default="", description="Job location.")
    years_of_experience_required: str = Field(default="", description="Years of experience required to get this job.")

class TrainingInfo(BaseModel):
    domain: str = Field(default="", description="Field or industry of the training outcome.")
    skill_acquired_and_level: tuple = Field(default_factory=tuple, description="A skill from the domain and the outcome level of it.")


class PersonaInfo(BaseModel):
    name: str = Field(default="", description="Persona's name")
    skills: List = Field(default_factory=list, description="List of tuples representing skills and their levels.")
    location: str = Field(default="", description="Current location")
    age: str = Field(default=None, description="Age of the persona")
    years_of_experience: str = Field(default="", description="Years of experience in a field.")
    domain: str = Field(default="", description="Field or industry in which the persona wants to work in.")

    def job_fits(self, job_info: JobInfo) -> bool:
        """Check if the persona's skills match the job requirements."""
        agent = get_agent()
        candidate_skills =  ', '.join(f"{skill}: {level}" for skill, level in self.skills)
        job_skills = ', '.join(job_info.required_skills)
        domain = self.domain
        job_domain = job_info.domain
        candidate_experience = self.years_of_experience
        required_experience = job_info.years_of_experience_required
        prompt = f
        """Based on the list of skills of the candidate and the list of skills
        required for a job decide whether the candidate is a good fit.
        Only match them if you are 90% sure that the person's domain: {domain} is same as job's domain: {job_domain}.
        Candidate skills: {candidate_skills}
        Candidate years of experience: {candidate_experience}
        Years of expirience required for a job: {required_experience}
        canidates domaine: {domain}
        job domain: {job_domain}
        Job skills: {job_skills}
        """
        return agent.structured_output(BoolOut, prompt).result

    def training_fits(self, training_info: TrainingInfo) -> bool:
        """Check if the persona could benefit from the training."""
        agent = get_agent()
        persona_skills = ", ".join(f"{skill}: {level}" for skill, level in self.skills) #', '.join(self.skills)
        domain = self.domain
        training_domain = training_info.domain
        prompt = f"""Based on the list of skills of the persona and the domain
        assign only trainings that can only improve the current level of every persona skill.
        (E.g. if a person has skill "Typing" on a basic level you can just assign "Typing" on a intermediate level.)
        Only assign Intermediate level if current is Basic, or Advanced level if current is Intermediate.
        Only match them if you are 90% sure that the person's domain: {domain} is same as training's domain: {training_domain}.
        
        Persona skills: {persona_skills}
        """
        return agent.structured_output(BoolOut, prompt).result


class BoolOut(BaseModel):
    result: bool = Field(default=False, description="whether the condition is fulfilled or not")

# Accessing agents

In [6]:
def send_message_to_chat(message: str, persona_id: str, conversation_id: str = None) -> Optional[Tuple[str, str]]:
    """
    Send a single message to the chat endpoint and return the response.

    Args:
        persona_id: ID of the persona
        message: Message to send

    Returns:
        The response from the chat endpoint
    """
    url = 'https://5xxe59fsr7.execute-api.eu-central-1.amazonaws.com/main/chat'
    headers = {
        "x-api-key": os.environ["AWS_API_KEY"]
    }
    payload = {
        "message": message,
        "persona_id": persona_id,
        "conversation_id": conversation_id,
        "team_id": "ArturTeam",
    }

    response = requests.post(
        url=url,
        json=payload,
        headers=headers
    )

    if response.status_code != 200:
        print(f"Err: {response}, {persona_id}, {response.text}")
        return None

    response_data = response.json()
    return response_data['response'], response_data['conversation_id']


def get_conversation(persona_id: str, max_turns: int = 5, verbose: bool = True) -> List[str]:
    """
    Perform a conversation with a single persona for a maximum number of turns.

    Args:
        persona_id: ID of the persona
        max_turns: Maximum number of conversation turns (default: 5)

    Returns:
        List of conversation messages
    """
    system_prompt = """
    **Speak only in english.**
    **Do not give info from the internet.** 
    Continue to ask questions about this person - do not provide the jobs, trainings or anything yet.
    You are a helpful and empathetic assistant. Your goal is to engage in a natural conversation with a persona to gather the following information:
    - Their name
    - Their skills and **level of this skill**
    - Their current location
    - Their age
    - Their preferences
    - Years of experience
    
    Remember to always gather all of those information!
    Ask open-ended questions to encourage detailed responses. Be polite, patient, and adapt your questions based on their answers.
    If the persona is unsure or vague, gently probe for more details. Do not ask all questions at once; let the conversation flow naturally.
    **Do not comment on whatever the response is. Just ask questions to retrieve the information.**
    **Focus only on asking question about persona. Do not provide any additional info from the internet.**
    """

    
    conversation = []
    current_turn = 0
    converation_agent = get_agent(system_prompt)
    conversation_id = None

    # greeting
    agent_message = "Hello! I'm here to help you find the best job or training opportunities. Can you tell me your name?"
    converation_agent.messages = [{
        "role": "assistant",
        "content": [{
            "text": agent_message
        }]
    }]
    conversation.append(f"Assistant: {agent_message}")

    while current_turn < max_turns:
        user_response, conversation_id = send_message_to_chat(
            agent_message,
            persona_id,
            conversation_id
        )
        if user_response is None:
            print(f"User: {persona_id} did not respond")
            break
        elif verbose:
            print(f"Response: {user_response}\n\n")
        conversation.append(f"User: {user_response}")
        agent_message = str(converation_agent(user_response))
        conversation.append(f"Assistant: {agent_message}")
        current_turn += 1
    print(f'Conversation for persona: {persona_id} DONE')
    return conversation

#maybe pararel_conversations() ?
def get_conversations(persona_ids: List[str], max_workers: int = 10, verbose: bool = False, max_turns: int = 5) -> Dict[str, List[str]]:
    all_conversations = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(get_conversation, persona_id, verbose=verbose, max_turns=max_turns): persona_id
            for persona_id in persona_ids
        }
        for future in futures:
            persona_id = futures[future]
            try:
                all_conversations[persona_id] = future.result()
            except Exception as e:
                print(f"Error occurred for {persona_id}: {e}")
    return all_conversations



# Extracting information

In [4]:
def extract_info(model: Type[M], text: str) -> M:
    extraction_agent = get_agent()
    return extraction_agent.structured_output(output_model=model, prompt=text)

def extract_info_from_conversation(conversation: List[str]) -> PersonaInfo:
    text = '\n'.join(conversation)
    return extract_info(PersonaInfo, text)


def extract_info_from_job_path(path: str | Path) -> JobInfo:
    text = load_file_content(path)
    return extract_info(JobInfo, text)


def extract_info_from_training_path(path: str | Path) -> TrainingInfo:
    text = load_file_content(path)
    return extract_info(TrainingInfo, text)

In [8]:
# conversation = get_conversation('per1', max_turns=2)

In [9]:
# persona_info = extract_info_from_conversation(conversation)
# persona_info

## Testing the persona with  trainig only

In [14]:
persona_training_only = ["persona_005"] #pers_foo_003

persona_conversations = get_conversations(persona_training_only, max_workers=1, max_turns=5)

persona_infos = {
    persona_id: extract_info_from_conversation(con)
    for persona_id, con in persona_conversations.items()
}

for info in persona_infos:
    print(info)

Conversation for persona: persona_005 DONE
persona_005


In [15]:
persona_infos

{'persona_005': PersonaInfo(name='Pedro Araújo', skills=[(['Food Safety', 'Food Sustainability', 'Food Production'], ['Intermediate', 'Intermediate', 'Intermediate']), (['Regulatory Compliance', 'Process Optimization', 'Hands-on Technical Abilities'], ['Intermediate', 'Intermediate', 'Intermediate'])], location='Brasília', age='16', years_of_experience='1', domain='Food Industry')}

## Testing perosna with job but empty trainig list (job_trainig flag)

In [37]:
# perosna_job_only = ["persona_003"] #pers_foo_002

# persona_conversations = get_conversation("persona_003", max_turns=5)

# persona_conversations = get_conversations(perosna_job_only, max_workers=1, max_turns=5)

# persona_infos = {
#     persona_id: extract_info_from_conversation(con)
#     for persona_id, con in persona_conversations.items()
# }

# for info in persona_infos:
#     print(info)

Conversation for persona: persona_003 DONE
persona_003


## Testing the persona with jobs and trainigs

In [6]:
# perosna_job_and_training = ["persona_004"] #pers_foo_001
# persona_conversations = get_conversations(perosna_job_and_training, max_workers=1, max_turns=5)

# persona_infos = {
#     persona_id: extract_info_from_conversation(con)
#     for persona_id, con in persona_conversations.items()
# }

# for info in persona_infos:
#     print(info)

Conversation for persona: persona_004 DONE
persona_004


In [14]:
awerness_perosna = ["persona_001"] #pers_foo_005
persona_conversations = get_conversations(awerness_perosna, max_workers=1, max_turns=5)

persona_infos = {
    persona_id: extract_info_from_conversation(con)
    for persona_id, con in persona_conversations.items()
}

for info in persona_infos:
    print(info)


KeyboardInterrupt



In [7]:
persona_ids = ["persona_050", "persona_053", "persona_030"] # "persona_033", "persona_022", "persona_096", "persona_080", "persona_082", "persona_078", "persona_079"]
#persona_ids = ["persona_039"]
persona_conversations = get_conversations(persona_ids, max_workers=3, max_turns=10)
persona_conversations
# persona_infos = {
#     persona_id: extract_info_from_conversation(con)
#     for persona_id, con in persona_conversations.items()
# }

# for info in persona_infos:
#     print(info)

#AFTER ITS DONE IT SHOULD GIVE A RECOMENDATION-------------------------


Exception in thread ThreadPoolExecutor-0_1:
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/opentelemetry/instrumentation/threading/__init__.py", line 150, in __wrap_threading_run
    token = context.attach(instance._otel_context)
AttributeError: 'Thread' object has no attribute '_otel_context'


Conversation for persona: persona_050 DONE
Conversation for persona: persona_053 DONE
Conversation for persona: persona_030 DONE


{'persona_050': ["Assistant: Hello! I'm here to help you find the best job or training opportunities. Can you tell me your name?",
  'User: I’m André Araújo. Nice to meet you.',
  'Assistant: Nice to meet you too, André! It’s great to have you here.\n\nCould you tell me a bit about where you’re currently living? For example, the city or region—whatever you’re comfortable sharing.\n',
  'User: I live in Curitiba. I don’t want to move anywhere else.',
  'Assistant: That’s great to know, André. Curitiba is a wonderful city.\n\nCould you share a little about what you do or what skills you’ve developed over time? For example, are there any areas where you feel particularly strong or experienced?\n',
  'User: I don’t have work experience yet. I just like numbers and organizing things.',
  'Assistant: I see. When you say you like numbers and organizing things, what kind of activities do you enjoy most? For example, do you work with spreadsheets, budgets, or maybe planning tasks?\n\nAlso, how 

In [13]:
# Generate persona IDs from persona_001 to persona_100
# persona_ids = [f"persona_{i:03d}" for i in range(1, 101)]
persona_ids = [f"persona_{i:03d}" for i in range(1, 11)]

# Directory to save conversations
output_dir = "conversations"
os.makedirs(output_dir, exist_ok=True)

# Process in batches (e.g., batch size of 10)
batch_size = 10
persona_infos = {}

for i in range(0, len(persona_ids), batch_size):
    batch_ids = persona_ids[i:i + batch_size]
    try:
        # Get conversations for this batch with parallel workers
        batch_conversations = get_conversations(batch_ids, max_workers=10, max_turns=10)
        for persona_id, con in batch_conversations.items():
            # Save conversation to file
            filename = os.path.join(output_dir, f"{persona_id}_conversation.txt")
            with open(filename, 'w') as f:
                f.write(str(con))
            # Extract info and add to persona_infos
            persona_infos[persona_id] = extract_info_from_conversation(con)
            print(f"Processed {persona_id}")
    except Exception as e:
        print(f"Error processing batch starting with {batch_ids[0]}: {e}")
        continue

# Now persona_infos contains information for all successfully processed personas
for info in persona_infos.values():
    print(info)

Conversation for persona: persona_006 DONE
Conversation for persona: persona_004 DONE
Conversation for persona: persona_008 DONE
Conversation for persona: persona_002 DONE
Conversation for persona: persona_001 DONE
Conversation for persona: persona_007 DONE
Conversation for persona: persona_010 DONE
Conversation for persona: persona_009 DONE
Conversation for persona: persona_003 DONE
Conversation for persona: persona_005 DONE
Processed persona_001
Processed persona_002
Processed persona_003
Processed persona_004
Processed persona_005
Processed persona_006
Processed persona_007
Processed persona_008
Processed persona_009
Processed persona_010
name='Rafael Ribeiro' skills=[{'skill': 'Following instructions', 'level': 'Good'}, {'skill': 'Cleaning', 'level': 'Basic experience with soap and water'}, {'skill': 'Teamwork', 'level': 'Enjoys group work'}, {'skill': 'Observational learning', 'level': 'Prefers watching and hands-on activities'}] location='São Paulo' age='21' years_of_experience='

In [7]:
job_file_paths = get_job_paths()
job_infos = {path.stem: extract_info_from_job_path(path) for path in tqdm(job_file_paths)}
#list(job_infos.items())[0]

#dodac zapisywanie jsonów do folderów job_infos.model_dump_json

100%|██████████| 3/3 [00:04<00:00,  1.62s/it]


In [8]:
training_file_paths = get_training_paths()
training_infos = {path.stem: extract_info_from_training_path(path) for path in tqdm(training_file_paths)}
#list(training_infos.items())[0]

#dodac zapisywanie trainignów do folderów trainngs_infos.model_dump_json

100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


In [25]:
matches = []

for persona_id, persona_info in persona_infos.items():
    persona_dict = {"persona_id": persona_id}
    job_matches = []
    training_matches = []
    for job_id, job_info in job_infos.items():
        if persona_info.job_fits(job_info):
            job_matches.append((job_id, job_info))
    for training_id, training_info in training_infos.items():
        if persona_info.training_fits(training_info):
            training_matches.append((training_id, training_info))

# job_matches
# training_matches
print(job_matches)
print(training_matches)

[]
[('tr_foo_waste_reduction_03', TrainingInfo(title='Master the Basics of Food Industry', domain='Food Industry', skill_acquired_and_level=({'level': 'Avançado', 'skill': 'Waste Reduction'},))), ('tr_vis_portfolio_development_03', TrainingInfo(title='Portfolio Development', domain='Visual Arts', skill_acquired_and_level=({'level': 'Avançado', 'skill': 'Portfolio Development'},))), ('tr_foo_waste_reduction_01', TrainingInfo(title='Waste Reduction - Básico', domain='Food Industry', skill_acquired_and_level=({'Skill': 'Waste Reduction', 'Level': 'Básico'},))), ('tr_vis_digital_art_software_if_applicable__02', TrainingInfo(title='Master the Basics of Visual Arts!', domain='Visual Arts', skill_acquired_and_level=({'level': 'Intermediário', 'skill': 'Digital Art Software'},))), ('tr_foo_production_line_management_01', TrainingInfo(title='Production Line Management - Básico', domain='Food Industry', skill_acquired_and_level=({'level': 'Básico', 'skill': 'Production Line Management'},))), ('t

In [1]:
persona_type = ["trainings_only", "jobs+trainings", "awareness"]

persona_dict = {"perosna_id": perosna_id, "predicted_type": persona_type}
if persona_type == "jobs+trainings":
    jobs_list = []
    persona_training_ids = {training_id for training_id, _ in training_matches}
    for job_id, job_info in job_matches:
        #suggested trainings from job_info?
        suggested_trainings = getattr(job_info, 'suggested_trainings', [])
        # Filter to only those trainings that the persona matches with
        filtered_trainings = [
            tr_id for tr_id in suggested_trainings
            if tr_id in persona_training_ids
        ]
        job_entry = {
            "job_id": job_id,
            "suggested_trainings": filtered_trainings
        }
        jobs_list.append(job_entry)
    persona_dict["jobs"] = jobs_list

elif persona_type == "trainings_only":
    training_ids = [training_id for training_id, _ in training_matches]
    persona_dict["trainings"] = training_ids

elif persona_type == "awareness":
    persona_dict["gold_items"] = "info"

In [9]:
def generate_persona_json(persona_infos, job_infos, training_infos):
    """
    Generate JSON output for all personas.

    Args:
        persona_infos: Dictionary of persona_id to persona_info objects
        job_infos: Dictionary of job_id to job_info objects
        training_infos: Dictionary of training_id to training_info objects

    Returns:
        List of persona dictionaries ready for JSON serialization
    """
    results = []

    for persona_id, persona_info in persona_infos.items():
        job_matches = []
        training_matches = []

        for job_id, job_info in job_infos.items():
            if persona_info.job_fits(job_info):
                job_matches.append((job_id, job_info))

        for training_id, training_info in training_infos.items():
            if persona_info.training_fits(training_info):
                training_matches.append((training_id, training_info))
        
        training_matches = [
            training_id  # We only care about the ID, not the info
            for training_id in training_infos.keys()
            if persona_info.training_fits(training_infos[training_id])
        ]

        # Build the persona dictionary based on matches
        persona_dict = {"persona_id": persona_id}

        if job_matches:
            # This is a jobs+trainings persona
            persona_dict["predicted_type"] = "jobs+trainings"

            jobs_list = []
            #
            matched_training_ids = set(training_matches)

            for job_id, job_info in job_matches:
                #suggested trainings from job_info
                suggested_trainings = getattr(job_info, 'suggested_trainings', [])

                #filter to only those trainings that the persona matches with
                filtered_trainings = [
                    tr_id for tr_id in suggested_trainings
                    if tr_id in matched_training_ids
                ]

                jobs_list.append({
                    "job_id": job_id,
                    "suggested_trainings": filtered_trainings
                })

            persona_dict["jobs"] = jobs_list

        elif training_matches:
            # This is a trainings_only persona
            persona_dict["predicted_type"] = "trainings_only"
            persona_dict["trainings"] = training_matches

        else:
            # This is an awareness persona
            persona_dict["predicted_type"] = "awareness"
            persona_dict["gold_items"] = "info" #I dont really like that but thats something we have in our gold json

        results.append(persona_dict)

    return results


In [10]:
print(generate_persona_json(persona_infos, job_infos, training_infos))

[{'persona_id': 'persona_004', 'predicted_type': 'trainings_only', 'trainings': ['tr_foo_waste_reduction_03', 'tr_foo_waste_reduction_01', 'tr_foo_production_line_management_01', 'tr_foo_recipe_scaling_01', 'tr_foo_product_development_01', 'tr_foo_food_safety_standards_haccp_gmp__01', 'tr_foo_quality_inspection_01', 'tr_foo_material_procurement_01']}]


In [None]:
generate_persona_json()

In [1]:
data = [{'persona_id': 'persona_004', 'predicted_type': 'trainings_only', 'trainings': ['tr_foo_waste_reduction_03', 'tr_foo_waste_reduction_01', 'tr_foo_production_line_management_01', 'tr_foo_recipe_scaling_01', 'tr_foo_product_development_01', 'tr_foo_food_safety_standards_haccp_gmp__01', 'tr_foo_quality_inspection_01', 'tr_foo_material_procurement_01']}]

In [6]:
with open('submission.jsonl', 'w') as file:  # Open in write mode with .jsonl extension
    for item in data:
        json.dump(item, file)  # Write the JSON object
        file.write('\n')  

In [7]:
from validator import validate_file

validate_file('submition.json')

<validator.ValidationResult at 0x7f32e1554130>

In [19]:
print(persona_infos)
# print(job_infos)
# print(training_infos)

{'persona_053': PersonaInfo(name='Lucas Ferreira', skills=['basic data interpretation', 'Excel (SUM, AVERAGE)'], location='Brasília, Brazil', age='28')}


In [11]:
print(persona_infos)

{'persona_004': PersonaInfo(name='Rafael Gomes', skills=[('Product Development', 'Intermediate'), ('Quality Assurance', 'Intermediate')], location='Rio de Janeiro', age='27', years_of_experience='1', domain='Food Industry')}
