In [1]:
import os
from datetime import date

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_mistralai.chat_models import ChatMistralAI

# System prompt that turns one raw candidate profile into engineered features.
# Template placeholders:
#   {candidate_json} - the candidate profile, supplied by the caller on every invocation.
#   {current_date}   - reference date for ongoing roles, pre-bound below via `.partial(...)`.
# Doubled braces ({{ and }}) are escapes for literal braces in the output example.
FEATURE_ENGINEERING_PROMPT = """
**Role:** You are an expert HR data analyst and feature engineer. Your task is to process a JSON object containing a candidate's professional profile and transform it into a structured set of features for a job recommendation search engine.

**Objective:** Analyze the provided candidate JSON data. From this data, you must calculate, infer, and extract specific features. Your final output must be a single, clean JSON object containing only the engineered features listed below, without any additional explanations or conversational text.

---

**Instructions:**

Given the input candidate profile below, perform the following feature engineering tasks:

1.  **`total_years_of_experience`**: Calculate the total years of professional work experience. Sum the duration of all roles listed in the `experiences` array. If a role is ongoing or has no `end_date`, calculate the experience up to the current date (`{current_date}`). Provide the result as a floating-point number.
2.  **`seniority_level`**: Infer the candidate's seniority level based on their `total_years_of_experience` and job titles. Classify it into one of the following categories: **"Junior"**, **"Mid-level"**, **"Senior"**, **"Lead"**, or **"Manager/Director"**.
3.  **`education_level`**: Determine the highest level of education achieved by the candidate from the `education` array. Classify it into one of the following: **"High School"**, **"Bachelor's"**, **"Master's"**, **"PhD"**, or **"Other"**.
4.  **`skill_keywords`**: Generate a single, comprehensive, and deduplicated list of the candidate's technical skills, tools, and languages.
    * First, include all skills from the explicit `skills` array.
    * Second, meticulously scan the `description` fields within both `experiences` and `education`. From these descriptions, extract any mentioned technologies, programming languages, frameworks, libraries, databases, and tools.
    * **Crucially, you must infer and include the foundational programming language when a framework or library is mentioned.** For example:
        * If you see "Flask", "Django", or "PyTorch", you **must** include "Python".
        * If you see "React", "Express.js", or "Vue", you **must** include "JavaScript".
        * If you see "Spring Boot", you **must** include "Java".
        * If you see ".NET", you **must** include "C#".
5.  **`recent_job_title`**: Identify and return the most recent job title from the `experiences` array.
6.  **`recent_company`**: Identify and return the company of the most recent job from the `experiences` array.
7.  **`candidate_summary`**: Generate a concise, professional summary (2-3 sentences) of the candidate's profile. This summary should highlight their total experience, key skills, and most recent role, making it suitable for a recruiter's initial screening.

---

**Input Candidate Data:**

```json
{candidate_json}
```

---

**Required Output Format:**

You **MUST** provide your response as a single, valid JSON object with the following structure. Do not include any text before or after the JSON object.

```json
{{
  "total_years_of_experience": <float>,
  "seniority_level": "<string>",
  "education_level": "<string>",
  "skill_keywords": ["<string>", "<string>", ...],
  "recent_job_title": "<string>",
  "recent_company": "<string>",
  "candidate_summary": "<string>"
}}
```
"""

model_name = 'mistral-small-latest'

# Bind today's date (ISO format, like the dates in the profiles) once, when the
# chain is built, so ongoing roles are measured against the real current date
# while callers still only pass `candidate_json`.
prompt = ChatPromptTemplate.from_messages(
    [("system", FEATURE_ENGINEERING_PROMPT)]
).partial(current_date=date.today().isoformat())
chat = ChatMistralAI(api_key=os.environ['MISTRAL_API_KEY'], model_name=model_name, temperature=0)
# prompt -> chat model -> JSON parser
candidate_fe_chain = prompt | chat | JsonOutputParser()



In [2]:
import json
from typing import Any

def _load_json(path: str) -> Any:
    """Open ``path`` as UTF-8 text and return the parsed JSON document."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
    
# Sample fixtures used by the demo cells below.
candidates = _load_json("app/data/candidates.json")
jobs = _load_json("app/data/jobs.json")


In [7]:
a = candidates[1]
print('Original: \n', a)

candidates_enriched = await candidate_fe_chain.ainvoke({"candidate_json": a})
print('Feature Engineered: \n', candidates_enriched)

Original: 
 {'first_name': 'Bob', 'last_name': 'Smith', 'birthdate': '1988-07-22', 'age': 36, 'email': 'bob.smith@example.com', 'phone': '+19876543210', 'address': '45 Broadway, New York, USA', 'skills': ['JavaScript', 'React', 'Node.js'], 'experiences': [{'company': 'Webify', 'role': 'Frontend Developer', 'start_date': '2013-01-01', 'end_date': '2016-06-30', 'description': 'Built SPAs with React.'}, {'company': 'CloudApps', 'role': 'Fullstack Developer', 'start_date': '2016-07-01', 'end_date': '2021-12-31', 'description': 'Worked on Node.js + React fullstack apps.'}], 'education': [{'institution': 'College X', 'degree': 'B.Sc. in Information Technology', 'year_of_graduation': 2010, 'description': '...'}]}
Feature Engineered: 
 {'total_years_of_experience': 12.5, 'seniority_level': 'Senior', 'education_level': "Bachelor's", 'skill_keywords': ['JavaScript', 'React', 'Node.js', 'Python'], 'recent_job_title': 'Fullstack Developer', 'recent_company': 'CloudApps', 'candidate_summary': "Bob 

In [4]:
# System prompt that turns one raw job posting into engineered features
# (extracted_skills, seniority_level, required_experience_years,
# location_normalized, job_summary_for_embedding).
# `{job_json}` is the only template placeholder; the doubled braces in the
# output example are escapes for literal braces.
JOB_FEATURE_ENGINEERING_PROMPT = """
**Role:** You are an expert technical recruiter and data analyst. Your task is to process a JSON object containing a job posting and transform it into a structured set of features for a candidate recommendation engine.

**Objective:** Analyze the provided job posting JSON. From this data, you must infer, extract, and standardize specific features that can be used to match against candidate profiles. Your final output must be a single, clean JSON object containing only the engineered features listed below.

---

**Instructions:**

Given the input job posting below, perform the following feature engineering tasks:

1.  **`extracted_skills`**: Generate a single, comprehensive, and deduplicated list of all required technical skills, tools, and languages.
    * First, include all skills from the `required_skills` array.
    * Second, meticulously scan the `job_description` for any other mentioned technologies, programming languages, frameworks, libraries, databases, and tools.
    * **Crucially, you must infer and include the foundational programming language when a framework or library is mentioned.** For example:
        * If the description mentions "Flask" or "Django", you **must** include "Python".
        * If it mentions "React" or "Express.js", you **must** include "JavaScript".
        * If it mentions "Spring Boot", you **must** include "Java".

2.  **`seniority_level`**: Infer the job's seniority level based on the `job_title` and keywords within the `job_description` (e.g., "senior," "lead," "principal," "entry-level"). Classify it into one of the following categories, which must align with the candidate seniority levels: **"Junior"**, **"Mid-level"**, **"Senior"**, **"Lead"**, or **"Manager/Director"**.

3.  **`required_experience_years`**: Identify the minimum years of professional experience required for the role, mentioned in the `job_description` (e.g., "5+ years of experience").
    * If a specific number is mentioned, extract it as a floating-point number.
    * If no specific number is mentioned, infer a reasonable minimum based on the `seniority_level` (e.g., Junior: 0, Mid-level: 2, Senior: 5, Lead: 8).

4.  **`location_normalized`**: Standardize the location information. If the location is remote, specify that. For on-site roles, provide the city and state/country. Examples: "San Francisco, CA", "London, UK", "Remote (USA)", "Remote (Global)".

5.  **`job_summary_for_embedding`**: Generate a concise summary (2-3 sentences) of the role. This summary should capture the core responsibilities, the main technologies used, and the company's mission or team environment. This text will be used to create a vector embedding for semantic search.

---

**Input Job Data:**

```json
{job_json}
```

---

**Required Output Format:**

You **MUST** provide your response as a single, valid JSON object with the following structure. Do not include any text before or after the JSON object.

```json
{{
  "extracted_skills": ["<string>", "<string>", ...],
  "seniority_level": "<string>",
  "required_experience_years": <float>,
  "location_normalized": "<string>",
  "job_summary_for_embedding": "<string>"
}}
```
"""

# Job feature-engineering chain: prompt -> Mistral chat model -> JSON parser.
model_name = "mistral-small-latest"

prompt = ChatPromptTemplate.from_messages(
    [("system", JOB_FEATURE_ENGINEERING_PROMPT)]
)
chat = ChatMistralAI(
    api_key=os.environ["MISTRAL_API_KEY"],
    model_name=model_name,
    temperature=0,
)
job_fe_chain = prompt | chat | JsonOutputParser()


In [5]:
b = jobs[0]
print('Original: \n', b)

job_enriched = await job_fe_chain.ainvoke({"job_json": b})
print('Feature Engineered: \n', job_enriched)

Original: 
 {'job_title': 'Software Engineer', 'job_description': 'Responsible for developing and maintaining software applications.', 'budget': {'min': 70000, 'max': 90000, 'currency': 'USD'}, 'location': 'San Francisco, CA', 'company_name': 'Tech Corp', 'employment_type': 'Full-time', 'required_skills': ['JavaScript', 'Python', 'Java']}
Feature Engineered: 
 {'extracted_skills': ['JavaScript', 'Python', 'Java'], 'seniority_level': 'Mid-level', 'required_experience_years': 2.0, 'location_normalized': 'San Francisco, CA', 'job_summary_for_embedding': 'Software Engineer at Tech Corp in San Francisco, CA. Responsible for developing and maintaining software applications using JavaScript, Python, and Java. Join a dynamic team focused on innovation and technology.'}


In [None]:
import os
from mistralai import Mistral

# Mistral client and embedding model shared by the embedding cells.
api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-embed"

client = Mistral(api_key=api_key)

# Three hand-written candidate summaries to compare against one job summary.
candidate_summary_1 = (
    "Alice Johnson is a Senior Backend Engineer with over 10 years of experience in software development. "
    "Her expertise includes Python, Flask, Docker, and various JavaScript frameworks. "
    "Most recently, she led a microservices migration at InnovateX."
)
candidate_summary_2 = (
    "Bob Smith is a Senior Fullstack Developer with 12.5 years of experience, specializing in JavaScript, React, and Node.js. "
    "Most recently, he worked at CloudApps, where he developed fullstack applications. "
    "He holds a Bachelor's degree in Information Technology."
)
candidate_summary_3 = (
    "Alex is a Senior Sales with 15 years of experience, specializing in Commodity Market sector. "
    "Most recently, he worked at CloudApps, where he developed fullstack applications. "
    "He holds a Bachelor's degree in Information Technology."
)

job_summary = (
    "Software Engineer at Tech Corp in San Francisco, CA. "
    "Responsible for developing and maintaining software applications using JavaScript, Python, and Java. "
    "Join a dynamic team focused on innovation and technology."
)


In [15]:
from sklearn.metrics.pairwise import euclidean_distances

def get_text_embedding(inputs):
    """Embed ``inputs`` using the module-level ``client`` and ``model``.

    Only the embedding of the first item in the response's ``data`` is
    returned, so callers pass a one-element list per text.
    """
    response = client.embeddings.create(model=model, inputs=inputs)
    first_item = response.data[0]
    return first_item.embedding

# Embed each candidate summary (one request per summary) and the job summary,
# then print the Euclidean distance of every candidate to the job.
sentences = [candidate_summary_1, candidate_summary_2, candidate_summary_3]

embeddings = []
for sentence in sentences:
    embeddings.append(get_text_embedding([sentence]))

reference_sentence = job_summary
reference_embedding = get_text_embedding([reference_sentence])

for position in range(len(sentences)):
    distance = euclidean_distances([embeddings[position]], [reference_embedding])
    print(sentences[position], distance)

Alice Johnson is a Senior Backend Engineer with over 10 years of experience in software development. Her expertise includes Python, Flask, Docker, and various JavaScript frameworks. Most recently, she led a microservices migration at InnovateX. [[0.66553928]]
Bob Smith is a Senior Fullstack Developer with 12.5 years of experience, specializing in JavaScript, React, and Node.js. Most recently, he worked at CloudApps, where he developed fullstack applications. He holds a Bachelor's degree in Information Technology. [[0.67950844]]
Alex is a Senior Sales with 15 years of experience, specializing in Commodity Market sector. Most recently, he worked at CloudApps, where he developed fullstack applications. He holds a Bachelor's degree in Information Technology. [[0.70537118]]


In [19]:
import os
from mistralai import Mistral
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- Configuration ---
# Fail fast with a readable message when the key is missing or empty.
api_key = os.environ.get("MISTRAL_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("MISTRAL_API_KEY environment variable not set.")

model = "mistral-embed"
client = Mistral(api_key=api_key)

# --- Input Data ---
candidate_summaries = [
    (
        "Alice Johnson is a Senior Backend Engineer with over 10 years of experience in software development. "
        "Her expertise includes Python, Flask, Docker, and various JavaScript frameworks. "
        "Most recently, she led a microservices migration at InnovateX."
    ),
    (
        "Bob Smith is a Senior Fullstack Developer with 12.5 years of experience, specializing in JavaScript, React, and Node.js. "
        "Most recently, he worked at CloudApps, where he developed fullstack applications. "
        "He holds a Bachelor's degree in Information Technology."
    ),
    (
        "Alex is a Senior Sales with 15 years of experience, specializing in Commodity Market sector. "
        "Most recently, he worked at CloudApps, where he developed fullstack applications. "
        "He holds a Bachelor's degree in Information Technology."
    ),
]
job_summary = (
    "Software Engineer at Tech Corp in San Francisco, CA. "
    "Responsible for developing and maintaining software applications using JavaScript, Python, and Java. "
    "Join a dynamic team focused on innovation and technology."
)

# --- Function for Batch Embedding ---
def get_embeddings_batch(texts, model="mistral-embed"):
    """
    Gets embeddings for a list of texts in a single API call.

    Args:
        texts: The strings to embed. Any iterable of strings is accepted;
            a single bare string is treated as a one-item batch.
        model: Name of the embedding model passed to the API.

    Returns:
        A list with one embedding per item of the response's ``data``.
        An empty list is returned, without calling the API, when ``texts``
        is empty.
    """
    # A bare string must stay one input, not be split into characters below.
    if isinstance(texts, str):
        texts = [texts]
    # Materialize once so generators work and emptiness can be checked.
    texts = list(texts)
    if not texts:
        return []

    embeddings_batch_response = client.embeddings.create(
        model=model,
        inputs=texts
    )
    return [data.embedding for data in embeddings_batch_response.data]

# --- Main Logic ---

# 1. One request covers the job text followed by every candidate text
all_texts = [job_summary, *candidate_summaries]

# 2. Embed everything together (efficient batching)
all_embeddings = get_embeddings_batch(all_texts)

# 3. The first vector is the job (kept 2-D, one row); the rest are the candidates
job_embedding = np.array([all_embeddings[0]])
candidate_embeddings = np.array(all_embeddings[1:])

# 4. Cosine similarity of the job against each candidate (higher is better)
similarity_scores = cosine_similarity(job_embedding, candidate_embeddings)

# 5. Print the candidates from best to worst match
print(f"Ranking candidates against the job: '{job_summary}'\n")
# Pair each summary with its score, then order by score, highest first
ranked_candidates = list(zip(candidate_summaries, similarity_scores[0]))
ranked_candidates.sort(key=lambda pair: pair[1], reverse=True)

for rank, (summary, score) in enumerate(ranked_candidates, start=1):
    print(f"Rank {rank}: Score = {score:.4f}")
    print(f"  Candidate: {summary}\n")


Ranking candidates against the job: 'Software Engineer at Tech Corp in San Francisco, CA. Responsible for developing and maintaining software applications using JavaScript, Python, and Java. Join a dynamic team focused on innovation and technology.'

Rank 1: Score = 0.7785
  Candidate: Alice Johnson is a Senior Backend Engineer with over 10 years of experience in software development. Her expertise includes Python, Flask, Docker, and various JavaScript frameworks. Most recently, she led a microservices migration at InnovateX.

Rank 2: Score = 0.7698
  Candidate: Bob Smith is a Senior Fullstack Developer with 12.5 years of experience, specializing in JavaScript, React, and Node.js. Most recently, he worked at CloudApps, where he developed fullstack applications. He holds a Bachelor's degree in Information Technology.

Rank 3: Score = 0.7514
  Candidate: Alex is a Senior Sales with 15 years of experience, specializing in Commodity Market sector. Most recently, he worked at CloudApps, whe

In [1]:
from app.service.candidate_service import CandidateService
from app.service.llm_manager import LLMManager
from pathlib import Path

llm_manager = LLMManager()
candidate_service = CandidateService(llm_manager=llm_manager)

input_file = Path("app/data/raw_candidates.json")
output_file = Path("app/data/processed_candidates.json")

# Run the processing pipeline
await candidate_service.process_candidates_from_file(
    input_path=input_file, output_path=output_file
)

Processing Candidates: 100%|██████████| 10/10 [00:04<00:00,  2.01it/s]


In [1]:
from app.model.schemas import RawJob
from app.service.search_service import SearchService
from app.service.llm_manager import LLMManager
from pathlib import Path
import json

processed_candidates_file = Path("app/data/processed_candidates.json")
raw_jobs_file = Path("app/data/raw_jobs.json")

# Load a sample job: the first entry of the raw jobs file.
try:
    # Explicit UTF-8 so the read does not depend on the platform's default encoding.
    with open(raw_jobs_file, "r", encoding="utf-8") as f:
        sample_job_data = json.load(f)[0]
    print(sample_job_data)
    sample_job = RawJob(**sample_job_data)
except (FileNotFoundError, IndexError) as exc:
    # Stop the cell here. Merely printing would let execution continue and
    # either hit a NameError on `sample_job` or silently reuse a stale value
    # left in the kernel by an earlier run.
    raise RuntimeError(f"Could not load sample job from {raw_jobs_file}.") from exc


# --- EXECUTION ---
llm_manager = LLMManager()
search_service = SearchService(llm_manager, processed_candidates_file)

top_candidates = await search_service.find_top_candidates(sample_job, top_n=5)

# --- RESULTS ---
print("\n--- Top 5 Recommended Candidates ---")
for ranked_candidate in top_candidates:
    candidate_info = ranked_candidate.candidate.original_data
    print(
        f"  - Name: {candidate_info.first_name} {candidate_info.last_name} "
        f"| Score: {ranked_candidate.score:.4f}"
    )
    print(
        f"    Summary: {ranked_candidate.candidate.engineered_features.candidate_summary}"
    )

{'job_title': 'Software Engineer', 'job_description': 'Responsible for developing and maintaining software applications.', 'budget': {'min': 70000, 'max': 90000, 'currency': 'USD'}, 'location': 'San Francisco, CA', 'company_name': 'Tech Corp', 'employment_type': 'Full-time', 'required_skills': ['JavaScript', 'Python', 'Java']}
extracted_skills=['JavaScript', 'Python', 'Java'] seniority_level='Mid-level' required_experience_years=2.0 job_summary_for_embedding='Software Engineer at Tech Corp responsible for developing and maintaining software applications. The role involves working with JavaScript, Python, and Java. The company offers a competitive salary range of $70,000 to $90,000 USD for this full-time position in San Francisco, CA.'

--- Top 5 Recommended Candidates ---
  - Name: Alice Johnson | Score: 0.7773
    Summary: Alice Johnson is a Senior Backend Engineer with 9.5 years of experience in software development. Her expertise includes Python, Flask, and Docker, with a strong bac