In [None]:
from google.colab import files
uploaded = files.upload()

In [44]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# import requests  # Uncomment this if you are making API calls
# import json  # Uncomment this if you are making API calls



In [45]:
# Function for simplified text preprocessing
def simple_preprocess_text(text):
    text = text.lower()
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token.isalnum()]
    return ' '.join(filtered_tokens)



In [46]:
# Load the data
candidates_df = pd.read_csv('candidates.csv')
companies_df = pd.read_csv('companies.csv')
vacancies_df = pd.read_csv('vacancies.csv')



In [47]:
# Text Preprocessing
candidates_df['processed_mission'] = candidates_df['mission_statement'].apply(simple_preprocess_text)
companies_df['processed_mission'] = companies_df['mission_statement'].apply(simple_preprocess_text)
vacancies_df['processed_description'] = vacancies_df['description'].apply(simple_preprocess_text)



In [48]:
# Keyword Extraction (For demonstration, let's assume 'manager', 'operations', and 'fashion' are domain-specific keywords)
keywords = ['manager', 'operations', 'fashion']



In [49]:
# Add keyword presence as features
for keyword in keywords:
    candidates_df[keyword] = candidates_df['processed_mission'].apply(lambda x: int(keyword in x))
    companies_df[keyword] = companies_df['processed_mission'].apply(lambda x: int(keyword in x))
    vacancies_df[keyword] = vacancies_df['processed_description'].apply(lambda x: int(keyword in x))



In [50]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
combined_corpus = pd.concat([candidates_df['processed_mission'], companies_df['processed_mission'], vacancies_df['processed_description']], ignore_index=True)
tfidf_vectorizer.fit(combined_corpus)

candidates_tfidf = tfidf_vectorizer.transform(candidates_df['processed_mission'])
companies_tfidf = tfidf_vectorizer.transform(companies_df['processed_mission'])
vacancies_tfidf = tfidf_vectorizer.transform(vacancies_df['processed_description'])



In [51]:
# Include keyword features in TF-IDF matrix
candidates_features = np.hstack([candidates_tfidf.toarray(), candidates_df[keywords].values])
companies_features = np.hstack([companies_tfidf.toarray(), companies_df[keywords].values])
vacancies_features = np.hstack([vacancies_tfidf.toarray(), vacancies_df[keywords].values])



In [52]:
# Cosine Similarity
candidate_company_sim = cosine_similarity(candidates_features, companies_features)
candidate_vacancy_sim = cosine_similarity(candidates_features, vacancies_features)



In [53]:
# Function to get top N matches for each vacancy
def get_top_matches(candidate_vacancy_sim, candidate_company_sim, top_n=3, api_key=None):
    top_matches = {}
    matches_for_api = []

    for vacancy_id in range(candidate_vacancy_sim.shape[1]):
        vacancy_sim_scores = candidate_vacancy_sim[:, vacancy_id]
        company_id = vacancies_df.loc[vacancy_id, 'company_id']
        company_sim_scores = candidate_company_sim[:, company_id]

        # Calculate combined similarity
        combined_sim_scores = (vacancy_sim_scores + company_sim_scores) / 2.0

        top_candidate_indices = np.argsort(combined_sim_scores)[::-1][:top_n]

        # Convert NumPy int64 to native Python int before appending to matches_for_api
        matches_for_api.append({
            "vacancy_id": int(vacancy_id),
            "company_id": int(company_id),
            "candidate_ids": [int(idx) for idx in top_candidate_indices.tolist()]
        })

    # Uncomment the lines below to make an API call for match evaluation
    # api_url = "YOUR_API_URL_HERE"
    # response = requests.post(api_url, data=json.dumps({"matches": matches_for_api}), headers={"Authorization": f"Bearer {api_key}"})
    # if response.status_code == 200:
    #     top_matches = response.json()
    # else:
    #     top_matches = {}

    return matches_for_api  # Replace with `top_matches` after enabling the API call



In [54]:
# Get the top 3 matches for each vacancy
api_key = "f0302950-8bf9-4dd6-97f8-2b661e0b58e2"  # Replace with your actual API key
top_3_matches = get_top_matches(candidate_vacancy_sim, candidate_company_sim, top_n=3, api_key=api_key)



In [55]:
# Display or save the top 3 matches
print(top_3_matches)


[{'vacancy_id': 0, 'company_id': 97, 'candidate_ids': [109, 245, 112]}, {'vacancy_id': 1, 'company_id': 50, 'candidate_ids': [160, 124, 97]}, {'vacancy_id': 2, 'company_id': 58, 'candidate_ids': [113, 53, 63]}, {'vacancy_id': 3, 'company_id': 50, 'candidate_ids': [160, 114, 97]}, {'vacancy_id': 4, 'company_id': 68, 'candidate_ids': [217, 68, 55]}, {'vacancy_id': 5, 'company_id': 51, 'candidate_ids': [7, 90, 95]}, {'vacancy_id': 6, 'company_id': 83, 'candidate_ids': [136, 329, 115]}, {'vacancy_id': 7, 'company_id': 18, 'candidate_ids': [299, 338, 260]}, {'vacancy_id': 8, 'company_id': 38, 'candidate_ids': [277, 60, 339]}, {'vacancy_id': 9, 'company_id': 68, 'candidate_ids': [68, 43, 283]}, {'vacancy_id': 10, 'company_id': 80, 'candidate_ids': [137, 245, 109]}, {'vacancy_id': 11, 'company_id': 100, 'candidate_ids': [255, 100, 246]}, {'vacancy_id': 12, 'company_id': 19, 'candidate_ids': [336, 285, 90]}, {'vacancy_id': 13, 'company_id': 103, 'candidate_ids': [97, 28, 114]}, {'vacancy_id': 

In [56]:
# Convert the list of matches to a dictionary format
top_3_matches_dict = {}
for i, match in enumerate(top_3_matches):
    vacancy_id = match["vacancy_id"]
    candidate_ids = match["candidate_ids"]
    top_3_matches_dict[vacancy_id] = candidate_ids

# Get truncated detailed information for the top matches of the first 5 vacancies
truncated_top_matches = get_truncated_detailed_matches(top_3_matches_dict, num_vacancies_to_show=5)

# Displaying the truncated information for better understanding of the matches
print(truncated_top_matches)


{0: {'Company': 'Guardian Insurance Solutions', 'Vacancy Description': 'Job Vacancy: Operations Manager at Guardian Insurance Solutions\n\nWe are currently seeking a highly m...', 'Top Matching Candidates': ['Isabella Martin', 'Anna Smith', 'Sofia Martinez']}, 1: {'Company': 'FabStyle', 'Vacancy Description': 'We are excited to announce a new opportunity at FabStyle, a leading player in the Fashion and Appare...', 'Top Matching Candidates': ['Omar Hussein', 'Amir Hassan', 'Muhammad Ali']}, 2: {'Company': 'InsuraCorp', 'Vacancy Description': 'Job Vacancy: Negotiation Specialist at InsuraCorp\n\nInsuraCorp, a leading insurance company, is seeki...', 'Top Matching Candidates': ['Ricardo Santos', 'Jacob Jensen', 'James Wilson']}, 3: {'Company': 'FabStyle', 'Vacancy Description': 'FabStyle, a leading fashion and apparel company, is seeking a dynamic and creative individual to joi...', 'Top Matching Candidates': ['Omar Hussein', 'Sophia Lee', 'Muhammad Ali']}, 4: {'Company': 'Consultix Consu