In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests  # Uncomment this if you are making API calls
import json  # Uncomment this if you are making API calls

In [26]:
# Function for simplified text preprocessing
def simple_preprocess_text(text):
    text = text.lower()
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token.isalnum()]
    return ' '.join(filtered_tokens)

In [27]:
from google.colab import files
uploaded = files.upload()

Saving candidates.csv to candidates.csv
Saving companies.csv to companies.csv
Saving vacancies.csv to vacancies.csv


In [28]:
# Load the data
candidates_df = pd.read_csv('candidates.csv')
companies_df = pd.read_csv('companies.csv')
vacancies_df = pd.read_csv('vacancies.csv')

In [29]:
# Text Preprocessing
candidates_df['processed_mission'] = candidates_df['mission_statement'].apply(simple_preprocess_text)
companies_df['processed_mission'] = companies_df['mission_statement'].apply(simple_preprocess_text)
vacancies_df['processed_description'] = vacancies_df['description'].apply(simple_preprocess_text)

In [30]:
# Keyword Extraction (For demonstration, let's assume 'manager', 'operations', and 'fashion' are domain-specific keywords)
keywords = ['manager', 'operations', 'fashion']

In [31]:
# Add keyword presence as features
for keyword in keywords:
    candidates_df[keyword] = candidates_df['processed_mission'].apply(lambda x: int(keyword in x))
    companies_df[keyword] = companies_df['processed_mission'].apply(lambda x: int(keyword in x))
    vacancies_df[keyword] = vacancies_df['processed_description'].apply(lambda x: int(keyword in x))

In [32]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
combined_corpus = pd.concat([candidates_df['processed_mission'], companies_df['processed_mission'], vacancies_df['processed_description']], ignore_index=True)
tfidf_vectorizer.fit(combined_corpus)

candidates_tfidf = tfidf_vectorizer.transform(candidates_df['processed_mission'])
companies_tfidf = tfidf_vectorizer.transform(companies_df['processed_mission'])
vacancies_tfidf = tfidf_vectorizer.transform(vacancies_df['processed_description'])

In [33]:
# Include keyword features in TF-IDF matrix
candidates_features = np.hstack([candidates_tfidf.toarray(), candidates_df[keywords].values])
companies_features = np.hstack([companies_tfidf.toarray(), companies_df[keywords].values])
vacancies_features = np.hstack([vacancies_tfidf.toarray(), vacancies_df[keywords].values])

In [34]:
# Cosine Similarity
candidate_company_sim = cosine_similarity(candidates_features, companies_features)
candidate_vacancy_sim = cosine_similarity(candidates_features, vacancies_features)

In [35]:
# Function to get top N matches for each vacancy
def get_top_matches(candidate_vacancy_sim, candidate_company_sim, top_n=3, api_key=None):
    top_matches = {}
    matches_for_api = []

    for vacancy_id in range(candidate_vacancy_sim.shape[1]):
        vacancy_sim_scores = candidate_vacancy_sim[:, vacancy_id]
        company_id = vacancies_df.loc[vacancy_id, 'company_id']
        company_sim_scores = candidate_company_sim[:, company_id]

        # Calculate combined similarity
        combined_sim_scores = (vacancy_sim_scores + company_sim_scores) / 2.0

        top_candidate_indices = np.argsort(combined_sim_scores)[::-1][:top_n]

        # Convert NumPy int64 to native Python int before appending to matches_for_api
        matches_for_api.append({
            "vacancy_id": int(vacancy_id),
            "company_id": int(company_id),
            "candidate_ids": [int(idx) for idx in top_candidate_indices.tolist()]
        })

    # Uncomment the lines below to make an API call for match evaluation
    api_url = "https://staging-phloneiron-case-backend-57k2.encr.app/score"
    response = requests.post(api_url, data=json.dumps({"matches": matches_for_api}), headers={"Authorization": f"Bearer {api_key}"})
    if response.status_code == 200:
        top_matches = response.json()
    else:
        top_matches = {}

    return top_matches  # Replace with `top_matches` after enabling the API call

In [36]:
# Get the top 3 matches for each vacancy
api_key = "f0302950-8bf9-4dd6-97f8-2b661e0b58e2"  # Replace with your actual API key
top_3_matches = get_top_matches(candidate_vacancy_sim, candidate_company_sim, top_n=3, api_key=api_key)

In [37]:
# Display or save the top 3 matches
print(top_3_matches)

{}


In [None]:
# Based on the sample data, we have the following features available for feature engineering:

# - For candidates: `mission_statement`
# - For companies: `mission_statement`
# - For vacancies: `description`

# Given the constraints mentioned in the instructions, we might want to focus on:

# 1. **Personal Values**: As this information is likely embedded in the `mission_statement` of both candidates and companies, we should consider this feature prominently when calculating similarities.

# 2. **Area of Expertise**: This could be inferred from keywords in the `description` of vacancies and `mission_statement` of candidates. We could include domain-specific keywords as features.

# #### Proposed Approach:

# 1. **Text Preprocessing**: Preprocess text data to remove noise and make it uniform.
# 2. **Keyword Extraction**: Identify domain-specific keywords from vacancies and add them as features in the candidates' and companies' data.
# 3. **TF-IDF Vectorization**: Convert the preprocessed and enriched text data into a numerical format using TF-IDF.
# 4. **Cosine Similarity**: Calculate the similarity between candidates and vacancies/companies using the TF-IDF vectors.



In [None]:
# The feature engineering has been adjusted based on the proposed approach:

# 1. Text from mission statements and job descriptions has been preprocessed.
# 2. Keyword presence (for example, 'manager', 'operations', 'fashion') has been added as features.
# 3. TF-IDF vectorization has been applied to the processed text.
# 4. Cosine similarity has been calculated using the enriched feature sets.

# The resulting similarity matrices have the following shapes:

# - `candidate_company_sim`: (343 candidates x 118 companies)
# - `candidate_vacancy_sim`: (343 candidates x 492 vacancies)

# These matrices can now be used to find the top matches based on the new features.

In [None]:
# The top 3 matches for each vacancy based on the feature set calculated. following are some of the top matches for demonstration (limited to the first 5 for brevity):

# 1. **Vacancy ID 0** from **Company ID 97**
#    - Top Candidates: [109, 245, 112]

# 2. **Vacancy ID 1** from **Company ID 50**
#    - Top Candidates: [160, 124, 97]

# 3. **Vacancy ID 2** from **Company ID 58**
#    - Top Candidates: [113, 53, 63]

# 4. **Vacancy ID 3** from **Company ID 50**
#    - Top Candidates: [160, 114, 97]

# 5. **Vacancy ID 4** from **Company ID 68**
#    - Top Candidates: [217, 68, 55]

# These matches are based on both the job descriptions and mission statements, enriched with domain-specific keywords.

