In [None]:
# The provided Python code performs the following tasks:

### 1. Import Libraries
# The code starts by importing the necessary Python libraries:
# - `pandas` for data manipulation
# - `numpy` for numerical operations
# - `TfidfVectorizer` and `cosine_similarity` from scikit-learn for text processing and similarity calculations

### 2. Data Loading
# The CSV files for candidates, companies, and vacancies are uploaded through Google Colab's file picker (`files.upload()`) and then read into DataFrames with `pd.read_csv`.

### 3. Text Preprocessing
# A simplified text preprocessing function called `simple_preprocess_text` is defined. This function:
# - Converts the text to lowercase
# - Splits the text into tokens (words)
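# - Drops every token that contains a non-alphanumeric character (so a word with attached punctuation, such as `team,`, is removed entirely)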
# The function is then applied to the mission statements and job descriptions in the data.

### 4. Feature Engineering
#The code combines the preprocessed text from all three datasets to create a single corpus. It then uses the `TfidfVectorizer` to transform this text into a set of feature vectors, creating a consistent feature space across all datasets.

### 5. Cosine Similarity Calculation
#The cosine similarity between the candidates and companies, as well as between the candidates and vacancies, is calculated using the `cosine_similarity` function. This results in two similarity matrices.

### 6. Matching Algorithm
# A function called `get_top_matches` is defined to find the top N candidates for each vacancy. Each candidate's score is the average of their cosine similarity with the vacancy description and their cosine similarity with the mission statement of the company offering the vacancy.

### 7. Output
#The top 3 matching candidates for each vacancy are calculated and stored in a dictionary called `top_3_matches`. This dictionary can then be used for further analysis or output.


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Upload candidates.csv, companies.csv and vacancies.csv through the Colab file picker.
# Outside Colab the google.colab package is not installed, so instead of failing on the
# import we assume the CSV files are already present in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None

# `uploaded` maps file names to their contents on Colab; it is an empty dict elsewhere.
uploaded = files.upload() if files is not None else {}

Saving candidates.csv to candidates.csv
Saving companies.csv to companies.csv
Saving vacancies.csv to vacancies.csv


In [3]:
# Read the three CSV files from the working directory into DataFrames
# (candidates, companies and vacancies, in that order).
candidates_df, companies_df, vacancies_df = (
    pd.read_csv(csv_name)
    for csv_name in ("candidates.csv", "companies.csv", "vacancies.csv")
)

In [4]:
# Simplified text preprocessing function
def simple_preprocess_text(text):
    """Lowercase ``text`` and keep only its purely alphanumeric tokens.

    The text is split on whitespace and every token for which ``str.isalnum()``
    is false is discarded, so a word with attached punctuation (e.g. ``"team,"``)
    is dropped entirely rather than stripped of the punctuation.

    Args:
        text: Raw text. Non-string values (``None``, or the float ``NaN`` that
            pandas uses for empty CSV cells) are treated as empty text instead
            of raising ``AttributeError``.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when no token
        survives.
    """
    if not isinstance(text, str):
        # Missing values reach this function when it is applied to a column
        # that contains empty cells; they contribute no tokens.
        return ''
    return ' '.join(token for token in text.lower().split() if token.isalnum())

In [5]:
# Add a preprocessed copy of each free-text column to its DataFrame.
# Each entry is (frame, raw text column, name of the new processed column).
for frame, raw_column, processed_column in (
    (candidates_df, 'mission_statement', 'simple_processed_mission'),
    (companies_df, 'mission_statement', 'simple_processed_mission'),
    (vacancies_df, 'description', 'simple_processed_description'),
):
    frame[processed_column] = frame[raw_column].apply(simple_preprocess_text)

In [6]:
# Fit one TF-IDF vocabulary on the text of all three datasets so that candidates,
# companies and vacancies are all projected into the same feature space.
processed_columns = [
    candidates_df['simple_processed_mission'],
    companies_df['simple_processed_mission'],
    vacancies_df['simple_processed_description'],
]
combined_corpus = pd.concat(processed_columns, ignore_index=True)

# fit() returns the vectorizer itself, so construction and fitting can be chained.
tfidf_vectorizer = TfidfVectorizer().fit(combined_corpus)

# Transform each dataset separately with the shared, already-fitted vocabulary.
candidates_tfidf, companies_tfidf, vacancies_tfidf = (
    tfidf_vectorizer.transform(column) for column in processed_columns
)

In [7]:
# Pairwise cosine similarity between every candidate (rows) and every
# company / vacancy (columns).
candidate_company_similarity = cosine_similarity(X=candidates_tfidf, Y=companies_tfidf)
candidate_vacancy_similarity = cosine_similarity(X=candidates_tfidf, Y=vacancies_tfidf)

In [8]:
# Function to get top N matches for each vacancy
def get_top_matches(candidate_vacancy_sim, candidate_company_sim, top_n=3, vacancies=None):
    """Rank candidates for every vacancy by their averaged similarity scores.

    For each vacancy the score of a candidate is the mean of the candidate's
    similarity to the vacancy and the candidate's similarity to the company
    that offers the vacancy.

    Args:
        candidate_vacancy_sim: 2-D array indexed as [candidate, vacancy].
        candidate_company_sim: 2-D array indexed as [candidate, company].
        top_n: Number of candidates to keep per vacancy.
        vacancies: DataFrame with a ``company_id`` column whose i-th row
            describes the i-th column of ``candidate_vacancy_sim``. Defaults to
            the notebook-level ``vacancies_df`` so existing calls keep working.

    Returns:
        dict mapping each vacancy position (column index of
        ``candidate_vacancy_sim``) to a list of candidate row indices, best
        match first.

    Note:
        ``company_id`` is used directly as a column index into
        ``candidate_company_sim``, i.e. it must equal the company's row
        position in the data the company vectors were built from.
    """
    if vacancies is None:
        vacancies = vacancies_df

    # Positional access: vacancy_id below is a matrix column position, not an
    # index label, so the lookup must not depend on the DataFrame's index.
    company_ids = vacancies['company_id'].to_numpy()

    top_matches = {}
    for vacancy_id in range(candidate_vacancy_sim.shape[1]):
        company_id = company_ids[vacancy_id]
        combined_sim_scores = (
            candidate_vacancy_sim[:, vacancy_id] + candidate_company_sim[:, company_id]
        ) / 2.0
        # argsort is ascending; reverse it to get the highest scores first.
        top_candidate_indices = np.argsort(combined_sim_scores)[::-1][:top_n]
        top_matches[vacancy_id] = top_candidate_indices.tolist()

    return top_matches

In [9]:
# Rank candidates for every vacancy and keep the three best per vacancy.
top_3_matches = get_top_matches(
    candidate_vacancy_sim=candidate_vacancy_similarity,
    candidate_company_sim=candidate_company_similarity,
    top_n=3,
)

In [10]:
# Print the raw mapping: vacancy index -> list of the top 3 candidate row indices,
# ordered from best to worst combined similarity.
print(top_3_matches)

{0: [109, 33, 113], 1: [160, 124, 97], 2: [113, 53, 63], 3: [160, 114, 97], 4: [217, 68, 243], 5: [7, 90, 95], 6: [136, 329, 115], 7: [299, 338, 260], 8: [277, 4, 60], 9: [68, 97, 43], 10: [63, 97, 46], 11: [255, 100, 246], 12: [336, 285, 90], 13: [97, 28, 114], 14: [146, 51, 298], 15: [160, 97, 28], 16: [95, 186, 146], 17: [235, 144, 271], 18: [314, 285, 330], 19: [258, 321, 63], 20: [166, 202, 203], 21: [206, 335, 198], 22: [95, 157, 10], 23: [314, 285, 234], 24: [202, 166, 21], 25: [173, 238, 59], 26: [285, 314, 90], 27: [113, 300, 63], 28: [256, 78, 101], 29: [136, 329, 63], 30: [256, 78, 101], 31: [7, 173, 26], 32: [70, 325, 255], 33: [180, 287, 64], 34: [314, 285, 306], 35: [7, 315, 173], 36: [157, 95, 139], 37: [122, 60, 4], 38: [314, 285, 90], 39: [152, 28, 114], 40: [134, 237, 276], 41: [250, 337, 70], 42: [31, 26, 315], 43: [342, 70, 257], 44: [95, 253, 157], 45: [329, 63, 136], 46: [314, 338, 285], 47: [193, 78, 319], 48: [109, 33, 79], 49: [329, 63, 245], 50: [314, 285, 90]

In [11]:
# Function to get truncated detailed information about the top matches for a given number of vacancies
def get_truncated_detailed_matches(top_matches, num_vacancies_to_show=5,
                                   max_description_chars=100,
                                   candidates=None, companies=None, vacancies=None):
    """Build a readable summary of the matches for the first few vacancies.

    Args:
        top_matches: dict mapping a vacancy row position to a list of candidate
            row positions (the structure returned by ``get_top_matches``).
        num_vacancies_to_show: How many entries of ``top_matches`` (in dict
            order) to include.
        max_description_chars: Longest description kept verbatim; longer ones
            are cut to this length and suffixed with ``'...'``.
        candidates: DataFrame with a ``name`` column. Defaults to the
            notebook-level ``candidates_df``.
        companies: DataFrame with a ``name`` column. Defaults to the
            notebook-level ``companies_df``.
        vacancies: DataFrame with ``company_id`` and ``description`` columns.
            Defaults to the notebook-level ``vacancies_df``.

    Returns:
        dict mapping each included vacancy position to a dict with the keys
        ``'Company'``, ``'Vacancy Description'`` and
        ``'Top Matching Candidates'``.

    Note:
        All lookups are positional. ``company_id`` is treated as the company's
        row position in ``companies``, the same way ``get_top_matches`` uses it
        as a column index of the candidate/company similarity matrix.
    """
    if candidates is None:
        candidates = candidates_df
    if companies is None:
        companies = companies_df
    if vacancies is None:
        vacancies = vacancies_df

    truncated_matches = {}

    for vacancy_id, candidate_ids in list(top_matches.items())[:num_vacancies_to_show]:
        # Company that offers the current vacancy
        company_id = vacancies['company_id'].iloc[vacancy_id]
        company_name = companies['name'].iloc[company_id]

        vacancy_description = vacancies['description'].iloc[vacancy_id]
        if not isinstance(vacancy_description, str):
            # Missing description (e.g. NaN from an empty CSV cell)
            vacancy_description = ''
        if len(vacancy_description) > max_description_chars:
            # Only mark the text with an ellipsis when something was actually cut off
            vacancy_description = vacancy_description[:max_description_chars] + '...'

        candidate_names = candidates['name'].iloc[list(candidate_ids)].tolist()

        # Store the truncated information in the dictionary
        truncated_matches[vacancy_id] = {
            'Company': company_name,
            'Vacancy Description': vacancy_description,
            'Top Matching Candidates': candidate_names
        }

    return truncated_matches

# Summarise the matches of the first 5 vacancies (company name, shortened
# description and candidate names instead of bare row indices).
truncated_top_matches = get_truncated_detailed_matches(
    top_matches=top_3_matches,
    num_vacancies_to_show=5,
)

# Leave the summary as the cell's last expression so the notebook renders it.
truncated_top_matches


{0: {'Company': 'Guardian Insurance Solutions',
  'Vacancy Description': 'Job Vacancy: Operations Manager at Guardian Insurance Solutions\n\nWe are currently seeking a highly m...',
  'Top Matching Candidates': ['Isabella Martin',
   'Pavel Ivanov',
   'Ricardo Santos']},
 1: {'Company': 'FabStyle',
  'Vacancy Description': 'We are excited to announce a new opportunity at FabStyle, a leading player in the Fashion and Appare...',
  'Top Matching Candidates': ['Omar Hussein', 'Amir Hassan', 'Muhammad Ali']},
 2: {'Company': 'InsuraCorp',
  'Vacancy Description': 'Job Vacancy: Negotiation Specialist at InsuraCorp\n\nInsuraCorp, a leading insurance company, is seeki...',
  'Top Matching Candidates': ['Ricardo Santos',
   'Jacob Jensen',
   'James Wilson']},
 3: {'Company': 'FabStyle',
  'Vacancy Description': 'FabStyle, a leading fashion and apparel company, is seeking a dynamic and creative individual to joi...',
  'Top Matching Candidates': ['Omar Hussein', 'Sophia Lee', 'Muhammad Ali']}