In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

# Read the candidate records from the CSV file into a DataFrame
df_1 = pd.read_csv(filepath_or_buffer='Dataset.csv')

# Function to extract numerical part from experience
def extract_years(experience):
    """Return the whole number of years at the start of an experience value.

    The value is converted to text and only its first whitespace-separated
    token is inspected. If that token is a plain integer it is returned
    directly; otherwise the integer prefix of the token is used, so
    "5+ years", "10yrs" and "3.5 years" yield 5, 10 and 3.

    Args:
        experience: Any value; it is passed through ``str()`` first.

    Returns:
        The parsed number of years as an int, or 0 when the first token
        does not start with an integer (e.g. empty text, "nan", "None").
    """
    tokens = str(experience).split()
    if not tokens:
        return 0  # Empty or whitespace-only text

    first = tokens[0]
    try:
        return int(first)
    except ValueError:
        # Fall back to the leading integer so suffixes such as "+", "yrs"
        # or a decimal part do not discard the value entirely.
        prefix = re.match(r'[+-]?\d+', first)
        return int(prefix.group()) if prefix else 0

# Preprocess experience
df_1['Experience'] = df_1['Experience'].apply(extract_years)

# Treat missing skill lists as empty text: the vectorizer below expects
# string documents, and the candidate filters call string methods on this column
df_1['Skills'] = df_1['Skills'].fillna('')

# Vectorize Skills using TF-IDF
vectorizer = TfidfVectorizer()
skills_vectors = vectorizer.fit_transform(df_1['Skills'])

# Combine skills vectors with experience
skills_df = pd.DataFrame(skills_vectors.toarray())
# Attach by position so the values line up row-for-row with the TF-IDF
# matrix regardless of df_1's index labels
skills_df['Experience'] = df_1['Experience'].to_numpy()

# Ensure all columns are numeric and have string names
X = skills_df.astype(float)
X.columns = X.columns.astype(str)

# Standardize the data before DBSCAN
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=2)  # Adjust 'eps' and 'min_samples' for your data
df_1['Cluster'] = dbscan.fit_predict(X_scaled)

# Function to process experience input
def process_experience(exp_input):
    """Parse an experience requirement such as "at least 3 years".

    Matching is case-insensitive. "exactly" is accepted as a synonym for
    "exact", and any amount of whitespace may separate "at" from
    "least"/"most".

    Args:
        exp_input: The requirement text; it is converted with ``str()``.

    Returns:
        A ``(condition, years)`` tuple where ``condition`` is one of
        'at least', 'at most' or 'exact' and ``years`` is an int.

    Raises:
        ValueError: If no supported condition followed by a number is found.
    """
    found = re.search(
        r'(at\s+least|at\s+most|exact)(?:ly)?\s*(\d+)',
        str(exp_input).lower(),
    )
    if found is None:
        raise ValueError("Invalid experience input format. Please specify 'at least', 'at most', or 'exact'.")

    # Collapse internal whitespace so callers always see the canonical spelling.
    condition = ' '.join(found.group(1).split())
    return condition, int(found.group(2))

# Function to filter candidates based on skills and experience criteria
def filter_candidates(cluster_candidates, skills_required, experience_condition, experience_years):
    """Keep candidates that have every required skill and meet the experience rule.

    Skill comparison is case-insensitive and ignores whitespace around each
    comma-separated entry, on both the candidate side and the required side.
    A candidate whose 'Skills' value is not a string is treated as having
    no skills.

    Args:
        cluster_candidates: DataFrame with 'Name', 'Skills' and 'Experience'
            columns; 'Skills' holds comma-separated skill names.
        skills_required: Iterable of skill names that must all be present.
        experience_condition: 'at least', 'at most' or 'exact'. Any other
            value leaves the experience column unfiltered.
        experience_years: Number of years compared against 'Experience'.

    Returns:
        DataFrame with the 'Name', 'Skills' and 'Experience' columns of the
        rows that pass both filters.
    """
    # Normalise the requirement once instead of once per candidate.
    wanted = {str(skill).strip().lower() for skill in skills_required}

    def has_all_skills(candidate_skills):
        if isinstance(candidate_skills, str):
            owned = {part.strip().lower() for part in candidate_skills.split(',')}
        else:
            owned = set()  # Missing / non-text skills
        return wanted <= owned

    # Filter based on skills: keep only candidates with all required skills.
    # astype(bool) keeps the mask boolean even when the frame is empty, so it
    # is always applied as a row filter.
    skill_mask = cluster_candidates['Skills'].apply(has_all_skills).astype(bool)
    filtered_candidates = cluster_candidates.loc[skill_mask]

    # Apply strict experience condition
    if experience_condition == 'at least':
        filtered_candidates = filtered_candidates.loc[filtered_candidates['Experience'] >= experience_years]
    elif experience_condition == 'at most':
        filtered_candidates = filtered_candidates.loc[filtered_candidates['Experience'] <= experience_years]
    elif experience_condition == 'exact':
        filtered_candidates = filtered_candidates.loc[filtered_candidates['Experience'] == experience_years]

    return filtered_candidates[['Name', 'Skills', 'Experience']]

# Function to match candidates based on skills and experience
def match_candidates(skills_required, exp_input, df_1, vectorizer):
    """Return candidates similar to the query that satisfy skills and experience.

    The query (required skills plus requested years) is turned into a
    feature row laid out like the module-level matrix ``X``, compared with
    every row of ``X`` by cosine similarity, and candidates scoring above
    0.5 are then passed through ``filter_candidates``.

    Args:
        skills_required: Iterable of skill names that must all be present;
            they are stripped and lower-cased before use.
        exp_input: Requirement text understood by ``process_experience``.
        df_1: Candidate DataFrame with 'Name', 'Skills' and 'Experience'
            columns. Its rows must be in the same order as the rows of ``X``.
        vectorizer: The fitted vectorizer that produced the skill columns
            of ``X``.

    Returns:
        DataFrame with the 'Name', 'Skills' and 'Experience' columns of the
        matching candidates (possibly empty).

    Raises:
        ValueError: Propagated from ``process_experience`` for an
            unrecognised experience requirement.
    """
    # Process experience input
    experience_condition, experience_years = process_experience(exp_input)

    # Normalise the requested skills so matching does not depend on the
    # caller having already lower-cased / stripped them.
    skills_required = [str(skill).strip().lower() for skill in skills_required]

    # Vectorize query skills
    query_vector = vectorizer.transform([', '.join(skills_required)]).toarray()

    # Build a one-row frame with the same layout as X: skill columns first,
    # then the experience column.
    query_df = pd.DataFrame(query_vector, columns=X.columns[:-1])
    query_df['Experience'] = experience_years

    # NOTE(review): X is the unscaled feature matrix, so the raw 'Experience'
    # value probably carries far more weight in this similarity than the
    # TF-IDF skill columns - confirm that this is intended.
    similarities = cosine_similarity(query_df.to_numpy(), X)

    # Find candidates with similarity greater than a threshold (0.5)
    similar_candidates = df_1[similarities.flatten() > 0.5]
    if similar_candidates.empty:
        # Nothing left to filter; return the expected columns directly.
        return similar_candidates[['Name', 'Skills', 'Experience']]

    # Skill and experience filtering is shared with filter_candidates so the
    # two code paths cannot drift apart.
    return filter_candidates(
        similar_candidates, skills_required, experience_condition, experience_years
    )

# Get inputs from the user
skills_input = input("Enter required skills (comma-separated): ")
experience_input = input("Enter experience requirement (e.g., 'at least 3 years'): ")

# Convert skills input to a list, dropping blank entries left by trailing or
# doubled commas (a blank "skill" would otherwise never match any candidate)
skills_required = [
    skill.strip().lower() for skill in skills_input.split(',') if skill.strip()
]
if not skills_required:
    # Fail loudly, in the same way an unparseable experience requirement does,
    # rather than reporting that no candidate matched.
    raise ValueError("No skills provided. Please enter at least one skill.")

# Find matching candidates
matching_candidates = match_candidates(skills_required, experience_input, df_1, vectorizer)

# Display matching candidates or a message if none are found
if matching_candidates.empty:
    print("No matching candidates found based on the provided skills and experience.")
else:
    print("Matching Candidates:")
    print(matching_candidates)


Enter required skills (comma-separated): python, sql
Enter experience requirement (e.g., 'at least 3 years'): at least 4 years
Matching Candidates:
                   Name                                             Skills  \
63    Christopher Evans          Big Data, SQL, Python, Hadoop, Spark, ETL   
133       Aiko Nakamura                 Big Data, SQL, Python, Hadoop, ETL   
200         Amara Nkosi  Python, R, SQL, Machine Learning, Data Mining,...   
203        Fatima Abbas  Python, SQL, Big Data, Data Pipelines, Cloud P...   
210          Sophia Kim  SQL, R, Python, Data Visualization, Data Wrang...   
...                 ...                                                ...   
1265      Jack Martinez  Java, SQL, Python, RESTful APIs, Spring, Hiber...   
1322        Amadou Diop            Python, Flask, React, AWS, GraphQL, SQL   
1330    Carla Fernandez   Tableau, Power BI, SQL, Python, Data Warehousing   
1334    Jamarcus Wilson          SQL, Power BI, Python, ETL, Data Cleani