In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


# Download necessary NLTK packages
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
stop_words = set(nltk.corpus.stopwords.words('english'))

### Resume

In [None]:
df_resumes = pd.read_csv('../PreProcessingJobs/processed_data/Resume_proc_lemm.csv')
df_resumes.head()

Unnamed: 0,ID,Resume_str,Category
0,16852973,hr administrator marketing associate hr admini...,HR
1,22323967,hr specialist u hr operation summary versatile...,HR
2,33176873,hr director summary year experience recruiting...,HR
3,27018550,hr specialist summary dedicated driven dynamic...,HR
4,17812897,hr manager skill highlight hr skill hr departm...,HR


In [4]:
df_resumes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2483 entries, 0 to 2482
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2483 non-null   int64 
 1   Resume_str  2483 non-null   object
 2   Category    2483 non-null   object
dtypes: int64(1), object(2)
memory usage: 58.3+ KB


### Job Description

In [5]:
df_jobs = pd.read_csv('PreProcessing Jobs\processed_data\Job_proc_lemm.csv')
df_jobs.head()

Unnamed: 0,job_id,title,description,skills_desc,word_count
0,2147609754,Workflow Coordinator Hospitality - Mon - Fri 8...,williams lea hiring workflow coordinator hospi...,,520
1,2914254129,Director of Operations,director operationsalliance strategic growth i...,,512
2,2920450495,Service Coordinator,qualificationsexperience data entry year requi...,,513
3,3189117072,Client Service Associate / Practice Manager,company overviewsignature estate investment ad...,,517
4,3190494363,Architectural Designer,company overview join dynamic team florida lif...,,510


In [6]:
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7821 entries, 0 to 7820
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job_id       7821 non-null   int64 
 1   title        7821 non-null   object
 2   description  7821 non-null   object
 3   skills_desc  175 non-null    object
 4   word_count   7821 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 305.6+ KB


### Similarity

In [7]:
tfidf_vect = TfidfVectorizer(
    lowercase=True,
    binary=False,
    max_features=10000,
    stop_words=list(stop_words),   # come prima, valuta se serve
    ngram_range=(1, 6),            # unigrams fino a 6-grams
    max_df=0.8,
    min_df=2
)

In [8]:
cv_texts = df_resumes['Resume_str'].tolist()
job_texts = df_jobs['description'].tolist()

In [9]:
corpus = cv_texts + job_texts

In [10]:
tfidf_matrix = tfidf_vect.fit_transform(corpus)

In [11]:
# Dividi la matrice
cv_vectors = tfidf_matrix[:len(cv_texts)]
job_vectors = tfidf_matrix[len(cv_texts):]

In [12]:
top_n = 3
knn = NearestNeighbors(n_neighbors=top_n, metric='cosine')
knn.fit(cv_vectors)

In [13]:
distances, indices = knn.kneighbors(job_vectors)

In [14]:
results = []
for job_idx, (dist_list, idx_list) in enumerate(zip(distances, indices)):
    job_real_id = df_jobs.iloc[job_idx]['job_id']
    job_title = df_jobs.iloc[job_idx]['title']

    for rank, (cv_idx, dist) in enumerate(zip(idx_list, dist_list)):
        similarity = 1 - dist  # sim = 1 - distanza
        cv_real_id = df_resumes.iloc[cv_idx]['ID']
        cv_category = df_resumes.iloc[cv_idx]['Category']
        
        results.append({
            'job_id': job_real_id,
            'job_title': job_title,
            'cv_id': cv_real_id,
            'cv_category': cv_category,
            'similarity_score': similarity,
            'rank': rank + 1
        })

df_matches = pd.DataFrame(results)
print(df_matches.head(10))

       job_id                                          job_title     cv_id  \
0  2147609754  Workflow Coordinator Hospitality - Mon - Fri 8...  11842274   
1  2147609754  Workflow Coordinator Hospitality - Mon - Fri 8...  11835339   
2  2147609754  Workflow Coordinator Hospitality - Mon - Fri 8...  34252537   
3  2914254129                             Director of Operations  28862054   
4  2914254129                             Director of Operations  11835339   
5  2914254129                             Director of Operations  19616406   
6  2920450495                                Service Coordinator  17095812   
7  2920450495                                Service Coordinator  18932512   
8  2920450495                                Service Coordinator  19938081   
9  3189117072        Client Service Associate / Practice Manager  18824120   

            cv_category  similarity_score  rank  
0      PUBLIC-RELATIONS          0.210897     1  
1            CONSULTANT          0.207803