In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages into the local nltk_data directory; each call
# logs its status to the cell output.
# NOTE(review): only 'stopwords' is referenced in the cells visible here; the
# other packages are presumably needed by the preprocessing steps -- confirm
# and drop any that are unused.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
# English stop words from NLTK, held in a set (duplicates removed, fast membership checks).
english_stopword_list = nltk.corpus.stopwords.words('english')
stop_words = set(english_stopword_list)

### Resume

In [3]:
# Load the processed resume dataset (lemmatized, judging by the file name)
# and preview its first rows.
resumes_csv_path = 'PreProcessing/processed_data/Resume_proc_lemm.csv'
df_resumes = pd.read_csv(resumes_csv_path)
df_resumes.head()

Unnamed: 0,ID,Resume_str,Category
0,16852973,hr administrator marketing associate hr admini...,HR
1,22323967,hr specialist u hr operation summary versatile...,HR
2,33176873,hr director summary year experience recruiting...,HR
3,27018550,hr specialist summary dedicated driven dynamic...,HR
4,17812897,hr manager skill highlight hr skill hr departm...,HR


In [4]:
# Print column dtypes, non-null counts and memory usage of the resume frame
# to check for missing values before vectorising.
df_resumes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2483 entries, 0 to 2482
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2483 non-null   int64 
 1   Resume_str  2483 non-null   object
 2   Category    2483 non-null   object
dtypes: int64(1), object(2)
memory usage: 58.3+ KB


### Job Description

In [5]:
# Load the processed job-posting dataset and preview its first rows.
# The path uses forward slashes: the previous backslash-separated literal
# relied on "\p" and "\J", which are invalid string escape sequences (Python
# warns about them), and it only resolved on Windows. Forward slashes work on
# every platform and match the resume path used earlier in the notebook.
df_jobs = pd.read_csv('PreProcessing Jobs/processed_data/Job_proc_lemm.csv')
df_jobs.head()

Unnamed: 0,job_id,title,description,skills_desc,word_count
0,23221523,Senior Elder Law / Trusts and Estates Associat...,senior associate attorney elder law trust esta...,This position requires a baseline understandin...,209
1,2264355,Worship Leader,exciting time part church looking right energe...,"Knowledge, Skills and Abilities: 1. Proficient...",231
2,175485704,Software Engineer,job description goyt seeking skilled motivated...,,223
3,231010577,Service / Construction Technician,company descriptionpierce refrigeration full s...,,234
4,263583866,Registered Nurse,currently need following team member grayling ...,,215


In [6]:
# Print column dtypes, non-null counts and memory usage of the job frame
# to check for missing values before vectorising.
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7382 entries, 0 to 7381
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job_id       7382 non-null   int64 
 1   title        7382 non-null   object
 2   description  7382 non-null   object
 3   skills_desc  215 non-null    object
 4   word_count   7382 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 288.5+ KB


### Similarity

In [7]:
# TF-IDF vectoriser settings, gathered in one mapping so they are easy to tune.
tfidf_settings = {
    'lowercase': True,
    'binary': False,
    'max_features': 10000,
    'stop_words': list(stop_words),  # as before; evaluate whether it is still needed
    'ngram_range': (1, 6),           # unigrams up to 6-grams
    'max_df': 0.8,
    'min_df': 2,
}
tfidf_vect = TfidfVectorizer(**tfidf_settings)

In [8]:
# Plain Python lists of the text to vectorise: one entry per resume / per job posting.
cv_texts = list(df_resumes['Resume_str'])
job_texts = list(df_jobs['description'])

In [9]:
# Single corpus with the resumes first and the job descriptions after them;
# the first len(cv_texts) entries are therefore the resumes.
corpus = [*cv_texts, *job_texts]

In [10]:
# Fit the vectoriser on the combined corpus and transform it in one call, so
# resumes and job descriptions are represented in the same feature space.
tfidf_matrix = tfidf_vect.fit_transform(corpus)

In [11]:
# Split the matrix: the first len(cv_texts) rows belong to the resumes,
# the remaining rows to the job descriptions.
n_cvs = len(cv_texts)
cv_vectors, job_vectors = tfidf_matrix[:n_cvs], tfidf_matrix[n_cvs:]

# Compute the similarity between every job description and every resume;
# jobs are passed first, so each row of the result corresponds to one job.
similarity_matrix = cosine_similarity(job_vectors, cv_vectors)

In [None]:
top_n = 3  # change this to keep more/fewer matches per job

# Pull the needed columns out once as arrays. Looking values up with
# df.iloc[i]['col'] inside the loops builds a complete row Series (including
# the long text column) for every single scalar, which is needlessly slow.
job_ids = df_jobs['job_id'].to_numpy()        # real job IDs, by row position
job_titles = df_jobs['title'].to_numpy()
cv_ids = df_resumes['ID'].to_numpy()          # real resume IDs, by row position
cv_categories = df_resumes['Category'].to_numpy()

# One record per (job, matched resume) pair; the frame is built once at the end.
results = []
for job_idx, similarities in enumerate(similarity_matrix):
    # Positions of the top_n highest-scoring resumes for this job, best first.
    top_matches = similarities.argsort()[::-1][:top_n]

    for rank, cv_idx in enumerate(top_matches, start=1):
        results.append({
            'job_id': job_ids[job_idx],
            'job_title': job_titles[job_idx],
            'cv_id': cv_ids[cv_idx],
            'cv_category': cv_categories[cv_idx],
            'similarity_score': similarities[cv_idx],
            'rank': rank,  # 1 = best match
        })

df_matches = pd.DataFrame(results)
df_matches.head(10)


<bound method NDFrame.head of            job_id                                          job_title  \
0        23221523  Senior Elder Law / Trusts and Estates Associat...   
1        23221523  Senior Elder Law / Trusts and Estates Associat...   
2        23221523  Senior Elder Law / Trusts and Estates Associat...   
3         2264355                                     Worship Leader   
4         2264355                                     Worship Leader   
...           ...                                                ...   
22141  3906265348                  Research And Development Engineer   
22142  3906265348                  Research And Development Engineer   
22143  3906266272                                   Quality Engineer   
22144  3906266272                                   Quality Engineer   
22145  3906266272                                   Quality Engineer   

          cv_id   cv_category  similarity_score  rank  
0      26098594       APPAREL          0.228161  