In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK data packages this notebook relies on.
# nltk.download() is called once per package; the final call is left as the
# cell's last expression, so its return value is what the cell echoes.
# NOTE(review): only 'stopwords' is read by the cells visible here
# (via nltk.corpus.stopwords); the other packages are presumably needed by
# preprocessing done elsewhere — confirm before pruning the list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
# English stop words from the NLTK corpus, collected into a set (duplicates
# collapse, membership tests are cheap).
stop_words = set()
stop_words.update(nltk.corpus.stopwords.words('english'))

### Resume

In [3]:
# Read the processed resume CSV into a DataFrame and preview its first rows.
df_resumes = pd.read_csv(
    filepath_or_buffer='PreProcessing/processed_data/Resume_proc_lemm.csv',
)
df_resumes.head()

Unnamed: 0,ID,Resume_str,Category
0,16852973,hr administrator marketing associate hr admini...,HR
1,22323967,hr specialist u hr operation summary versatile...,HR
2,33176873,hr director summary year experience recruiting...,HR
3,27018550,hr specialist summary dedicated driven dynamic...,HR
4,17812897,hr manager skill highlight hr skill hr departm...,HR


In [4]:
# Sanity check on the resume table: column names, dtypes and non-null counts.
df_resumes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2483 entries, 0 to 2482
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          2483 non-null   int64 
 1   Resume_str  2483 non-null   object
 2   Category    2483 non-null   object
dtypes: int64(1), object(2)
memory usage: 58.3+ KB


### Job Description

In [5]:
# Read the processed job-postings CSV into a DataFrame and preview its first rows.
# The path uses forward slashes on purpose: in a normal string literal "\p" and
# "\J" are invalid escape sequences (Python emits a warning for them), and a
# backslash-separated path only resolves on Windows, while "/" is accepted on
# every platform. This also matches how the resume CSV path is written.
df_jobs = pd.read_csv('PreProcessing Jobs/processed_data/Job_proc_lemm.csv')
df_jobs.head()

Unnamed: 0,job_id,title,description,skills_desc,word_count
0,921716,Marketing Coordinator,job descriptiona leading real estate firm new ...,Requirements: \n\nWe are seeking a College or ...,358
1,1829192,Mental Health Therapist/Counselor,aspen therapy wellness committed serving clien...,,492
2,91700727,Economic Development and Planning Intern,job summary economic development planning inte...,,578
3,95428182,Administrative Coordinator,job title administrative coordinatororganizati...,,555
4,111513530,"Content Writer, Communications",application opening date april title content w...,,410


In [6]:
# Sanity check on the job table: column names, dtypes and non-null counts.
# Only the 'description' column is used by the similarity cells visible here.
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49469 entries, 0 to 49468
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job_id       49469 non-null  int64 
 1   title        49469 non-null  object
 2   description  49469 non-null  object
 3   skills_desc  1137 non-null   object
 4   word_count   49469 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.9+ MB


### Similarity

In [7]:
# TF-IDF vectorizer applied to both resumes and job descriptions.
tfidf_vect = TfidfVectorizer(
    # --- text normalisation ---
    lowercase=True,
    stop_words=list(stop_words),   # same list as before; TODO: check whether it is still needed
    # --- term extraction ---
    ngram_range=(1, 6),            # from unigrams up to 6-grams
    binary=False,                  # keep term counts rather than 0/1 presence flags
    # --- vocabulary pruning ---
    min_df=2,                      # lower document-frequency bound
    max_df=0.8,                    # upper document-frequency bound
    max_features=10000,            # cap on vocabulary size
)

In [8]:
# Plain Python lists of the raw text columns: one string per resume / per job.
cv_texts = df_resumes.loc[:, 'Resume_str'].tolist()
job_texts = df_jobs.loc[:, 'description'].tolist()

In [9]:
# Single corpus: all resumes first, followed by all job descriptions.
# The order matters because later cells split the matrix by len(cv_texts).
corpus = [*cv_texts, *job_texts]

In [None]:
# Fit the vectorizer once on the combined corpus and transform it in the same
# call, so resumes and job descriptions are encoded by the same fitted
# vectorizer. Row order follows `corpus`: resumes first, then jobs.
tfidf_matrix = tfidf_vect.fit_transform(corpus)

In [None]:
# Split the stacked TF-IDF matrix back into its two parts: the first
# len(cv_texts) rows belong to the resumes, every row after that to the jobs.
cv_vectors, job_vectors = (
    tfidf_matrix[:len(cv_texts)],
    tfidf_matrix[len(cv_texts):],
)

# Cosine similarity of each job description (rows) against each CV (columns).
similarity_matrix = cosine_similarity(X=job_vectors, Y=cv_vectors)

In [None]:
# Build a long-format table with, for every job, its top_n best-matching CVs.
results = []
top_n = 10  # change this to keep more/fewer matches per job

# Pull both ID columns out once, before looping. The earlier
# df.iloc[i]['col'] lookups materialised an entire row as a Series on every
# pass of the nested loop just to read one value from it.
job_ids = df_jobs['job_id'].to_numpy()
cv_ids = df_resumes['ID'].to_numpy()

for job_idx, similarities in enumerate(similarity_matrix):
    # Column positions of the top_n highest scores in this row, best first.
    top_matches = similarities.argsort()[::-1][:top_n]

    job_real_id = job_ids[job_idx]  # the job's actual ID, not its row position

    for rank, cv_idx in enumerate(top_matches):
        cv_real_id = cv_ids[cv_idx]  # the CV's actual ID, not its row position

        results.append({
            'job_id': job_real_id,
            'cv_id': cv_real_id,
            'similarity_score': similarities[cv_idx],
            'rank': rank + 1,  # 1-based rank within this job
        })

df_matches = pd.DataFrame(results)
# Leave the preview as the cell's last expression so the notebook renders it
# as a table instead of printing plain text.
df_matches.head(10)
