In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from time import time
from nltk.tokenize import word_tokenize
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
from imblearn.pipeline import Pipeline
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Download necessary NLTK packages (the 'punkt_tab' data package).
# nltk.download() reports most failures by returning False instead of raising,
# so the return value is checked and routed through the same error message
# that a raised exception would produce.
try:
    if not nltk.download('punkt_tab'):
        raise RuntimeError("nltk.download('punkt_tab') returned False")
except Exception as e:
    print(f"Errore durante il download dei pacchetti NLTK: {e}")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Load the processed resume dataset from its CSV file.
resume_csv_path = "../PreProcessingResumes/processed_data/Resume_proc_lemm.csv"
df = pd.read_csv(resume_csv_path)

In [3]:
def preprocess(text):
    """Split a text into word tokens.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize``. An empty list is returned when
        ``text`` is not a string (e.g. a missing CSV cell read as NaN/None).
    """
    # word_tokenize raises on non-string input, so a single missing cell
    # would otherwise abort the whole column-wise apply.
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# Tokenize every resume; each row of 'tokens' holds that resume's token list.
df['tokens'] = df['Resume_str'].map(preprocess)

In [4]:
# Train Word2Vec on the tokenized resumes.
# seed + workers=1 make training repeatable, in line with the fixed
# random_state used by the other stochastic steps of this notebook.
# NOTE: per the gensim documentation, reproducibility across interpreter
# launches additionally requires a fixed PYTHONHASHSEED environment variable.
w2v_model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=2,
    epochs=10,
    seed=42,
    workers=1,
)

In [5]:
# Report how many words the trained model holds in its vocabulary.
vocabulary_size = len(w2v_model.wv.index_to_key)
print(f"Numero di parole nel vocabolario Word2Vec: {vocabulary_size}")

Numero di parole nel vocabolario Word2Vec: 19857


In [6]:
# Build a fixed-length document vector by concatenating the word vectors of
# the document's in-vocabulary tokens.
def get_word2vec_vector(tokens, model, vector_size=10, max_length=100):
    """Concatenate word vectors into one fixed-length document vector.

    The vectors of the tokens found in ``model.wv`` are laid out one after
    another, in document order. The result is truncated or zero-padded so
    that it always has exactly ``vector_size * max_length`` elements.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of the document.
    model : object
        Trained model exposing ``model.wv`` with ``in`` and ``[]`` lookups.
    vector_size : int, default 10
        Nominal dimensionality of one word vector. It should match the
        model's vector size; the output length is ``vector_size * max_length``
        even when it does not.
    max_length : int, default 100
        Number of word-vector slots kept per document.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``vector_size * max_length``.
    """
    target_len = vector_size * max_length
    # Pre-allocated output: whatever is not overwritten below stays zero,
    # which covers both the "no known words" case and the padding.
    document_vector = np.zeros(target_len)

    filled = 0
    for word in tokens:
        # Stop as soon as the buffer is full: later tokens would be truncated
        # anyway, so there is no point in looking them up.
        if filled >= target_len:
            break
        if word in model.wv:
            word_vector = np.asarray(model.wv[word]).ravel()
            # Clip the last vector if it would overflow the target length.
            take = min(word_vector.size, target_len - filled)
            document_vector[filled:filled + take] = word_vector[:take]
            filled += take

    return document_vector

# One fixed-length vector per resume: 500 word slots of 100 dimensions each.
df['vector'] = df['tokens'].map(
    lambda doc_tokens: get_word2vec_vector(doc_tokens, w2v_model, vector_size=100, max_length=500)
)

In [7]:
# Feature matrix: one row per resume, built by stacking the per-document vectors.
X = np.vstack(df['vector'].to_list())
# Target labels: the 'Category' column.
y = df['Category']

In [8]:
# Hold out 20% of the samples as a test set, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Print the sizes of the split datasets.
# The second split is the held-out test set (X_test), so it is labelled as
# such rather than as "validation".
print("Train data size:", X_train.shape)
print("Test data size:", X_test.shape)

Train data size: (1986, 50000)
Validation data size: (497, 50000)


## Random Forest Classifier

In [10]:
def report(results, n_top=5):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Search results exposing ``rank_test_score``, ``mean_test_score``,
        ``std_test_score`` and ``params`` (as in ``cv_results_``).
    n_top : int, default 5
        Highest rank to print; ranks 1..n_top are reported.
    """
    for rank in range(1, n_top + 1):
        # Several candidates can share one rank, so each match is printed.
        matching = np.flatnonzero(results["rank_test_score"] == rank)
        for idx in matching:
            print(f"Model with rank: {rank}")
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][idx]}")
            print("")

In [11]:
# Stratified 5-fold splitter (shuffled, fixed seed) for cross-validation.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [12]:
# Number of training samples per class.
class_counts = y_train.value_counts()

# Classes whose sample count is below the mean class count.
below_mean = class_counts < class_counts.mean()
min_classes = class_counts.loc[below_mean].index.tolist()

print(f"Applicazione di SMOTE alle classi: {min_classes}")

# Map each of those classes to the size of the largest class.
target_size = int(class_counts.max())
sampling_strategy = dict.fromkeys(min_classes, target_size)

Applicazione di SMOTE alle classi: ['ARTS', 'TEACHER', 'APPAREL', 'DIGITAL-MEDIA', 'AGRICULTURE', 'AUTOMOBILE', 'BPO']


In [13]:
# Model pipeline: oversample -> standardize -> PCA (95% explained variance)
# -> Random Forest.
# SMOTE receives the per-class targets computed in the previous cell
# (`sampling_strategy`); previously that dictionary was built and announced
# but never passed, so SMOTE silently fell back to its default strategy.
pipeline = Pipeline(steps=[
    ('smote', SMOTE(sampling_strategy=sampling_strategy, random_state=42)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

In [None]:
# Hyper-parameter grid for the Random Forest step ('clf') of the pipeline.
param_grid = {
    'clf__n_estimators': [400, 700],
    'clf__max_features': ['sqrt', 'log2'],
    'clf__max_depth' : [8, 9, 10],
    'clf__criterion' : ['gini', 'entropy'],
    'clf__min_samples_split': [10],
    'clf__min_samples_leaf': [15]
}

# StratifiedKFold keeps the class imbalance in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV with internal cross-validation over the whole pipeline.
# The splitter defined just above is the one passed in; it used to be created
# but left unused while an identically configured splitter was passed instead.
# NOTE(review): both the forest and the grid search request n_jobs=-1;
# consider limiting one of them if CPU oversubscription or memory is an issue.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='f1_weighted',
    return_train_score=True,  # train scores are needed for the heatmaps
    n_jobs=-1
)

start = time()
grid_search = grid.fit(X_train, y_train)
elapsed = time() - start

print(f"GridSearchCV took {elapsed:.2f} seconds for {len(grid_search.cv_results_['params'])} candidates parameter settings.\n")
report(grid_search.cv_results_)

In [None]:
# Show the best hyper-parameter combination found by the grid search.
best_params = grid_search.best_params_
print(best_params)

In [None]:
# DataFrame with the grid-search results.
results = pd.DataFrame(grid_search.cv_results_)
param_cols = [col for col in results.columns if col.startswith("param_")]

# Convert every parameter to string for the pivot table.
for p in param_cols:
    results[p] = results[p].astype(str)


def _ordered_labels(labels):
    """Sort axis labels numerically when they all parse as numbers.

    Parameters were converted to strings above, so a plain sort would order
    '10' before '8'. Labels that are not numeric fall back to string order.
    """
    labels = list(labels)
    try:
        return sorted(labels, key=float)
    except ValueError:
        return sorted(labels)


# One pair of heatmaps (train vs. validation) for every pair of parameters.
for p1, p2 in combinations(param_cols, 2):
    try:
        pivot_val = results.pivot_table(values='mean_test_score', index=p1, columns=p2)
        pivot_train = results.pivot_table(values='mean_train_score', index=p1, columns=p2)

        # A heatmap is only informative when both parameters actually vary.
        if pivot_val.shape[0] > 1 and pivot_val.shape[1] > 1:
            # Same numeric-aware ordering for both panels.
            row_order = _ordered_labels(pivot_val.index)
            col_order = _ordered_labels(pivot_val.columns)
            pivot_val = pivot_val.reindex(index=row_order, columns=col_order)
            pivot_train = pivot_train.reindex(index=row_order, columns=col_order)

            fig, axes = plt.subplots(1, 2, figsize=(10, 4))

            sns.heatmap(pivot_train, annot=True, fmt=".3f", cmap="Blues", ax=axes[0])
            axes[0].set_title("Train Score")
            axes[0].set_xlabel(p2.replace("param_", ""))
            axes[0].set_ylabel(p1.replace("param_", ""))

            sns.heatmap(pivot_val, annot=True, fmt=".3f", cmap="Greens", ax=axes[1])
            axes[1].set_title("Validation Score")
            axes[1].set_xlabel(p2.replace("param_", ""))
            axes[1].set_ylabel("")

            plt.tight_layout()
            plt.show()
    except Exception as e:
        # Best effort: a failing pair is reported and the remaining pairs
        # are still plotted.
        print(f"Errore con {p1} e {p2}: {e}")

In [None]:
# Take the best pipeline found by the grid search and predict the labels of
# the held-out test set; both names are reused by the evaluation cells below.
RF_Model_best = grid_search.best_estimator_
prediction = RF_Model_best.predict(X_test)

In [None]:
# Score of the selected pipeline on the training split, then on the test split.
train_score = RF_Model_best.score(X_train, y_train)
print("Training Score: {:.2f}".format(train_score))
test_score = RF_Model_best.score(X_test, y_test)
print("Test Score: {:.2f}".format(test_score))

In [None]:
# Per-class metrics of the test-set predictions.
class_report = classification_report(y_test, prediction)
print(class_report)

In [None]:
# Confusion matrix of the test-set predictions.
# from_predictions() draws on a new figure of its own unless an Axes is
# supplied, so the Axes is created explicitly: this makes figsize take effect
# and avoids leaving an empty figure behind.
fig, ax = plt.subplots(figsize=(15, 10))  # Set the figure size
ConfusionMatrixDisplay.from_predictions(
    y_test,
    prediction,
    ax=ax,
    xticks_rotation="vertical",  # same effect as rotating the x ticks by 90 degrees
)
plt.show()