In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# Load the pre-processed resume dataset from CSV.
df = pd.read_csv("../PreProcessing/processed_data/Resume_proc_lemm.csv")  # change to your own file name

# The resume text is the model input; the category column is converted to
# integer codes so it can be used as the classification target.
texts = df["Resume_str"].tolist()
category_column = df["Category"].astype("category")
labels = category_column.cat.codes

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Load the pre-trained BERT tokenizer and encoder, then put the encoder in
# evaluation mode since it is only used for feature extraction here.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
bert_model.eval()

# Helper that returns the BERT embedding of the [CLS] token
def get_bert_embedding(text, max_length=512):
    """Encode ``text`` with BERT and return the last-layer [CLS] embedding.

    Relies on the module-level ``tokenizer`` and ``bert_model`` objects.

    Parameters
    ----------
    text : str or list of str
        Text to encode. Inputs longer than ``max_length`` tokens are
        truncated by the tokenizer.
    max_length : int, optional
        Maximum number of tokens passed to the tokenizer. Defaults to 512,
        the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        The last hidden state at sequence position 0, with size-1 axes
        squeezed out (so a single text yields a 1-D vector).
    """
    with torch.no_grad():
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        # Run the inputs on whatever device the model currently lives on, so
        # the helper keeps working if bert_model is moved to an accelerator.
        device = next(bert_model.parameters()).device
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        outputs = bert_model(**inputs)
        # Position 0 of every sequence is the [CLS] token.
        cls_state = outputs.last_hidden_state[:, 0, :]
        # .cpu() is a no-op for CPU tensors and is required before .numpy()
        # when the tensor is on another device.
        cls_embedding = cls_state.squeeze().cpu().numpy()
    return cls_embedding

# Extract one BERT embedding per document for both splits
print("Estrazione embeddings da BERT...")
X_train_embed = np.array([get_bert_embedding(text) for text in tqdm(X_train)])
X_test_embed = np.array([get_bert_embedding(text) for text in tqdm(X_test)])

# Random Forest classifier trained on the embeddings
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_embed, y_train)

# Prediction and evaluation
y_pred = clf.predict(X_test_embed)

print("\nAccuracy:", accuracy_score(y_test, y_pred))

# Category names in code order: integer code i corresponds to category_names[i].
category_names = df["Category"].astype("category").cat.categories
print(
    "\nClassification Report:\n",
    classification_report(
        y_test,
        y_pred,
        # Pass the full label set explicitly: without it, a category missing
        # from both y_test and y_pred makes the label count differ from
        # len(target_names) and classification_report raises.
        labels=np.arange(len(category_names)),
        target_names=category_names,
        # Classes with no predicted samples still report 0.0, but without
        # emitting an undefined-metric warning for each one.
        zero_division=0,
    ),
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Estrazione embeddings da BERT...


100%|██████████| 1986/1986 [23:06<00:00,  1.43it/s]  
100%|██████████| 497/497 [04:07<00:00,  2.01it/s]



Accuracy: 0.4386317907444668

Classification Report:
                         precision    recall  f1-score   support

            ACCOUNTANT       0.59      0.79      0.68        28
              ADVOCATE       0.47      0.30      0.37        30
           AGRICULTURE       0.00      0.00      0.00        10
               APPAREL       0.47      0.50      0.48        14
                  ARTS       0.23      0.28      0.25        18
            AUTOMOBILE       0.00      0.00      0.00         5
              AVIATION       0.58      0.75      0.65        24
               BANKING       0.22      0.31      0.26        16
                   BPO       0.00      0.00      0.00         3
  BUSINESS-DEVELOPMENT       0.35      0.48      0.41        23
                  CHEF       0.65      0.77      0.71        31
          CONSTRUCTION       0.76      0.41      0.53        32
            CONSULTANT       0.08      0.04      0.06        23
              DESIGNER       0.39      0.37     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
