In [None]:

import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from torch.utils.data import Dataset
import numpy as np
import ast
# Read the training table; each `categories` cell is a Python literal stored
# as text, so it is parsed back into an object with ast.literal_eval.
df = pd.read_csv("/kaggle/input/train-notokenize/text_with_categories_train.csv")  # Replace with your actual CSV path
df['categories'] = df['categories'].map(ast.literal_eval)

# Turn the per-row category collections into a multi-hot indicator matrix
# with one column per distinct category.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['categories'])
num_labels = len(mlb.classes_)

# Tokenizer that matches the PhoBERT checkpoint fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

class TextDataset(Dataset):
    """Map-style dataset that tokenizes every text up front and pairs it with its label vector.

    Args:
        texts: Iterable of raw strings (one per sample). A bare ``str`` is
            rejected because it would be iterated character by character.
        labels: Indexable, sized collection of per-sample label vectors; must
            contain exactly one entry per text.
        text_tokenizer: Optional callable invoked as
            ``text_tokenizer(texts, truncation=True, padding=True, max_length=...)``
            and returning a mapping of field name -> per-sample sequences.
            Defaults to the module-level ``tokenizer``.
        max_length: Truncation limit forwarded to the tokenizer (default 256).

    Raises:
        TypeError: If ``texts`` is a single string.
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=256):
        if isinstance(texts, str):
            raise TypeError("texts must be a collection of strings, not a single string")
        texts = list(texts)
        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of training, or silently drop trailing samples.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} texts and {len(labels)} labels)"
            )
        # Fall back to the notebook-level tokenizer only when none is supplied.
        tok = tokenizer if text_tokenizer is None else text_tokenizer
        self.encodings = tok(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a float ``labels`` tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Labels are emitted as floats (multi-hot targets for a multi-label loss).
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of samples, i.e. the number of label vectors."""
        return len(self.labels)

# --- Split data ---
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=42,
)

# Build the training dataset first, then the validation dataset.
train_dataset, val_dataset = (
    TextDataset(texts.tolist(), targets)
    for texts, targets in ((X_train, y_train), (X_val, y_val))
)

# df_sample = df.sample(n=7, random_state=42)  # total small sample
# df_train = df_sample.iloc[:5]
# df_val = df_sample.iloc[5:]

# X_train = df_train['text']
# y_train = mlb.transform(df_train['categories'])

# X_val = df_val['text']
# y_val = mlb.transform(df_val['categories'])

# train_dataset = TextDataset(X_train.tolist(), y_train)
# val_dataset = TextDataset(X_val.tolist(), y_val)
# --- Load PhoBERT for multi-label classification ---
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output per category; problem_type is set to multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/phobert-base",
    num_labels=num_labels,
    problem_type="multi_label_classification",
)
model.to(device)

# --- Metrics ---
def compute_metrics(pred):
    """Compute multi-label evaluation metrics from raw logits.

    Args:
        pred: A ``(logits, labels)`` pair; both are array-likes with one row
            per sample and one column per label.

    Returns:
        dict with keys ``'f1_micro'``, ``'f1_macro'`` and ``'accuracy'``.
        For multi-label input, scikit-learn's ``accuracy_score`` is subset
        accuracy: a sample counts only if every label matches.
    """
    logits, labels = pred
    # Each label is decided independently: sigmoid probability above 0.5.
    preds = (torch.sigmoid(torch.tensor(logits)).numpy() > 0.5).astype(int)
    # Labels may arrive as floats; cast so both sides are integer indicator matrices.
    labels = np.asarray(labels).astype(int)
    return {
        # zero_division=0 keeps the score at 0 for labels with no positive
        # samples/predictions (the default's value) without emitting an
        # UndefinedMetricWarning on every evaluation.
        'f1_micro': f1_score(labels, preds, average='micro', zero_division=0),
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
        'accuracy': accuracy_score(labels, preds)
    }

# --- Trainer ---
# Evaluate and checkpoint once per epoch so the best checkpoint (by micro-F1)
# can be reloaded when training finishes; at most four checkpoints are kept.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=50,
    per_device_train_batch_size=50,
    per_device_eval_batch_size=10,
    eval_strategy="epoch",
    # save_steps is only honoured with save_strategy="steps", so it is not set here.
    save_strategy="epoch",
    save_total_limit=4,
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=1000,
    report_to='none',
    load_best_model_at_end=True,
    # Key returned by compute_metrics; higher F1 is better.
    metric_for_best_model='f1_micro',
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print('trainning')



In [None]:
# Fine-tune the model using the schedule configured in `training_args`.
trainer.train()

# Score the validation split: sigmoid over the logits, then a 0.5 cut-off
# gives a 0/1 decision for each label.
preds_output = trainer.predict(val_dataset)
val_probs = torch.sigmoid(torch.tensor(preds_output.predictions)).numpy()
preds = (val_probs > 0.5).astype(int)

# Per-category precision / recall / F1 against the held-out labels.
report = classification_report(y_val, preds, target_names=mlb.classes_)
print("Classification Report:\n", report)

In [None]:
def predict_categories(text, threshold=0.5):
    """Predict the categories of a single text with the fine-tuned model.

    Args:
        text: The input string. If a list of strings is passed, only the
            prediction for the first element is returned.
        threshold: Probability above which a label counts as predicted
            (default 0.5).

    Returns:
        Tuple of category names for the (first) input, as produced by
        ``mlb.inverse_transform``.
    """
    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    encoding = {k: v.to(device) for k, v in encoding.items()}

    # Inference must run in eval mode, otherwise dropout makes predictions
    # noisy; the previous mode is restored so training is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encoding).logits
            probs = torch.sigmoid(logits)
    finally:
        model.train(was_training)

    # Integer 0/1 indicator matrix of shape (batch, num_labels), matching the
    # encoding used for the validation predictions.
    preds = (probs > threshold).cpu().numpy().astype(int)
    predicted_categories = mlb.inverse_transform(preds)
    return predicted_categories[0]  # Return the first (and only) sample's labels


# Try the classifier on one example sentence and show the predicted categories.
sample_text = "nhân viên phục vụ  sạch sẽ đồ ăn ngon giá rẻ dưới 50k"
predicted = predict_categories(sample_text)
print("Predicted categories:", predicted)
