In [25]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from scipy.special import expit
from sklearn.metrics import classification_report

ALL_LABELS = ["semantic_search", "geo_filter", "geo_distance", "numeric_filter"]
CSV_PATH = "query_training_data_.csv"  # ‰øÆÊ≠£Êàê‰Ω†Ëá™Â∑±ÁöÑ CSV

def encode_labels(label_str):
    label_list = label_str.split(",")
    label_list = [l.strip() for l in label_list]
    encoding = [1 if lbl in label_list else 0 for lbl in ALL_LABELS]
    return encoding

df = pd.read_csv(CSV_PATH)
df["encoded_labels"] = df["Labels"].apply(encode_labels)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

class QueryDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["Query"]
        labels = row["encoded_labels"]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        item = {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(labels, dtype=torch.float)
        }
        return item

def multi_label_metrics(eval_pred):
    logits, labels = eval_pred
    probs = expit(logits)
    preds = (probs >= 0.5).astype(int)

    preds_flat = preds.flatten()
    labels_flat = labels.flatten()

    f1_micro = f1_score(labels_flat, preds_flat, average="micro")
    f1_macro = f1_score(labels_flat, preds_flat, average="macro")
    precision_micro = precision_score(labels_flat, preds_flat, average="micro")
    recall_micro = recall_score(labels_flat, preds_flat, average="micro")

    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "precision_micro": precision_micro,
        "recall_micro": recall_micro
    }

def evaluate_classification_report(trainer, dataset, all_labels):
    preds_output = trainer.predict(dataset)
    logits = preds_output.predictions
    labs = preds_output.label_ids
    probs = expit(logits)
    pred_bin = (probs >= 0.5).astype(int)

    # (A) ÈÄêÂÄã label Â†±Âëä
    for i, label_name in enumerate(all_labels):
        y_true = labs[:, i]
        y_pred = pred_bin[:, i]
        print(f"=== Label: {label_name} ===")
        print(classification_report(
            y_true,
            y_pred,
            zero_division=0, digits=4
        ))
        print("-----------------------------------")

    # (B) flatten ÂÅö overall Â†±Âëä
    pred_flat = pred_bin.flatten()
    labs_flat = labs.flatten()
    print("=== Overall (flatten) ===")
    print(classification_report(
        labs_flat,
        pred_flat,
        zero_division=0, digits=4
    ))

def train_and_evaluate_model(model_name, train_df, test_df):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = QueryDataset(train_df, tokenizer)
    test_dataset = QueryDataset(test_df, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(ALL_LABELS)
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    training_args = TrainingArguments(
        output_dir=f"./outputs_{model_name.replace('/', '_')}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
        greater_is_better=True,
        logging_dir=f"./logs_{model_name.replace('/', '_')}"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=multi_label_metrics
    )

    trainer.train()

    # Âú® test set ‰∏äÂÅöÊúÄÁµÇË©ï‰º∞ (Êï¥È´îÊï∏Êìö)
    result = trainer.evaluate(test_dataset)
    print("Evaluation on test set:", result)

    # Êõ¥Ë©≥Á¥∞ÈÄêÂÄã label Â†±Âëä
    evaluate_classification_report(trainer, test_dataset, ALL_LABELS)

# ÂèØÊõøÊèõÊ®°Âûã
CANDIDATE_MODELS = [
    "xlm-roberta-base",
    "xlm-roberta-large",
    "bert-base-multilingual-cased",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
]

for model_name in CANDIDATE_MODELS:
    print(f"===== Start training {model_name} =====")
    train_and_evaluate_model(model_name, train_df, test_df)
    print(f"===== End training {model_name} =====\n")


===== Start training xlm-roberta-base =====


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision Micro,Recall Micro
1,No log,0.35737,0.808989,0.808965,0.808989,0.808989
2,0.434400,0.142302,0.97191,0.971907,0.97191,0.97191
3,0.264200,0.086564,0.983146,0.983144,0.983146,0.983146


Evaluation on test set: {'eval_loss': 0.08656444400548935, 'eval_f1_micro': 0.9831460674157303, 'eval_f1_macro': 0.9831439393939394, 'eval_precision_micro': 0.9831460674157303, 'eval_recall_micro': 0.9831460674157303, 'eval_runtime': 0.3463, 'eval_samples_per_second': 256.967, 'eval_steps_per_second': 34.647, 'epoch': 3.0}
=== Label: semantic_search ===
              precision    recall  f1-score   support

         1.0     1.0000    1.0000    1.0000        89

    accuracy                         1.0000        89
   macro avg     1.0000    1.0000    1.0000        89
weighted avg     1.0000    1.0000    1.0000        89

-----------------------------------
=== Label: geo_filter ===
              precision    recall  f1-score   support

         0.0     0.9828    1.0000    0.9913        57
         1.0     1.0000    0.9688    0.9841        32

    accuracy                         0.9888        89
   macro avg     0.9914    0.9844    0.9877        89
weighted avg     0.9890    0.9888    

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision Micro,Recall Micro
1,No log,0.452666,0.808989,0.808965,0.808989,0.808989
2,0.452100,0.195711,0.929775,0.929775,0.929775,0.929775
3,0.327100,0.050302,0.988764,0.988763,0.988764,0.988764


Evaluation on test set: {'eval_loss': 0.05030224472284317, 'eval_f1_micro': 0.9887640449438202, 'eval_f1_macro': 0.9887626262626263, 'eval_precision_micro': 0.9887640449438202, 'eval_recall_micro': 0.9887640449438202, 'eval_runtime': 0.7274, 'eval_samples_per_second': 122.359, 'eval_steps_per_second': 16.498, 'epoch': 3.0}
=== Label: semantic_search ===
              precision    recall  f1-score   support

         1.0     1.0000    1.0000    1.0000        89

    accuracy                         1.0000        89
   macro avg     1.0000    1.0000    1.0000        89
weighted avg     1.0000    1.0000    1.0000        89

-----------------------------------
=== Label: geo_filter ===
              precision    recall  f1-score   support

         0.0     1.0000    1.0000    1.0000        57
         1.0     1.0000    1.0000    1.0000        32

    accuracy                         1.0000        89
   macro avg     1.0000    1.0000    1.0000        89
weighted avg     1.0000    1.0000    

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision Micro,Recall Micro
1,No log,0.082174,0.988764,0.988763,0.988764,0.988764
2,0.272900,0.04506,0.997191,0.99719,0.997191,0.997191
3,0.072300,0.037839,0.997191,0.99719,0.997191,0.997191


Evaluation on test set: {'eval_loss': 0.045060351490974426, 'eval_f1_micro': 0.9971910112359551, 'eval_f1_macro': 0.9971899247752334, 'eval_precision_micro': 0.9971910112359551, 'eval_recall_micro': 0.9971910112359551, 'eval_runtime': 0.3879, 'eval_samples_per_second': 229.463, 'eval_steps_per_second': 30.939, 'epoch': 3.0}
=== Label: semantic_search ===
              precision    recall  f1-score   support

         1.0     1.0000    1.0000    1.0000        89

    accuracy                         1.0000        89
   macro avg     1.0000    1.0000    1.0000        89
weighted avg     1.0000    1.0000    1.0000        89

-----------------------------------
=== Label: geo_filter ===
              precision    recall  f1-score   support

         0.0     1.0000    1.0000    1.0000        57
         1.0     1.0000    1.0000    1.0000        32

    accuracy                         1.0000        89
   macro avg     1.0000    1.0000    1.0000        89
weighted avg     1.0000    1.0000   

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-multilingual-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision Micro,Recall Micro
1,No log,0.255929,0.977528,0.977525,0.977528,0.977528
2,0.422900,0.138501,0.983146,0.983144,0.983146,0.983146
3,0.185900,0.109457,0.988764,0.988763,0.988764,0.988764


Evaluation on test set: {'eval_loss': 0.10945658385753632, 'eval_f1_micro': 0.9887640449438202, 'eval_f1_macro': 0.9887626262626263, 'eval_precision_micro': 0.9887640449438202, 'eval_recall_micro': 0.9887640449438202, 'eval_runtime': 0.3495, 'eval_samples_per_second': 254.668, 'eval_steps_per_second': 34.337, 'epoch': 3.0}
=== Label: semantic_search ===
              precision    recall  f1-score   support

         1.0     1.0000    1.0000    1.0000        89

    accuracy                         1.0000        89
   macro avg     1.0000    1.0000    1.0000        89
weighted avg     1.0000    1.0000    1.0000        89

-----------------------------------
=== Label: geo_filter ===
              precision    recall  f1-score   support

         0.0     1.0000    1.0000    1.0000        57
         1.0     1.0000    1.0000    1.0000        32

    accuracy                         1.0000        89
   macro avg     1.0000    1.0000    1.0000        89
weighted avg     1.0000    1.0000    

#### Ê∏¨Ë©¶ Query ÂàÜÈ°û

In [27]:
import os
from transformers import AutoTokenizer

# ‰Ω†ÁöÑÂõõÂÄãÂÄôÈÅ∏Ê®°Âûã
CANDIDATE_MODELS = [
    "xlm-roberta-base",
    "xlm-roberta-large",
    "bert-base-multilingual-cased",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
]

# Ë®≠ÂÆöÊ®°ÂûãÂ≠òÊîæÁöÑÂü∫Á§éË≥áÊñôÂ§æ
BASE_DIR = "c:/Users/Administrator/Downloads/SCR/NLU"

# ÈÅçÊ≠∑ÊØèÂÄãÊ®°ÂûãÔºå‰∏ãËºâ‰∏¶Â≠òÂÖ•Â∞çÊáâ checkpoint-135
for model_name in CANDIDATE_MODELS:
    model_dir = os.path.join(BASE_DIR, f"outputs_{model_name.replace('/', '_')}", "checkpoint-135")
    
    # Ê™¢Êü•Ë©≤ checkpoint ÊòØÂê¶Â≠òÂú®
    if not os.path.isdir(model_dir):
        print(f"‚ö†Ô∏è {model_dir} ‰∏çÂ≠òÂú®ÔºåË∑≥ÈÅé...")
        continue

    print(f"‰∏ãËºâ tokenizer ‰∏¶Â≠òÂÖ•: {model_dir}")

    # ‰∏ãËºâÂ∞çÊáâÁöÑ tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.save_pretrained(model_dir)  # Â≠òÂÖ• `checkpoint-135`

print("ÊâÄÊúâ tokenizer Ë£úÂÖ®")


‰∏ãËºâ tokenizer ‰∏¶Â≠òÂÖ•: c:/Users/Administrator/Downloads/SCR/NLU\outputs_xlm-roberta-base\checkpoint-135
‰∏ãËºâ tokenizer ‰∏¶Â≠òÂÖ•: c:/Users/Administrator/Downloads/SCR/NLU\outputs_xlm-roberta-large\checkpoint-135
‰∏ãËºâ tokenizer ‰∏¶Â≠òÂÖ•: c:/Users/Administrator/Downloads/SCR/NLU\outputs_bert-base-multilingual-cased\checkpoint-135
‰∏ãËºâ tokenizer ‰∏¶Â≠òÂÖ•: c:/Users/Administrator/Downloads/SCR/NLU\outputs_sentence-transformers_paraphrase-multilingual-mpnet-base-v2\checkpoint-135
ÊâÄÊúâ tokenizer Ë£úÂÖ®


In [29]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import expit

ALL_LABELS = ["semantic_search", "geo_filter", "geo_distance", "numeric_filter"]

# ‰Ω†ÁöÑÊ®°ÂûãÂàóË°®
CANDIDATE_MODELS = [
    "xlm-roberta-base",
    "xlm-roberta-large",
    "bert-base-multilingual-cased",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def predict_with_model(model_dir, query_text):
    """
    Âæû model_dir/checkpoint-135 ËºâÂÖ• safetensors Ê®°ÂûãÔºå‰∏¶Â∞ç query_text ÈÄ≤Ë°åÊé®ÁêÜ„ÄÇ
    """
    ckpt_path = os.path.join(model_dir, "checkpoint-135")  # Âõ∫ÂÆöËºâÂÖ• checkpoint-135
    ckpt_path = os.path.abspath(ckpt_path)  # ËΩâÊèõÁÇ∫ÁµïÂ∞çË∑ØÂæë
   

    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    model = AutoModelForSequenceClassification.from_pretrained(ckpt_path)
    model.to(device)
    model.eval()

    # Tokenize Ëº∏ÂÖ•
    inputs = tokenizer(query_text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits.cpu().numpy()
        probs = expit(logits)  # sigmoid
        preds = (probs >= 0.5).astype(int).flatten()

    return preds

# Ê∏¨Ë©¶ Query
test_query = "ÊàëÊÉ≥ÊâæÂú®ÈòøÈáåÂ±±ÈôÑËøëÁöÑÁâπËâ≤ÂíñÂï°Âª≥ÔºåÂèØ‰ª•ÁúãÂà∞Â±±ÊôØ"

print(f"Ê∏¨Ë©¶ Query: {test_query}")
for model_name in CANDIDATE_MODELS:
    # ÊØèÂÄãÊ®°ÂûãÁöÑËº∏Âá∫ÁõÆÈåÑÔºå‰æãÂ¶Ç ./outputs_xlm-roberta-base
    MODEL_DIR = f"./outputs_{model_name.replace('/', '_')}"
    MODEL_DIR = os.path.abspath(MODEL_DIR)  # ËΩâÊàêÁµïÂ∞çË∑ØÂæë

    # ‰ΩøÁî® checkpoint-135 ÂÅöÊé®ÁêÜ
    preds = predict_with_model(MODEL_DIR, test_query)
    
    # Ëº∏Âá∫ÁµêÊûú
    print(f"[Model: {model_name}] Predictions:")
    for label, p in zip(ALL_LABELS, preds):
        print(f"  {label}: {bool(p)}")
    print("-----------------------------------")


Ê∏¨Ë©¶ Query: ÊàëÊÉ≥ÊâæÂú®ÈòøÈáåÂ±±ÈôÑËøëÁöÑÁâπËâ≤ÂíñÂï°Âª≥ÔºåÂèØ‰ª•ÁúãÂà∞Â±±ÊôØ
[Model: xlm-roberta-base] Predictions:
  semantic_search: True
  geo_filter: False
  geo_distance: True
  numeric_filter: False
-----------------------------------
[Model: xlm-roberta-large] Predictions:
  semantic_search: True
  geo_filter: False
  geo_distance: True
  numeric_filter: False
-----------------------------------
[Model: bert-base-multilingual-cased] Predictions:
  semantic_search: True
  geo_filter: False
  geo_distance: True
  numeric_filter: False
-----------------------------------
[Model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2] Predictions:
  semantic_search: True
  geo_filter: False
  geo_distance: True
  numeric_filter: False
-----------------------------------


# NLU Ê®°ÂûãÊ∏¨Ë©¶ÁµêÊûúÂàÜÊûê

## **ÊúÄ‰Ω≥Ê®°ÂûãÔºö`bert-base-multilingual-cased`**
- `f1_micro = 0.9972` **(ÊúÄÈ´òÊ∫ñÁ¢∫Áéá)**
- `eval_loss = 0.0451` **(ÊúÄ‰ΩéË™§Â∑Æ)**
- `eval_runtime = 0.3879s` **(ÈÄüÂ∫¶ÈÅ©‰∏≠)**
- **Êé®Ëñ¶‰ΩúÁÇ∫ NLU Ê®°ÂûãÔºåÊèê‰æõÊúÄÊ∫ñÁ¢∫ÁöÑÊ™¢Á¥¢ÂàÜÈ°ûÔºÅ**

---

## **ÂÖ∂‰ªñÊ®°ÂûãÊØîËºÉ**
| Ê®°Âûã | `f1_micro` | `eval_loss` | `eval_runtime` |
|------|------------|-------------|----------------|
| **`bert-base-multilingual-cased`** ‚úÖ | **0.9972** | **0.0451** | **0.3879s** |
| `xlm-roberta-large` | 0.9888 | 0.0503 | 0.7274s |
| `sentence-transformers/paraphrase-multilingual-mpnet-base-v2` üöÄ **(ÊúÄÂø´)** | 0.9888 | **ÊúÄÂ∑Æ 0.1094** | **ÊúÄÂø´ 0.3495s** |
| `xlm-roberta-base` ‚ùå **(ÊúÄÂ∑Æ)** | 0.9831 | 0.0865 | 0.3463s |

---

## ** Âª∫Ë≠∞**
-  **`bert-base-multilingual-cased`** **(ÊúÄÊ∫ñÁ¢∫ÔºåÊé®Ëñ¶‰ΩøÁî®)**
-  **`sentence-transformers/paraphrase-multilingual-mpnet-base-v2`** **(Êé®ÁêÜÊúÄÂø´ÔºåÈÅ©ÂêàÈÄüÂ∫¶ÈúÄÊ±Ç)**
-  **`xlm-roberta-base`** **(Ê∫ñÁ¢∫ÁéáÊúÄ‰ΩéÔºå‰∏çÂª∫Ë≠∞‰ΩøÁî®)**
