In [1]:
# =========================
# Setup & Data loading
# =========================
import os, random, math
import numpy as np
import pandas as pd

In [2]:
# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# 1) Load data (2 cột: text, category).
df_train = pd.read_csv('../data/hwu/train.csv', header=None, names=['text', 'category'])
df_val   = pd.read_csv('../data/hwu/val.csv', header=None, names=['text', 'category'])
df_test  = pd.read_csv('../data/hwu/test.csv', header=None, names=['text', 'category'])

# Nếu dòng đầu là header cũ bị đọc nhầm (ví dụ cell đầu tiên là 'text' hoặc 'category'), ta bỏ:
def drop_misread_header(df):
    if isinstance(df.iloc[0,0], str) and df.iloc[0,0].strip().lower() in {"text","utterance","sentence"}:
        return df.iloc[1:].reset_index(drop=True)
    if isinstance(df.iloc[0,1], str) and df.iloc[0,1].strip().lower() in {"category","intent","label"}:
        return df.iloc[1:].reset_index(drop=True)
    return df.reset_index(drop=True)

df_train = drop_misread_header(df_train)
df_val   = drop_misread_header(df_val)
df_test  = drop_misread_header(df_test)

print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)
print(df_train.head())

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)
                                                text     category
0                what alarms do i have set right now  alarm_query
1                    checkout today alarm of meeting  alarm_query
2                              report alarm settings  alarm_query
3  see see for me the alarms that you have set to...  alarm_query
4                       is there an alarm for ten am  alarm_query


In [3]:
# Clean NaN (nếu có)
for d in (df_train, df_val, df_test):
    d["text"] = d["text"].astype(str).fillna("")
    d["category"] = d["category"].astype(str).fillna("")

# Encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(pd.concat([df_train["category"], df_val["category"], df_test["category"]], axis=0))
y_train = le.transform(df_train["category"])
y_val   = le.transform(df_val["category"])
y_test  = le.transform(df_test["category"])
num_classes = len(le.classes_)
print("Num classes:", num_classes)

Num classes: 64


In [4]:
# Utility: macro F1 + report
from sklearn.metrics import f1_score, classification_report

# ==========================================================
# Nhiệm vụ 1: TF-IDF + Logistic Regression (Baseline)
# ==========================================================
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000, ngram_range=(1,2)),
    LogisticRegression(max_iter=1000, n_jobs=None)  # n_jobs=None để tương thích nhiều môi trường
)
tfidf_lr_pipeline.fit(df_train["text"], y_train)

y_pred_lr = tfidf_lr_pipeline.predict(df_test["text"])
f1_lr = f1_score(y_test, y_pred_lr, average="macro")

print("\n[TF-IDF + LR] Classification report (test):")
print(classification_report(y_test, y_pred_lr, target_names=le.classes_))

# LR không dùng Keras, không có test loss theo nghĩa cross-entropy ở Keras
loss_lr = np.nan


[TF-IDF + LR] Classification report (test):
                          precision    recall  f1-score   support

             alarm_query       0.95      0.95      0.95        19
            alarm_remove       1.00      0.73      0.84        11
               alarm_set       0.85      0.89      0.87        19
       audio_volume_down       1.00      0.75      0.86         8
       audio_volume_mute       0.92      0.80      0.86        15
         audio_volume_up       1.00      1.00      1.00        13
          calendar_query       0.52      0.58      0.55        19
         calendar_remove       0.78      0.95      0.86        19
            calendar_set       0.87      0.68      0.76        19
          cooking_recipe       0.93      0.68      0.79        19
        datetime_convert       0.78      0.88      0.82         8
          datetime_query       0.71      0.89      0.79        19
        email_addcontact       0.88      0.88      0.88         8
             email_query      

In [5]:
# ==========================================================
# Nhiệm vụ 2: Word2Vec (trung bình) + Dense (Keras)
# ==========================================================
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# 1) Train Word2Vec trên text train (có thể thêm val để phong phú)
sentences_train = [simple_preprocess(t) for t in df_train["text"].tolist()]
sentences_all   = sentences_train  # hoặc: sentences_train + [simple_preprocess(t) for t in df_val["text"]]
w2v_model = Word2Vec(sentences=sentences_all, vector_size=100, window=5, min_count=1, workers=4, seed=SEED)

# 2) Hàm chuyển câu -> vector trung bình
def sentence_to_avg_vector(text, model, vector_size=100):
    tokens = simple_preprocess(text)
    vectors = []
    for tok in tokens:
        if tok in model.wv:
            vectors.append(model.wv[tok])
    if len(vectors) == 0:
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0).astype(np.float32)

# 3) Tạo X_avg cho train/val/test
X_train_avg = np.vstack([sentence_to_avg_vector(t, w2v_model, w2v_model.vector_size) for t in df_train["text"]])
X_val_avg   = np.vstack([sentence_to_avg_vector(t, w2v_model, w2v_model.vector_size) for t in df_val["text"]])
X_test_avg  = np.vstack([sentence_to_avg_vector(t, w2v_model, w2v_model.vector_size) for t in df_test["text"]])

# 4) Mô hình Dense
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

y_train_oh = to_categorical(y_train, num_classes)
y_val_oh   = to_categorical(y_val,   num_classes)
y_test_oh  = to_categorical(y_test,  num_classes)

dense_avg = Sequential([
    Dense(128, activation='relu', input_shape=(w2v_model.vector_size,)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
dense_avg.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

es = EarlyStopping(patience=3, restore_best_weights=True, monitor='val_loss')
hist_dense = dense_avg.fit(
    X_train_avg, y_train_oh,
    validation_data=(X_val_avg, y_val_oh),
    epochs=30, batch_size=64, callbacks=[es], verbose=0
)

test_loss_dense, test_acc_dense = dense_avg.evaluate(X_test_avg, y_test_oh, verbose=0)
y_pred_dense = dense_avg.predict(X_test_avg, verbose=0).argmax(axis=1)
f1_dense = f1_score(y_test, y_pred_dense, average="macro")

print("\n[W2V Avg + Dense] Test loss:", test_loss_dense, " | Test macro-F1:", f1_dense)
print(classification_report(y_test, y_pred_dense, target_names=le.classes_))

2025-11-11 19:49:07.176706: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-11 19:49:07.616668: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-11 19:49:09.042764: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-11-


[W2V Avg + Dense] Test loss: 3.0917906761169434  | Test macro-F1: 0.14842877911622787
                          precision    recall  f1-score   support

             alarm_query       0.16      0.32      0.21        19
            alarm_remove       0.67      0.18      0.29        11
               alarm_set       0.47      0.84      0.60        19
       audio_volume_down       0.25      0.12      0.17         8
       audio_volume_mute       0.17      0.07      0.10        15
         audio_volume_up       0.18      0.15      0.17        13
          calendar_query       0.09      0.05      0.07        19
         calendar_remove       0.00      0.00      0.00        19
            calendar_set       0.11      0.05      0.07        19
          cooking_recipe       0.25      0.05      0.09        19
        datetime_convert       0.00      0.00      0.00         8
          datetime_query       0.13      0.63      0.21        19
        email_addcontact       0.00      0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# ==========================================================
# Nhiệm vụ 3: Embedding Pre-trained + LSTM (đóng băng)
# ==========================================================
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM

# 1) Tokenizer + sequences + padding
def estimate_max_len(texts, q=0.95):
    lens = [len(simple_preprocess(t)) for t in texts]
    return max(5, int(np.quantile(lens, q)))

max_len = estimate_max_len(df_train["text"])
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(df_train["text"].tolist())

def to_padded(texts, tokenizer, max_len):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')

X_train_pad = to_padded(df_train["text"], tokenizer, max_len)
X_val_pad   = to_padded(df_val["text"], tokenizer, max_len)
X_test_pad  = to_padded(df_test["text"], tokenizer, max_len)

vocab_size   = len(tokenizer.word_index) + 1
embedding_dim = w2v_model.vector_size

# 2) Embedding matrix từ Word2Vec
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# 3) LSTM với embedding pretrained (đóng băng)
from tensorflow.keras.layers import Dense as KDense, Dropout as KDropout
from tensorflow.keras.models import Sequential as KSequential

lstm_pre = KSequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    KDense(num_classes, activation='softmax')
])
lstm_pre.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

es2 = EarlyStopping(patience=3, restore_best_weights=True, monitor='val_loss')
hist_pre = lstm_pre.fit(
    X_train_pad, y_train_oh,
    validation_data=(X_val_pad, y_val_oh),
    epochs=20, batch_size=64, callbacks=[es2], verbose=0
)

test_loss_pre, test_acc_pre = lstm_pre.evaluate(X_test_pad, y_test_oh, verbose=0)
y_pred_pre = lstm_pre.predict(X_test_pad, verbose=0).argmax(axis=1)
f1_pre = f1_score(y_test, y_pred_pre, average="macro")
print("\n[LSTM + Pretrained Emb] Test loss:", test_loss_pre, " | Test macro-F1:", f1_pre)
print(classification_report(y_test, y_pred_pre, target_names=le.classes_))




[LSTM + Pretrained Emb] Test loss: 2.6293485164642334  | Test macro-F1: 0.24300538830030236
                          precision    recall  f1-score   support

             alarm_query       0.48      0.63      0.55        19
            alarm_remove       0.62      0.73      0.67        11
               alarm_set       0.56      0.79      0.65        19
       audio_volume_down       0.50      0.12      0.20         8
       audio_volume_mute       0.20      0.13      0.16        15
         audio_volume_up       0.33      0.15      0.21        13
          calendar_query       0.11      0.05      0.07        19
         calendar_remove       0.24      0.32      0.27        19
            calendar_set       0.15      0.16      0.15        19
          cooking_recipe       0.14      0.16      0.15        19
        datetime_convert       0.25      0.12      0.17         8
          datetime_query       0.35      0.63      0.45        19
        email_addcontact       0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# ==========================================================
# Nhiệm vụ 4: Embedding học từ đầu + LSTM (trainable)
# ==========================================================
lstm_scratch = KSequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=100,
        input_length=max_len
    ),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    KDense(num_classes, activation='softmax')
])
lstm_scratch.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

es3 = EarlyStopping(patience=3, restore_best_weights=True, monitor='val_loss')
hist_scratch = lstm_scratch.fit(
    X_train_pad, y_train_oh,
    validation_data=(X_val_pad, y_val_oh),
    epochs=20, batch_size=64, callbacks=[es3], verbose=0
)

test_loss_scratch, test_acc_scratch = lstm_scratch.evaluate(X_test_pad, y_test_oh, verbose=0)
y_pred_scratch = lstm_scratch.predict(X_test_pad, verbose=0).argmax(axis=1)
f1_scratch = f1_score(y_test, y_pred_scratch, average="macro")
print("\n[LSTM + Scratch Emb] Test loss:", test_loss_scratch, " | Test macro-F1:", f1_scratch)
print(classification_report(y_test, y_pred_scratch, target_names=le.classes_))




[LSTM + Scratch Emb] Test loss: 0.7727429866790771  | Test macro-F1: 0.7920257455744357
                          precision    recall  f1-score   support

             alarm_query       0.94      0.89      0.92        19
            alarm_remove       0.92      1.00      0.96        11
               alarm_set       0.81      0.89      0.85        19
       audio_volume_down       0.88      0.88      0.88         8
       audio_volume_mute       0.72      0.87      0.79        15
         audio_volume_up       0.86      0.92      0.89        13
          calendar_query       0.45      0.47      0.46        19
         calendar_remove       0.95      1.00      0.97        19
            calendar_set       0.86      0.63      0.73        19
          cooking_recipe       0.57      0.63      0.60        19
        datetime_convert       0.67      0.75      0.71         8
          datetime_query       0.89      0.84      0.86        19
        email_addcontact       0.86      0.75      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# ==========================================================
# Nhiệm vụ 5: Bảng so sánh định lượng + Phân tích định tính
# ==========================================================
results = pd.DataFrame({
    "Pipeline": [
        "TF-IDF + Logistic Regression",
        "Word2Vec (Avg) + Dense",
        "Embedding (Pre-trained) + LSTM",
        "Embedding (Scratch) + LSTM"
    ],
    "F1-score (Macro)": [
        f1_lr,
        f1_dense,
        f1_pre,
        f1_scratch
    ],
    "Test Loss": [
        np.nan,
        test_loss_dense,
        test_loss_pre,
        test_loss_scratch
    ]
})
print("\n=== Tổng hợp kết quả (Test) ===")
print(results)

# --------- Phân tích định tính ----------
hard_sentences = [
    ("can you remind me to not call my mom", None),         # kỳ vọng: reminder_create
    ("is it going to be sunny or rainy tomorrow", None),    # kỳ vọng: weather_query
    ("find a flight from new york to london but not through paris", None)  # kỳ vọng: flight_search
]

def predict_all_models(raw_text):
    # 1) TF-IDF + LR
    pred_lr = le.classes_[tfidf_lr_pipeline.predict([raw_text])[0]]

    # 2) W2V Avg + Dense
    x_avg = sentence_to_avg_vector(raw_text, w2v_model, w2v_model.vector_size).reshape(1, -1)
    pred_dense = le.classes_[dense_avg.predict(x_avg, verbose=0).argmax(axis=1)[0]]

    # 3) LSTM pretrained
    x_pad = to_padded([raw_text], tokenizer, max_len)
    pred_pre = le.classes_[lstm_pre.predict(x_pad, verbose=0).argmax(axis=1)[0]]

    # 4) LSTM scratch
    pred_scr = le.classes_[lstm_scratch.predict(x_pad, verbose=0).argmax(axis=1)[0]]

    return pred_lr, pred_dense, pred_pre, pred_scr

print("\n=== Phân tích định tính trên các câu 'khó' ===")
for sent, gold in hard_sentences:
    p_lr, p_dense, p_pre, p_scr = predict_all_models(sent)
    print(f"\nCâu: {sent}")
    print(f" - TF-IDF+LR:              {p_lr}")
    print(f" - W2V(Avg)+Dense:         {p_dense}")
    print(f" - LSTM + Pretrained Emb:  {p_pre}")
    print(f" - LSTM + Scratch Emb:     {p_scr}")
    if gold is not None:
        print(f" -> Nhãn thật: {gold}")

# Gợi ý diễn giải (để bạn ghi trong báo cáo):
explanation = """
Phân tích:
- Các câu có phủ định ('not call', 'not through paris') và cấu trúc phụ thuộc dài thường được LSTM xử lý tốt hơn
  vì nó mô hình hóa chuỗi theo thời gian, giữ ngữ cảnh và quan hệ giữa các token.
- TF-IDF + LR bỏ qua thứ tự từ; Word2Vec(trung bình) mất cấu trúc chuỗi => có thể nhầm với các ý định gần nghĩa.
- LSTM + pretrained embedding thường hội tụ nhanh và tổng quát tốt hơn khi dữ liệu train không quá lớn.
- LSTM học từ đầu có thể theo kịp hoặc vượt nếu dữ liệu phong phú, nhưng dễ overfit nếu dữ liệu ít.
"""
print(explanation)



=== Tổng hợp kết quả (Test) ===
                         Pipeline  F1-score (Macro)  Test Loss
0    TF-IDF + Logistic Regression          0.829401        NaN
1          Word2Vec (Avg) + Dense          0.148429   3.091791
2  Embedding (Pre-trained) + LSTM          0.243005   2.629349
3      Embedding (Scratch) + LSTM          0.792026   0.772743

=== Phân tích định tính trên các câu 'khó' ===

Câu: can you remind me to not call my mom
 - TF-IDF+LR:              calendar_set
 - W2V(Avg)+Dense:         general_explain
 - LSTM + Pretrained Emb:  email_query
 - LSTM + Scratch Emb:     calendar_set

Câu: is it going to be sunny or rainy tomorrow
 - TF-IDF+LR:              weather_query
 - W2V(Avg)+Dense:         email_query
 - LSTM + Pretrained Emb:  transport_query
 - LSTM + Scratch Emb:     weather_query

Câu: find a flight from new york to london but not through paris
 - TF-IDF+LR:              transport_query
 - W2V(Avg)+Dense:         general_dontcare
 - LSTM + Pretrained Emb:  transpo