In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import uniform

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab runtime so the dataset CSVs can be read from it.
DRIVE_MOUNT_POINT = '/content/Drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/Drive


In [None]:
# Load the pre-cleaned train / test splits from the mounted Drive folder.
TRAIN_CSV = '/content/Drive/My Drive/CS221_Datasets/after_clean_train.csv'
TEST_CSV = '/content/Drive/My Drive/CS221_Datasets/after_clean_test.csv'
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [None]:
# Rows with a missing 'no_stopwords' value cannot be vectorized, so drop them
# from both splits (in place, so the frames keep their names).
for frame in (train_df, test_df):
    frame.dropna(subset=['no_stopwords'], inplace=True)

In [None]:
# Encode the class labels as integers. The encoder is fitted on the training
# labels only and then reused for the test labels so both share one mapping.
le = LabelEncoder()
encoded_train_labels = le.fit_transform(train_df['labels'])
encoded_test_labels = le.transform(test_df['labels'])
train_df['labels'], test_df['labels'] = encoded_train_labels, encoded_test_labels

# Chia dữ liệu huấn luyện thành hai tập train và validation để tìm tham số tối ưu

In [None]:
# Hold out 20% of the training data as a validation set, stratified on the
# label so both parts keep the same class proportions; the seed fixes the split.
texts, targets = train_df['no_stopwords'], train_df['labels']
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

In [None]:
# Two-stage model: text vectorizer followed by a Multinomial Naive Bayes classifier.
# The step names ('tfidf', 'clf') are the prefixes used by the search parameter grids.
pipeline_steps = [('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())]
pipe = Pipeline(pipeline_steps)

In [None]:
# Candidate vectorizer settings sampled by the randomized search.
# Keys follow the '<pipeline step>__<parameter>' convention of the 'tfidf' step.
param_dist_nb = dict(
    tfidf__max_features=[3000, 5000, 10000],
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__min_df=[1, 2, 3],
    tfidf__sublinear_tf=[True, False],
    tfidf__use_idf=[True, False],
    tfidf__norm=[None],  # no row normalization in any candidate
)

In [None]:
# Coarse exploration of the vectorizer settings: sample 20 combinations from
# param_dist_nb and score each one with 3-fold cross-validated weighted F1.
# random_state is pinned so the sampled candidates (and therefore best_params_)
# are the same on every Restart & Run All; without it each run explores a
# different subset and the follow-up grid below may no longer match the result.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist_nb,
    n_iter=20,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
# Keep the best configuration found by the randomized search; the trailing
# bare name displays it as the cell output.
best_params = random_search.best_params_
best_params

{'tfidf__use_idf': False,
 'tfidf__sublinear_tf': False,
 'tfidf__norm': None,
 'tfidf__ngram_range': (1, 3),
 'tfidf__min_df': 3,
 'tfidf__max_features': 10000}

In [None]:
# Fine-grained exhaustive search around the configuration found by the randomized search.
# Settings that are still being tuned:
tuned_grid = {
    'tfidf__max_features': [10000, 11000, 12000],
    'tfidf__ngram_range': [(1, 2), (1, 3)],
    'tfidf__min_df': [3, 4, 5],
}
# Settings frozen to a single value:
frozen_grid = {
    'tfidf__sublinear_tf': [False],
    'tfidf__use_idf': [False],
    'tfidf__norm': [None],
}
grid_param = {**tuned_grid, **frozen_grid}

# Same scoring and fold count as the randomized search.
grid_search = GridSearchCV(
    pipe,
    param_grid=grid_param,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [None]:
# Display the parameter combination selected by the grid search.
grid_search.best_params_

{'tfidf__max_features': 12000,
 'tfidf__min_df': 4,
 'tfidf__ngram_range': (1, 3),
 'tfidf__norm': None,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': False}

In [None]:
# Display the cross-validated score (scoring='f1_weighted') of the selected combination.
grid_search.best_score_

np.float64(0.6994322162229413)

# Dự đoán trên tập Validation

In [None]:
# Score the tuned pipeline on the held-out validation split and print
# per-class precision / recall / F1.
y_pred_val = grid_search.predict(X_val)
validation_report = classification_report(y_val, y_pred_val)
print(validation_report)

              precision    recall  f1-score   support

           0       0.77      0.74      0.76      6964
           1       0.41      0.43      0.42      3488
           2       0.79      0.80      0.80      6540

    accuracy                           0.70     16992
   macro avg       0.66      0.66      0.66     16992
weighted avg       0.71      0.70      0.70     16992



# Train trên toàn bộ dữ liệu huấn luyện với tham số được lấy từ GridSearch ở trên

In [None]:
# Reload both splits from disk: the 'labels' column of the frames loaded earlier
# was overwritten with integer codes, and the final model starts from the raw files.
train_df, test_df = map(
    pd.read_csv,
    (
        '/content/Drive/My Drive/CS221_Datasets/after_clean_train.csv',
        '/content/Drive/My Drive/CS221_Datasets/after_clean_test.csv',
    ),
)

In [None]:
# Discard rows whose 'no_stopwords' text is missing, in both reloaded splits.
required_columns = ['no_stopwords']
train_df.dropna(subset=required_columns, inplace=True)
test_df.dropna(subset=required_columns, inplace=True)

In [None]:
# Fresh encoder fitted on the reloaded training labels; it is reused later to map
# predictions back to label names via le.classes_ / le.inverse_transform.
le = LabelEncoder()
train_codes = le.fit_transform(train_df['labels'])
train_df['labels'] = train_codes
test_codes = le.transform(test_df['labels'])
test_df['labels'] = test_codes

In [None]:
# Use the complete training set for fitting and the separate test file for evaluation.
X_train, y_train = train_df['no_stopwords'], train_df['labels']
X_test, y_test = test_df['no_stopwords'], test_df['labels']

In [None]:
# Vectorizer configured with the values reported by the grid search above
# (copied by hand, so they must be updated if the search result changes).
final_tfidf_params = {
    'max_features': 12000,
    'min_df': 4,
    'ngram_range': (1, 3),
    'norm': None,
    'sublinear_tf': False,
    'use_idf': False,
}
tfidf = TfidfVectorizer(**final_tfidf_params)
mnb = MultinomialNB()

In [None]:
# Learn the vocabulary from the training texts only, then project the test texts
# onto that same vocabulary. Tuple elements are evaluated left to right, so the
# fit happens before the test transform.
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

In [None]:
# Fit the Naive Bayes classifier on the vectorized full training set.
mnb.fit(X_train_tfidf, y_train)

# Dự đoán trên dữ liệu kiểm thử

In [None]:
# Predict the encoded class label for every vectorized test document.
y_pred = mnb.predict(X_test_tfidf)

In [None]:
# Report overall accuracy followed by the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print(test_accuracy, test_report, sep='\n')

0.7055858450776669
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      3960
           1       0.40      0.42      0.41      1930
           2       0.81      0.80      0.81      3831

    accuracy                           0.71      9721
   macro avg       0.66      0.66      0.66      9721
weighted avg       0.71      0.71      0.71      9721



# Trích xuất các đặc trưng quan trọng cho từng lớp

In [None]:
# Per-class log-probabilities of each feature as estimated by the fitted classifier
# (one row per class, one column per vocabulary entry).
log_probs = mnb.feature_log_prob_
# Vocabulary entries, indexable by the column positions of log_probs.
feature_names = np.array(tfidf.get_feature_names_out())

In [None]:
# Print the 20 most characteristic features of each class.
# Features are ranked by their log-probability margin over the strongest competing
# class. Ranking by the raw feature_log_prob_ alone only surfaces the most frequent
# tokens, which are almost the same for every class and so do not describe what
# distinguishes one class from the others.
# Row i of log_probs is paired with le.classes_[i], as in the original loop.
for i, class_label in enumerate(le.classes_):
    rival_log_probs = np.delete(log_probs, i, axis=0)
    if rival_log_probs.shape[0]:
        margins = log_probs[i] - rival_log_probs.max(axis=0)
    else:
        # Single-class model: nothing to contrast against, fall back to raw log-probabilities.
        margins = log_probs[i]
    top_indices = np.argsort(margins)[::-1][:20]
    print(f"\n🔹 Top 20 từ quan trọng nhất cho lớp '{class_label}':")
    for idx in top_indices:
        # Show the class log-probability plus the margin (Δ) used for the ranking.
        print(f"{feature_names[idx]}: {log_probs[i][idx]:.4f} (Δ {margins[idx]:+.4f})")


🔹 Top 20 từ quan trọng nhất cho lớp 'Negative':
not: -2.9887
feet: -4.5472
shoe: -4.6484
really: -4.6635
size: -4.6640
too: -4.6844
very: -4.7097
so: -4.8240
wear: -5.1745
not not: -5.1957
pair: -5.3061
small: -5.3463
foot: -5.4540
all: -5.5010
back: -5.5272
return: -5.5374
cannot: -5.5448
than: -5.5867
because: -5.5972
when: -5.6275

🔹 Top 20 từ quan trọng nhất cho lớp 'Neutral':
not: -3.0474
feet: -4.4131
size: -4.4895
shoe: -4.6506
really: -4.6681
too: -4.7504
so: -4.7974
very: -4.8232
wear: -4.9627
not not: -5.2668
comfortable: -5.2684
small: -5.2983
than: -5.2991
good: -5.3203
little: -5.3446
foot: -5.4448
pair: -5.4903
because: -5.5231
cut: -5.5457
look: -5.5767

🔹 Top 20 từ quan trọng nhất cho lớp 'Positive':
not: -3.3171
feet: -4.3060
very: -4.4991
great: -4.5433
comfortable: -4.5826
so: -4.6123
love: -4.6333
really: -4.7005
size: -4.7041
wear: -4.8119
shoe: -4.8369
good: -4.9735
pair: -5.1785
all: -5.2425
little: -5.2443
cannot: -5.3840
well: -5.4264
than: -5.4384
too: -5.448

# In ra 20 trường hợp dự đoán sai

In [None]:
# Map the integer codes of the true and predicted labels back to their original names.
y_test_labels, y_pred_labels = (
    le.inverse_transform(np.array(encoded)) for encoded in (y_test, y_pred)
)

In [None]:
# Plain Python lists of the test texts (before / after stop-word removal) so they
# can be indexed by position alongside the prediction arrays.
cleaned_column = test_df['cleaned_text']
no_stopwords_column = test_df['no_stopwords']
cleaned_texts = cleaned_column.tolist()
no_stopwords_texts = no_stopwords_column.tolist()

In [None]:
# Positions (0-based, not DataFrame index labels) of the misclassified test samples.
mismatch_mask = np.asarray(y_test) != np.asarray(y_pred)
wrong_indices = mismatch_mask.nonzero()[0]

In [None]:
# Show the first 20 misclassified samples: text before / after stop-word removal,
# the true label and the predicted label.
for pos in wrong_indices[:20]:
    print(
        f"Mẫu trước khi loại bỏ Stopwords: {cleaned_texts[pos]}",
        f"Mẫu sau khi loại bỏ Stopwords: {no_stopwords_texts[pos]}",
        f"Nhãn thực sự: {y_test_labels[pos]}",
        f"Nhãn dự đoán: {y_pred_labels[pos]}",
        sep="\n",
    )

Mẫu trước khi loại bỏ Stopwords: they are are not comfortable what so ever and i got blisters so i no longer wear them
Mẫu sau khi loại bỏ Stopwords: not comfortable so blisters so no longer wear
Nhãn thực sự: Neutral
Nhãn dự đoán: Negative
Mẫu trước khi loại bỏ Stopwords: delivery and everything was fine however but i cannot put anything with much weight in it is without the candles coming out of place
Mẫu sau khi loại bỏ Stopwords: delivery everything fine however cannot put anything much weight without candles coming place
Nhãn thực sự: Negative
Nhãn dự đoán: Positive
Mẫu trước khi loại bỏ Stopwords: i love the shoe very comfortable feet it is perfect then the second time i wore them i leaned back and the heel popped i hope i cannot fix them this was the shoe that is i was going to try in all the colors but probably not now
Mẫu sau khi loại bỏ Stopwords: love shoe very comfortable feet perfect then second time wore leaned back heel popped hope cannot fix shoe going try all colors pr