In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import uniform, loguniform

In [None]:
# Colab-only step: expose Google Drive on the local filesystem so the dataset
# CSVs stored under "My Drive" can be read with ordinary file paths.
from google.colab import drive

drive_mount_point = '/content/Drive'
# force_remount=True requests a fresh mount even if Drive is already mounted.
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/Drive


In [None]:
# Read the train/test CSVs from the mounted Drive folder (the file names
# suggest they were cleaned by an upstream step -- not verifiable from here).
train_csv_path = '/content/Drive/My Drive/CS221_Datasets/after_clean_train.csv'
test_csv_path = '/content/Drive/My Drive/CS221_Datasets/after_clean_test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Remove rows with a missing 'no_stopwords' value from both splits; this is the
# text column that is later fed to the TF-IDF vectorizer. The frames are
# modified in place.
for split_df in (train_df, test_df):
    split_df.dropna(subset=['no_stopwords'], inplace=True)

In [None]:
# Map the label column to integer ids. The encoder is fitted on the training
# labels only and then reused, unchanged, for the test labels.
le = LabelEncoder()
train_label_ids = le.fit_transform(train_df['labels'])
test_label_ids = le.transform(test_df['labels'])

# Overwrite the original label columns with their encoded form.
train_df['labels'] = train_label_ids
test_df['labels'] = test_label_ids

# Chia dữ liệu huấn luyện thành hai tập train và validation để tìm tham số tối ưu

In [None]:
# Hold out 20% of the training data as a validation set. The split is
# stratified on the labels and seeded so it is the same on every run.
train_texts = train_df['no_stopwords']
train_labels = train_df['labels']
X_train, X_val, y_train, y_val = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=42,
)

In [None]:
# Two-step text classifier: TF-IDF features followed by logistic regression.
# The step names ('tfidf', 'clf') are the prefixes used by the search spaces
# below (e.g. 'tfidf__min_df', 'clf__C').
tfidf_step = ('tfidf', TfidfVectorizer())
clf_step = ('clf', LogisticRegression(max_iter=10000))
pipe = Pipeline(steps=[tfidf_step, clf_step])

In [None]:
# Candidate values for the randomized search, grouped by pipeline step.
# Keys follow the '<step>__<parameter>' convention of the pipeline.
tfidf_search_space = {
    'tfidf__max_features': [5000, 10000, 15000],
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__use_idf': [True, False],
    'tfidf__norm': ['l2', 'l1'],
}
clf_search_space = {
    'clf__C': [0.1, 1, 10],
}

# Merge into the single mapping consumed by the search.
param_dist_lr = {**tfidf_search_space, **clf_search_space}

# Sử dụng RandomizedSearch để thu hẹp phạm vi tìm kiếm

In [None]:
# Coarse hyperparameter search: sample 20 candidate configurations from
# param_dist_lr and score each with 3-fold cross-validation on weighted F1.
#
# random_state is fixed so the same 20 candidates are drawn on every run.
# The refined grid and the final model further down are hard-coded from this
# search's output, so an unseeded search could silently disagree with them
# after a re-run. The seed matches the one used for train_test_split.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist_lr,
    n_iter=20,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
# Keep the winning configuration of the randomized search; the bare name on
# the last line makes the notebook display it.
best_params = random_search.best_params_
best_params

{'tfidf__use_idf': False,
 'tfidf__sublinear_tf': True,
 'tfidf__norm': 'l2',
 'tfidf__ngram_range': (1, 2),
 'tfidf__min_df': 2,
 'tfidf__max_features': 15000,
 'clf__C': 1}

In [None]:
# Cross-validated score of the best candidate, measured with the search's
# scoring metric ('f1_weighted').
random_search.best_score_

np.float64(0.7054569544380941)

# Sử dụng GridSearchCV để tìm tham số tối ưu cuối cùng

In [None]:
# Refined grid for the exhaustive search. The single-value entries pin the
# settings reported by the randomized search above; only max_features and C
# are varied around their best values.
param_grid_lr = dict(
    tfidf__max_features=[15000, 16000, 17000],
    tfidf__ngram_range=[(1,2)],
    tfidf__min_df=[2],
    tfidf__sublinear_tf=[True],
    tfidf__use_idf=[False],
    tfidf__norm=['l2'],
    clf__C=[0.5, 1, 1.5],
)

In [None]:
# Exhaustive search over the refined grid, scored with 3-fold cross-validation
# on weighted F1 (same metric as the randomized search).
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid_lr,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [None]:
# Display the best configuration found by the grid search; these values are
# copied by hand into the final model further down.
grid_search.best_params_

{'clf__C': 1.5,
 'tfidf__max_features': 15000,
 'tfidf__min_df': 2,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__sublinear_tf': True,
 'tfidf__use_idf': False}

# Dự đoán trên tập Validation

In [None]:
# Score the held-out validation set with the estimator selected by the grid
# search and print per-class precision/recall/F1.
y_pred_val = grid_search.predict(X_val)
val_report = classification_report(y_val, y_pred_val)
print(val_report)

              precision    recall  f1-score   support

           0       0.75      0.85      0.79      6964
           1       0.46      0.28      0.35      3488
           2       0.79      0.84      0.82      6540

    accuracy                           0.73     16992
   macro avg       0.67      0.66      0.65     16992
weighted avg       0.71      0.73      0.71     16992



# Train trên toàn bộ dữ liệu huấn luyện với tham số được lấy từ GridSearch ở trên

In [None]:
# Reload both CSVs from disk, replacing the earlier train_df/test_df whose
# 'labels' column was already integer-encoded (presumably so the final
# training section starts again from the raw files).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Drive/My Drive/CS221_Datasets/after_clean_train.csv',
        '/content/Drive/My Drive/CS221_Datasets/after_clean_test.csv',
    )
)

In [None]:
# Same cleaning as before the search: discard rows without 'no_stopwords'
# text, modifying both frames in place.
required_columns = ['no_stopwords']
train_df.dropna(subset=required_columns, inplace=True)
test_df.dropna(subset=required_columns, inplace=True)

In [None]:
# Re-create the label encoder on the freshly reloaded frames. It is fitted on
# the training labels only and reused for the test labels. `le` is needed
# again later: le.classes_ names the classes in the feature-importance
# listing, and le.inverse_transform maps predictions back to label names.
le = LabelEncoder()
train_df['labels'] = le.fit_transform(train_df['labels'])
test_df['labels'] = le.transform(test_df['labels'])

In [None]:
# Final fit uses the whole training file (no validation hold-out); the test
# file is the evaluation set. Note this rebinds X_train / y_train from above.
X_train, y_train = train_df['no_stopwords'], train_df['labels']
X_test, y_test = test_df['no_stopwords'], test_df['labels']

In [None]:
# Final vectorizer and classifier. The hyperparameters are hard-coded to the
# values reported by grid_search.best_params_ above.
tfidf_settings = dict(
    max_features=15000,
    min_df=2,
    ngram_range=(1, 2),
    norm='l2',
    sublinear_tf=True,
    use_idf=False,
)
tfidf = TfidfVectorizer(**tfidf_settings)
lr = LogisticRegression(C=1.5, max_iter=10000)

In [None]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Train the final logistic regression on the vectorized training set.
lr.fit(X_train_tfidf, y_train)

# Dự đoán trên dữ liệu kiểm thử

In [None]:
# Predicted (integer-encoded) labels for the vectorized test set.
y_pred = lr.predict(X_test_tfidf)

In [None]:
# Report overall accuracy followed by the per-class breakdown on the test set.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print(test_accuracy)
print(test_report)

0.7382985289579261
              precision    recall  f1-score   support

           0       0.75      0.86      0.80      3960
           1       0.46      0.28      0.35      1930
           2       0.80      0.85      0.83      3831

    accuracy                           0.74      9721
   macro avg       0.67      0.66      0.66      9721
weighted avg       0.71      0.74      0.72      9721



# Trích xuất các đặc trưng quan trọng

In [None]:
# Vocabulary terms of the fitted vectorizer, as a NumPy array so they can be
# indexed with integer positions.
feature_names = np.array(tfidf.get_feature_names_out())
# Learned weights of the classifier; indexed below as
# coef[class_index][feature_index].
coef = lr.coef_

In [None]:
# For every class, list the 20 terms with the largest coefficients, i.e. the
# terms that push a prediction most strongly towards that class.
for i, class_label in enumerate(le.classes_):
    # argsort is ascending; reverse it so the largest weights come first.
    descending_order = np.argsort(coef[i])[::-1]
    print(f"\n🔹 Top 20 từ quan trọng nhất cho lớp '{class_label}':")
    for idx in descending_order[:20]:
        print(f"{feature_names[idx]}: {coef[i][idx]:.4f}")


🔹 Top 20 từ quan trọng nhất cho lớp 'Negative':
horrible: 4.4802
terrible: 4.2496
poor: 4.1250
not happy: 4.0353
not money: 4.0273
worst: 4.0243
not worth: 3.7957
no good: 3.7576
disappointed: 3.7246
wanted love: 3.4994
very disappointed: 3.4875
not good: 3.4500
useless: 3.4417
disappointing: 3.4287
not comfortable: 3.3929
broke: 3.2875
poorly: 3.2622
going back: 3.1773
not work: 3.1713
awful: 3.1096

🔹 Top 20 từ quan trọng nhất cho lớp 'Neutral':
not most: 2.6687
okay not: 2.1954
not bad: 2.1027
okay: 2.0739
not best: 1.8961
broke month: 1.8543
than most: 1.8071
cannot really: 1.8059
higher: 1.8028
max: 1.7464
all day: 1.7325
within week: 1.7057
rub: 1.6872
hanging: 1.6565
beautiful not: 1.6557
little disappointed: 1.6367
however: 1.6166
ok: 1.5890
so so: 1.5781
than not: 1.5598

🔹 Top 20 từ quan trọng nhất cho lớp 'Positive':
not hurt: 4.2412
love: 4.1885
perfect: 4.0229
great: 3.8914
loves: 3.8192
favorite: 3.6081
not disappointed: 3.5831
compliments: 3.5453
best: 3.5388
perfectly:

# Các trường hợp dự đoán sai

In [None]:
# Convert the integer-encoded true and predicted labels back to their
# original class names for readable error inspection.
y_test_ids = np.array(y_test)
y_pred_ids = np.array(y_pred)
y_test_labels = le.inverse_transform(y_test_ids)
y_pred_labels = le.inverse_transform(y_pred_ids)

In [None]:
# Plain Python lists of the test texts (before and after stopword removal),
# in test_df row order, so they can be indexed by position.
cleaned_texts, no_stopwords_texts = (
    test_df[column].tolist() for column in ('cleaned_text', 'no_stopwords')
)

In [None]:
# Positions of the test samples whose prediction differs from the true label.
# np.where returns positions within the mask, which line up with the
# positional lists built from test_df.
is_misclassified = y_test != y_pred
wrong_indices = np.where(is_misclassified)[0]

In [None]:
# Show the first 20 misclassified samples: text before and after stopword
# removal, followed by the true and the predicted label (one item per line).
first_wrong = wrong_indices[:20]
for i in first_wrong:
    print(
        f"Mẫu trước khi loại bỏ Stopwords: {cleaned_texts[i]}",
        f"Mẫu sau khi loại bỏ Stopwords: {no_stopwords_texts[i]}",
        f"Nhãn thực sự: {y_test_labels[i]}",
        f"Nhãn dự đoán: {y_pred_labels[i]}",
        sep="\n",
    )

Mẫu trước khi loại bỏ Stopwords: they are are not comfortable what so ever and i got blisters so i no longer wear them
Mẫu sau khi loại bỏ Stopwords: not comfortable so blisters so no longer wear
Nhãn thực sự: Neutral
Nhãn dự đoán: Negative
Mẫu trước khi loại bỏ Stopwords: delivery and everything was fine however but i cannot put anything with much weight in it is without the candles coming out of place
Mẫu sau khi loại bỏ Stopwords: delivery everything fine however cannot put anything much weight without candles coming place
Nhãn thực sự: Negative
Nhãn dự đoán: Positive
Mẫu trước khi loại bỏ Stopwords: she is look amazing but run smaller than i was expecting also the sole of one of the she is was coming off i had to superglue it is not a problem but should not have not had to do that is with new she is
Mẫu sau khi loại bỏ Stopwords: look amazing run smaller than expecting sole coming off superglue not problem not not new
Nhãn thực sự: Neutral
Nhãn dự đoán: Negative
Mẫu trước khi loại 