In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import uniform, loguniform

In [None]:
# Mount Google Drive inside the Colab runtime at /content/Drive so that the
# dataset CSVs stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/Drive')

Mounted at /content/Drive


In [None]:
# Read the 'after_clean' train and test CSVs from the mounted Drive folder.
train_csv = '/content/Drive/My Drive/CS221_Datasets/after_clean_train.csv'
test_csv = '/content/Drive/My Drive/CS221_Datasets/after_clean_test.csv'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [None]:
# Remove, in place, every row whose 'no_stopwords' text is missing.
# Both frames get the same treatment because that column is the model input.
for frame in (train_df, test_df):
    frame.dropna(subset=['no_stopwords'], inplace=True)

In [None]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
# NOTE: the 'labels' column is overwritten with the integer codes, so running
# this cell a second time would re-fit the encoder on the codes themselves.
le = LabelEncoder()
le.fit(train_df['labels'])
train_df['labels'] = le.transform(train_df['labels'])
test_df['labels'] = le.transform(test_df['labels'])

In [None]:
# Display the original label values; a label's position in this array is the
# integer code the encoder assigned to it.
le.classes_

array(['Negative', 'Neutral', 'Positive'], dtype=object)

# Chia dữ liệu huấn luyện thành hai tập train và validation để tìm tham số tối ưu

In [None]:
# Hold out 20% of the training frame as a validation set for hyper-parameter
# tuning. The split is stratified on the label and seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['no_stopwords'],
    train_df['labels'],
    test_size=0.2,
    stratify=train_df['labels'],
    random_state=42,
)

In [None]:
# Text-classification pipeline: TF-IDF features followed by a linear SVM.
# Step names ('tfidf', 'clf') are the prefixes used by the search parameter
# dictionaries ('tfidf__*', 'clf__*'), so they must not be renamed.
pipe_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # LinearSVC draws random numbers when it solves the dual problem (always
    # the case for loss='hinge'); seed it so search scores are repeatable.
    ('clf', LinearSVC(random_state=42))
])

# Sử dụng RandomizedSearchCV để thu hẹp phạm vi tìm kiếm

In [None]:
# Search space sampled by RandomizedSearchCV. Keys follow the
# '<pipeline step>__<parameter>' convention of the pipeline being tuned.
param_dist_svm = {
    # TF-IDF vectoriser options
    'tfidf__max_features': [3000, 5000, 10000, 15000],
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__use_idf': [True, False],
    'tfidf__norm': ['l1', 'l2', None],

    # Linear SVM options
    # scipy's uniform(loc, scale) samples from [loc, loc + scale], so
    # scale=9.99 is what yields the intended range 0.01 .. 10 for C
    # (uniform(0.01, 10) would sample up to 10.01).
    'clf__C': uniform(loc=0.01, scale=9.99),
    'clf__loss': ['hinge', 'squared_hinge']
}

In [None]:
# Coarse search: draw 30 random candidates from param_dist_svm and score each
# with 3-fold cross-validation on weighted F1, using all available cores.
random_search_svm = RandomizedSearchCV(
    estimator=pipe_svm,
    param_distributions=param_dist_svm,
    n_iter=30,
    cv=3,
    scoring='f1_weighted',
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Kept as the cell's final expression so the fitted search object is displayed.
random_search_svm.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [None]:
# Report the winning candidate of the randomized search and its mean CV score.
for caption, value in (
    ("Best params:", random_search_svm.best_params_),
    ("Best score:", random_search_svm.best_score_),
):
    print(caption, value)

Best params: {'clf__C': np.float64(0.4766566321361543), 'clf__loss': 'squared_hinge', 'tfidf__max_features': 15000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': False}
Best score: 0.700611220574275


# Sử dụng GridSearchCV để tìm tham số tối ưu cuối cùng

In [None]:
# Fine grid centred on the best randomized-search candidate: the TF-IDF
# settings are pinned to that candidate (only use_idf is re-checked) and C is
# probed on a few values around the sampled optimum.
tfidf_grid = {
    'tfidf__max_features': [15000],
    'tfidf__min_df': [3],
    'tfidf__ngram_range': [(1, 2)],
    'tfidf__sublinear_tf': [True],
    'tfidf__use_idf': [False, True],
    'tfidf__norm': ['l2'],
}
clf_grid = {
    'clf__C': [0.3, 0.5, 0.7],
    'clf__loss': ['squared_hinge'],
}
param_grid_svm = {**tfidf_grid, **clf_grid}

In [None]:
# Exhaustive search over param_grid_svm with the same protocol as the
# randomized search: 3-fold cross-validation scored on weighted F1.
grid_search_svm = GridSearchCV(
    pipe_svm,
    param_grid_svm,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2,
)

# Kept as the cell's final expression so the fitted search object is displayed.
grid_search_svm.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [None]:
# Display the parameter combination that scored best in the grid search.
grid_search_svm.best_params_

{'clf__C': 0.5,
 'clf__loss': 'squared_hinge',
 'tfidf__max_features': 15000,
 'tfidf__min_df': 3,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__sublinear_tf': True,
 'tfidf__use_idf': False}

# Dự đoán trên tập Validation

In [None]:
# Evaluate the best pipeline (refit by GridSearchCV on X_train) on the
# held-out validation split and print per-class precision/recall/F1.
y_pred_val = grid_search_svm.best_estimator_.predict(X_val)
val_report = classification_report(y_val, y_pred_val)
print(val_report)

              precision    recall  f1-score   support

           0       0.74      0.86      0.79      6964
           1       0.47      0.23      0.31      3488
           2       0.78      0.86      0.82      6540

    accuracy                           0.73     16992
   macro avg       0.66      0.65      0.64     16992
weighted avg       0.70      0.73      0.70     16992



# Train trên toàn bộ dữ liệu huấn luyện với tham số được lấy từ GridSearch ở trên

In [None]:
# Re-read both CSVs from Drive, replacing the train_df / test_df frames that
# were bound earlier in the notebook.
train_csv = '/content/Drive/My Drive/CS221_Datasets/after_clean_train.csv'
test_csv = '/content/Drive/My Drive/CS221_Datasets/after_clean_test.csv'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [None]:
# Discard, in place, the rows of each frame that have no 'no_stopwords' text.
for frame in (train_df, test_df):
    frame.dropna(subset=['no_stopwords'], inplace=True)

In [None]:
# Build a fresh encoder from the training labels and convert the 'labels'
# column of both frames to integer codes with it.
# NOTE: the column is overwritten, so re-running this cell without reloading
# the CSVs would fit the encoder on integer codes rather than label names.
le = LabelEncoder()
le.fit(train_df['labels'])
train_df['labels'] = le.transform(train_df['labels'])
test_df['labels'] = le.transform(test_df['labels'])

In [None]:
# Fit on the whole training file and evaluate on the test file.
# This rebinds X_train / y_train, which previously held only the 80% split
# used during the hyper-parameter search.
X_train = train_df['no_stopwords']
X_test = test_df['no_stopwords']
y_train = train_df['labels']
y_test = test_df['labels']

In [None]:
# Final vectoriser and classifier, configured with the values reported by
# grid_search_svm.best_params_.
tfidf = TfidfVectorizer(
    max_features=15000,
    min_df=3,
    ngram_range=(1, 2),
    norm='l2',
    sublinear_tf=True,
    use_idf=False,
)
# random_state pins the solver's random draws (used when the dual problem is
# solved) so the fitted coefficients and test metrics are repeatable.
svc = LinearSVC(C=0.5, loss='squared_hinge', random_state=42)

In [None]:
# Fit the vectoriser on the training text only, then reuse the fitted
# vectoriser to transform the test text (the test set never influences it).
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Train the final linear SVM on the vectorised full training set.
svc.fit(X_train_tfidf, y_train)

# Dự đoán trên dữ liệu kiểm thử

In [None]:
# Predict the encoded (integer) class for every test document.
y_pred = svc.predict(X_test_tfidf)

In [None]:
# Overall accuracy first, then the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(test_accuracy)
print(test_report)

0.7351095566299763
              precision    recall  f1-score   support

           0       0.74      0.87      0.80      3960
           1       0.46      0.22      0.30      1930
           2       0.79      0.85      0.82      3831

    accuracy                           0.74      9721
   macro avg       0.66      0.65      0.64      9721
weighted avg       0.71      0.74      0.71      9721



# In ra các hệ số biểu diễn mức độ quan trọng của từng đặc trưng

In [None]:
# Names of the TF-IDF features as an array, so they can be indexed with the
# integer positions produced by argsort in the next cell.
feature_names = np.array(tfidf.get_feature_names_out())
# Fitted SVM weights; indexed below as coef[class_index][feature_index].
coef = svc.coef_

In [None]:
# For every class, print the 20 features carrying the largest weights in that
# class's coefficient row, from highest to lowest.
for i, class_label in enumerate(le.classes_):
    class_weights = coef[i]
    # argsort is ascending: reverse it and keep the first 20 positions.
    top_indices = np.argsort(class_weights)[::-1][:20]
    print(f"\n🔹 Top 20 từ quan trọng nhất cho lớp {class_label}:")
    for name, weight in zip(feature_names[top_indices], class_weights[top_indices]):
        print(f"{name}: {weight:.4f}")


🔹 Top 20 từ quan trọng nhất cho lớp Negative:
not happy: 2.5754
worst: 2.5582
no good: 2.5407
horrible: 2.4250
most uncomfortable: 2.4240
terrible: 2.4209
useless: 2.3836
wanted love: 2.3727
not money: 2.3262
not worth: 2.2765
going back: 2.2274
poor: 2.1171
suck: 2.0758
dangerous: 1.9878
not good: 1.9488
poorly: 1.9405
not work: 1.8566
very disappointed: 1.8183
june: 1.8129
tore: 1.7718

🔹 Top 20 từ quan trọng nhất cho lớp Neutral:
little disappointed: 2.1273
not most: 1.9852
okay not: 1.7281
top very: 1.6129
sure great: 1.5767
not bad: 1.5716
max: 1.5479
beautiful not: 1.5442
not cushion: 1.5123
wear quickly: 1.4947
kept anyway: 1.4782
cannot really: 1.4570
than typical: 1.4451
less comfortable: 1.4365
boot too: 1.4308
broke month: 1.4257
beautiful too: 1.4002
quiet: 1.3787
than most: 1.3633
kind narrow: 1.3628

🔹 Top 20 từ quan trọng nhất cho lớp Positive:
not hurt: 2.8708
not disappointed: 2.5812
never disappointed: 2.4299
go wrong: 2.3433
only issue: 2.1746
wonderful: 2.0991
favo

# In ra 20 trường hợp dự đoán sai

In [None]:
# Turn the integer codes back into the original label names for display.
y_test_labels = le.inverse_transform(np.asarray(y_test))
y_pred_labels = le.inverse_transform(np.asarray(y_pred))

# Plain lists so the texts can be addressed by row position, matching the
# positional indices computed below.
cleaned_texts = test_df['cleaned_text'].tolist()
no_stopwords_texts = test_df['no_stopwords'].tolist()

# Positions (not DataFrame index labels) where prediction and truth disagree.
wrong_indices = np.flatnonzero(np.asarray(y_test) != np.asarray(y_pred))

# Show the first 20 misclassified samples: text before and after stop-word
# removal, followed by the true and the predicted label.
for pos in wrong_indices[:20]:
    print(f"Mẫu trước khi loại bỏ Stopwords: {cleaned_texts[pos]}")
    print(f"Mẫu sau khi loại bỏ Stopwords: {no_stopwords_texts[pos]}")
    print(f"Nhãn thực sự: {y_test_labels[pos]}")
    print(f"Nhãn dự đoán: {y_pred_labels[pos]}")

Mẫu trước khi loại bỏ Stopwords: they are are not comfortable what so ever and i got blisters so i no longer wear them
Mẫu sau khi loại bỏ Stopwords: not comfortable so blisters so no longer wear
Nhãn thực sự: Neutral
Nhãn dự đoán: Negative
Mẫu trước khi loại bỏ Stopwords: bag came seeking of milder or mold
Mẫu sau khi loại bỏ Stopwords: bag came seeking milder mold
Nhãn thực sự: Negative
Nhãn dự đoán: Positive
Mẫu trước khi loại bỏ Stopwords: she is look amazing but run smaller than i was expecting also the sole of one of the she is was coming off i had to superglue it is not a problem but should not have not had to do that is with new she is
Mẫu sau khi loại bỏ Stopwords: look amazing run smaller than expecting sole coming off superglue not problem not not new
Nhãn thực sự: Neutral
Nhãn dự đoán: Negative
Mẫu trước khi loại bỏ Stopwords: very comfortable size was right looks good but the first season the material around the edge started to show ware strap came away from the sole neede