## **1. Import các thư viện cần thiết**

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import nltk
import unidecode

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **2. Tiền xử lý**

In [2]:
# English stop words from NLTK. The list is kept under its original name; the
# set copy gives O(1) membership tests inside text_normalize.
english_stop_words = stopwords.words('english')
_english_stop_word_set = set(english_stop_words)
stemmer = PorterStemmer()  # Single stemmer instance reused for every token


def text_normalize(text):
    """Normalize a raw text string before vectorization.

    Pipeline: lowercase -> transliterate to ASCII -> strip punctuation ->
    remove English stop words -> Porter stemming.

    Args:
        text: The raw input string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).
    """
    text = text.lower()
    text = unidecode.unidecode(text)  # Transliterate to ASCII

    # Remove punctuation but keep apostrophes for now: the stop-word list
    # contains contractions such as "don't", which can only match while the
    # apostrophe is still present.
    text = re.sub(r"[^\w\s']", '', text)

    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty tokens, unlike split(' ').
    tokens = [tok for tok in text.split() if tok not in _english_stop_word_set]

    # Now drop the apostrophes and filter once more, so that quoted words such
    # as "'the'" are still recognised as stop words and empty tokens vanish.
    tokens = [tok.replace("'", '') for tok in tokens]
    tokens = [tok for tok in tokens if tok and tok not in _english_stop_word_set]

    return ' '.join(stemmer.stem(tok) for tok in tokens)

In [3]:
# Tunable constants. LR is passed to the XGBoost classifier below; the other
# values are not referenced by the cells visible in this notebook.
BATCH_SIZE = 128
LR = 1e-1
MAX_SEQ_LEN = 128
MAX_FEATURES = 5000
EMBEDDING_DIMS = 64

# Locations of the three dataset splits, relative to the working directory.
train_filepath = "dataset/train.csv"
val_filepath = "dataset/val.csv"
test_filepath = "dataset/test.csv"

# Read each split, using the first CSV column as the DataFrame index.
train_df, val_df, test_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_filepath, val_filepath, test_filepath)
]

In [4]:
# Normalize the tweet text of every split (the 'Tweet' column is overwritten)
# and make sure the result is stored as strings.
for split_df in (train_df, val_df, test_df):
    split_df['Tweet'] = split_df['Tweet'].apply(text_normalize).astype(str)

# Every column from the third one onward is treated as a label column.
label_columns = train_df.columns[2:]
class_lst = np.array(label_columns)
n_classes = class_lst.shape[0]

# For each split: the tweet texts as a NumPy array and the label columns as
# an integer matrix.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (frame['Tweet'].to_numpy(), frame[class_lst].astype('int').to_numpy())
    for frame in (train_df, val_df, test_df)
)

In [5]:
# Fit the TF-IDF vocabulary on the training tweets only.
# The text is read from train_df rather than from X_train: a later cell
# rebinds X_train to a sparse TF-IDF matrix, which has no .tolist(), so reading
# from the DataFrame keeps this cell safe to re-run.
corpus = train_df['Tweet'].tolist()
vectorizer = TfidfVectorizer().fit(corpus)

In [6]:
# Convert the normalized tweets of each split into TF-IDF feature matrices.
# The text is taken from the DataFrames instead of from X_train/X_val/X_test so
# the cell is idempotent: re-running it no longer feeds already-transformed
# sparse matrices back into the vectorizer.
X_train = vectorizer.transform(train_df['Tweet'].to_numpy())
X_val = vectorizer.transform(val_df['Tweet'].to_numpy())
X_test = vectorizer.transform(test_df['Tweet'].to_numpy())

In [7]:
def inverse_label(class_lst, onehot_label):
    """Return the class names whose entry in a multi-hot vector is positive.

    Both arguments are coerced with ``np.asarray`` so plain lists and tuples
    work as well as NumPy arrays.

    Args:
        class_lst: 1-D sequence of class names.
        onehot_label: 1-D sequence of numbers with the same length as
            ``class_lst``; entries greater than 0 mark the active classes.

    Returns:
        A NumPy array containing the selected class names, in their original
        order.
    """
    class_names = np.asarray(class_lst)
    active_mask = np.asarray(onehot_label) > 0

    return class_names[active_mask]

## **3. Xây dựng mô hình**

In [8]:
# Base estimator: an XGBoost classifier with a binary logistic objective,
# using the learning rate from the configuration cell.
xgboost_model = XGBClassifier(
    verbosity=2,
    learning_rate=LR,
    objective='binary:logistic',
)

# Wrap the base estimator so it can be trained on a multi-column label matrix.
xgboost_multilabel_model = MultiOutputClassifier(estimator=xgboost_model)

## **4. Huấn luyện và đánh giá**

In [None]:
# Train the multi-output model on the TF-IDF training features (X_train) and
# the integer label matrix (y_train).
xgboost_multilabel_model.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out test split.
# NOTE(review): per the scikit-learn docs, MultiOutputClassifier.score reports
# the fraction of samples whose labels are ALL predicted correctly (exact-match
# accuracy) -- confirm this is the intended metric.
xgboost_multilabel_model.score(X_test, y_test)