In [3]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import joblib
import os
from sklearn.model_selection import train_test_split
import numpy as np
df = pd.read_csv('../processed_data.csv')

X = df['text']
y = df['target']

nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/nt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
tokenized_texts = [word_tokenize(sentence.lower()) for sentence in X]


In [6]:
max_length = max(len(sentence) for sentence in tokenized_texts)
print("Maximum sentence length:", max_length)

Maximum sentence length: 21


In [7]:
w2v_model = Word2Vec(
    sentences=tokenized_texts, vector_size=300, window=21, min_count=1, sg=0
)

In [8]:
def vectorize_text(text, model):
    words = word_tokenize(text.lower())
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    return (
        np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model.vector_size)
    )

In [9]:
X_vectorized = np.array([vectorize_text(text, w2v_model) for text in X])

print("Vectorized data shape: ", X_vectorized.shape)
print("Number of classes: ", len(y.unique()))

Vectorized data shape:  (7593, 300)
Number of classes:  2


In [10]:
save_path = "../saved_models/word2vec"
os.makedirs(save_path, exist_ok=True)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

models = {
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "Bayes": GaussianNB(),  # Sử dụng mô hình Gaussian Naive Bayes vì Multinomial không hỗ trợ giá trị âm
    "Decision Tree": DecisionTreeClassifier(max_depth=50, min_samples_split=4, criterion='gini', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=100, min_samples_split=4, criterion='entropy'),
    "Logistic Regression": LogisticRegression(C=0.1, random_state=42, max_iter=1000),
    "SVM Linear": SVC(kernel='linear', C=1.0, random_state=42),
    "SVM Non-linear": SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
}

In [12]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

metrics = {model_name: {'accuracy': [], 'precision': [],
                        'recall': [], 'f1': []} for model_name in models.keys()}

for _ in range(20):
    X_train, X_test, y_train, y_test = train_test_split(
        X_vectorized, y, test_size=0.2, shuffle=True)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        if model_name == "Logistic Regression":
            y_prob = model.predict_proba(X_test)[:, 1]
            y_pred = (y_prob >= 0.5).astype(int)
        else:
            y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
        recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)

        metrics[model_name]['accuracy'].append(accuracy)
        metrics[model_name]['precision'].append(precision)
        metrics[model_name]['recall'].append(recall)
        metrics[model_name]['f1'].append(f1)

mean_metrics = {model_name: {metric: np.mean(values) for metric, values in model_metrics.items()}
                for model_name, model_metrics in metrics.items()}

for model_name, model in models.items():
    model_filename = os.path.join(save_path, f"{model_name}.joblib")
    joblib.dump(model, model_filename)
    print(f"Model {model_name} saved to {model_filename}")

for model_name, mean in mean_metrics.items():
    print(f"{model_name} Mean Metrics:")
    print(f"  Accuracy: {mean['accuracy']:.4f}")
    print(f"  Precision: {mean['precision']:.4f}")
    print(f"  Recall: {mean['recall']:.4f}")
    print(f"  F1 Score: {mean['f1']:.4f}\n")

Model KNN saved to ../saved_models/word2vec/KNN.joblib
Model Bayes saved to ../saved_models/word2vec/Bayes.joblib
Model Decision Tree saved to ../saved_models/word2vec/Decision Tree.joblib
Model Random Forest saved to ../saved_models/word2vec/Random Forest.joblib
Model Logistic Regression saved to ../saved_models/word2vec/Logistic Regression.joblib
Model SVM Linear saved to ../saved_models/word2vec/SVM Linear.joblib
Model SVM Non-linear saved to ../saved_models/word2vec/SVM Non-linear.joblib
KNN Mean Metrics:
  Accuracy: 0.6447
  Precision: 0.6350
  Recall: 0.4132
  F1 Score: 0.5003

Bayes Mean Metrics:
  Accuracy: 0.5280
  Precision: 0.4678
  Recall: 0.6919
  F1 Score: 0.5581

Decision Tree Mean Metrics:
  Accuracy: 0.6376
  Precision: 0.5800
  Recall: 0.5769
  F1 Score: 0.5783

Random Forest Mean Metrics:
  Accuracy: 0.7248
  Precision: 0.7744
  Recall: 0.5106
  F1 Score: 0.6151

Logistic Regression Mean Metrics:
  Accuracy: 0.5639
  Precision: 0.3957
  Recall: 0.0227
  F1 Score: 0.0