In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import joblib
import pandas as pd
import numpy as np

In [16]:
import joblib
import tensorflow as tf

# Load Word2Vec data
X_train_w2v = joblib.load('/content/drive/MyDrive/aap/X_train_w2v.joblib')
X_val_w2v = joblib.load('/content/drive/MyDrive/aap/X_val_w2v.joblib')
X_test_w2v = joblib.load('/content/drive/MyDrive/aap/X_test_w2v.joblib')


y_train_w2v = joblib.load('/content/drive/MyDrive/aap/y_train_w2v.joblib')
y_val_w2v = joblib.load('/content/drive/MyDrive/aap/y_val_w2v.joblib')


y_test = joblib.load('/content/drive/MyDrive/aap/y_test.joblib')


In [7]:
%%time
xgb_clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=400,
        max_depth=8,
        objective='multi:softmax',
        use_label_encoder=False,
        eval_metric='logloss',
        random_state = 42
    )

    # Train the model


xgb_clf.fit(X_train_w2v, y_train_w2v)

# Make predictions on validation and test sets
y_val_pred = xgb_clf.predict(X_val_w2v)
y_pred_w2v = xgb_clf.predict(X_test_w2v)

# Calculate validation accuracy
val_accuracy = accuracy_score(y_val_w2v, y_val_pred)
# Print the validation accuracy
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

accuracy_cv = accuracy_score(y_test, y_pred_w2v)
report_cv = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for word2vec: {accuracy_cv * 100:.2f}%")

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 76.66%
Test Accuracy for word2vec: 90.20%
CPU times: user 8min 55s, sys: 565 ms, total: 8min 55s
Wall time: 1min 11s


we can find this model is trained very fast compared to other models, what's more the dimension is only 100 compared with the 2-gram cv and tfidf,
so i want to improve my data using data-centric methods

## using cross avlidation

In [26]:
X_train_w2v_cv = np.concatenate([X_train_w2v,X_val_w2v],axis = 0)
y_train_w2v_cv = np.concatenate([y_train_w2v,y_val_w2v],axis = 0)

In [25]:
np.concatenate([X_train_w2v,X_val_w2v],axis = 0).shape

(69491, 100)

In [30]:
## using cross validation for training process
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(xgb_clf, X_train_w2v_cv, y_train_w2v_cv, cv=5, scoring='accuracy')
print("Average cross-validation accuracy:", cv_scores.mean())


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Average cross-validation accuracy: 0.7684160458168753


we can see there is a slightly improvement in the validation set

In [31]:
y_val_pred = xgb_clf.predict(X_val_w2v)
y_pred_w2v = xgb_clf.predict(X_test_w2v)

# Calculate validation accuracy
val_accuracy = accuracy_score(y_val_w2v, y_val_pred)
# Print the validation accuracy
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

accuracy_cv = accuracy_score(y_test, y_pred_w2v)
report_cv = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for word2vec: {accuracy_cv * 100:.2f}%")

Validation Accuracy: 76.78%
Test Accuracy for word2vec: 90.20%


## add regularization

In [32]:
xgb_clf = XGBClassifier(
    learning_rate=0.1,
    n_estimators=400,
    max_depth=8,
    objective='multi:softmax',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state = 42
    reg_alpha=0.5,      # L1 regularization term
    reg_lambda=1.5      # L2 regularization term
)

# Train the model
xgb_clf.fit(X_train_w2v, y_train_w2v)

# Make predictions on validation and test sets
y_val_pred = xgb_clf.predict(X_val_w2v)
y_pred_w2v = xgb_clf.predict(X_test_w2v)

# Calculate and print the validation accuracy
val_accuracy = accuracy_score(y_val_w2v, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


accuracy_cv = accuracy_score(y_test, y_pred_w2v)
report_cv = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for word2vec: {accuracy_cv * 100:.2f}%")



Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 76.78%
Test Accuracy for word2vec: 90.20%


## dealing with unbalanced

In [33]:
from imblearn.over_sampling import SMOTE    # using smote to deal with the imbalance


# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_smote, y_train_smote = smote.fit_resample(X_train_w2v, y_train_w2v)

# Define the XGBoost classifier
xgb_clf = XGBClassifier(
    learning_rate=0.1,
    n_estimators=400,
    max_depth=8,
    random_state = 42
    objective='multi:softmax',
    use_label_encoder=False,
    eval_metric='logloss',
    reg_alpha=0.5,      # L1 regularization term
    reg_lambda=1.5      # L2 regularization term
)

# Train the model using the resampled training data
xgb_clf.fit(X_train_smote, y_train_smote)

# Make predictions on validation and test sets
y_val_pred = xgb_clf.predict(X_val_w2v)
y_pred_w2v = xgb_clf.predict(X_test_w2v)

# Calculate and print the validation accuracy
val_accuracy = accuracy_score(y_val_w2v, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# Calculate test accuracy and print classification report
accuracy_cv = accuracy_score(y_test, y_pred_w2v)
report_cv = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for word2vec: {accuracy_cv * 100:.2f}%")
print("\nClassification Report:\n", report_cv)


Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 76.43%
Test Accuracy for word2vec: 90.10%

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.88       172
           1       0.92      0.92      0.92       266
           2       0.91      0.88      0.89       285
           3       0.89      0.92      0.91       277

    accuracy                           0.90      1000
   macro avg       0.90      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000



## normalization for the data

In [34]:
from sklearn.preprocessing import StandardScaler

# Initialize the Standard Scaler
scaler = StandardScaler()

# Fit the scaler only on the training data and transform both training and validation/test data
X_train_smote = scaler.fit_transform(X_train_smote)
X_val_w2v = scaler.transform(X_val_w2v)
X_test_w2v = scaler.transform(X_test_w2v)

# Now, you can proceed to train the model as before
xgb_clf.fit(X_train_smote, y_train_smote)

# Make predictions and evaluate the model
y_val_pred = xgb_clf.predict(X_val_w2v)
y_pred_w2v = xgb_clf.predict(X_test_w2v)

# Calculate and print the validation accuracy
val_accuracy = accuracy_score(y_val_w2v, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# Calculate test accuracy and print classification report
accuracy_cv = accuracy_score(y_test, y_pred_w2v)
report_cv = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for word2vec: {accuracy_cv * 100:.2f}%")
print("\nClassification Report:\n", report_cv)


Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 76.68%
Test Accuracy for word2vec: 90.30%

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.88       172
           1       0.92      0.92      0.92       266
           2       0.91      0.90      0.90       285
           3       0.90      0.90      0.90       277

    accuracy                           0.90      1000
   macro avg       0.90      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000



we can see there is a slightly improvement from our dataset now, from 90.1% to 90.3%, and make our model more robust
