In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import joblib

# Load Count Vectorizer data
X_train_cv = joblib.load('/content/drive/MyDrive/X_train_cv.joblib')
X_val_cv = joblib.load('/content/drive/MyDrive/X_val_cv.joblib')
y_train_cv = joblib.load('/content/drive/MyDrive/y_train_cv.joblib')
y_val_cv = joblib.load('/content/drive/MyDrive/y_val_cv.joblib')
X_test_cv = joblib.load('/content/drive/MyDrive/X_test_cv.joblib')
cv = joblib.load('/content/drive/MyDrive/count_vectorizer.joblib')

# Load TFIDF Vectorizer data
X_train_tfidf = joblib.load('/content/drive/MyDrive/X_train_tfidf.joblib')
X_val_tfidf = joblib.load('/content/drive/MyDrive/X_val_tfidf.joblib')
y_train_tfidf = joblib.load('/content/drive/MyDrive/y_train_tfidf.joblib')
y_val_tfidf = joblib.load('/content/drive/MyDrive/y_val_tfidf.joblib')
X_test_tfidf = joblib.load('/content/drive/MyDrive/X_test_tfidf.joblib')
tfidf = joblib.load('/content/drive/MyDrive/tfidf_vectorizer.joblib')

# Load Word2Vec data
X_train_w2v = joblib.load('/content/drive/MyDrive/X_train_w2v.joblib')
X_val_w2v = joblib.load('/content/drive/MyDrive/X_val_w2v.joblib')
y_train_w2v = joblib.load('/content/drive/MyDrive/y_train_w2v.joblib')
y_val_w2v = joblib.load('/content/drive/MyDrive/y_val_w2v.joblib')
X_test_w2v = joblib.load('/content/drive/MyDrive/X_test_w2v.joblib')
w2v = joblib.load('/content/drive/MyDrive/word2vec_model.joblib')

# Load y_test
y_test = joblib.load('/content/drive/MyDrive/y_test.joblib')

In [5]:
from sklearn.model_selection import ParameterGrid
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import joblib

# Define the hyperparameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
}

# Function to run hypertuning for each vectorizer and return the best XGBoost model
def tune_and_run_xgb_model(X_train, y_train, X_val, y_val, X_test, vectorizer_name):
    param_combinations = list(ParameterGrid(param_grid))  # Get all parameter combinations
    best_score = 0
    best_params = None
    best_xgb = None

    # Progress bar using tqdm to track tuning process
    for params in tqdm(param_combinations, desc=f"Tuning XGBoost for {vectorizer_name}", unit="combination"):
        # Initialize XGBoost with current hyperparameters
        xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, **params)

        # Fit the XGBoost model
        xgb.fit(X_train, y_train)

        # Make predictions on validation data
        y_pred_val = xgb.predict(X_val)

        # Evaluate the model
        accuracy = accuracy_score(y_val, y_pred_val)

        # Keep track of the best model based on validation accuracy
        if accuracy > best_score:
            best_score = accuracy
            best_params = params
            best_xgb = xgb

    # Print the best results for the current vectorizer
    print(f"\nBest model for {vectorizer_name}:")
    print(f"Best Validation Accuracy: {best_score * 100:.2f}%")
    print(f"Best Hyperparameters: {best_params}")

    # Now use the best model to predict on the test data
    y_pred_test = best_xgb.predict(X_test)

    # Save the best model's predictions for the test set

    return best_xgb, best_score, best_params, y_pred_test


In [6]:

# Run the hypertuning process for Count Vectorizer
best_xg_cv, best_score_cv, best_params_cv, y_pred_cv = tune_and_run_xgb_model(
    X_train_cv, y_train_cv, X_val_cv, y_val_cv, X_test_cv, "Count_Vectorizer"
)



# Print out the best results for each model
print(f"Best KNN model for Count Vectorizer: {best_params_cv}, Validation Accuracy: {best_score_cv * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Tuning XGBoost for Count_Vectorizer: 100%|██████████| 18/18 [13:26<00:00, 44.80s/c


Best model for Count_Vectorizer:
Best Validation Accuracy: 65.08%
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
Best KNN model for Count Vectorizer: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}, Validation Accuracy: 65.08%





In [7]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming y_test is your true test labels and y_pred is your predicted test labels
# Replace `y_test` with your actual test labels variable

# Example: Test Accuracy and Classification Report for Count Vectorizer
print("\n--- Count Vectorizer Test Set Evaluation ---")
accuracy_cv = accuracy_score(y_test, y_pred_cv)
report_cv = classification_report(y_test, y_pred_cv)

print(f"Test Accuracy for Count Vectorizer: {accuracy_cv * 100:.2f}%")
print("Classification Report:")
print(report_cv)






--- Count Vectorizer Test Set Evaluation ---
Test Accuracy for Count Vectorizer: 72.90%
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.56      0.66       172
           1       0.63      0.87      0.73       266
           2       0.80      0.65      0.72       285
           3       0.76      0.78      0.77       277

    accuracy                           0.73      1000
   macro avg       0.75      0.71      0.72      1000
weighted avg       0.75      0.73      0.73      1000



In [8]:

# Run the hypertuning process for TFIDF
best_xg_tfidf, best_score_tfidf, best_params_tfidf, y_pred_tfidf = tune_and_run_xgb_model(
    X_train_tfidf, y_train_tfidf, X_val_tfidf, y_val_tfidf, X_test_tfidf, "TFIDF"
)


print(f"Best KNN model for TFIDF: {best_params_tfidf}, Validation Accuracy: {best_score_tfidf * 100:.2f}%")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Tuning XGBoost for TFIDF: 100%|██████████| 18/18 [2:37:35<00:00, 525.29s/combinati


Best model for TFIDF:
Best Validation Accuracy: 64.14%
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
Best KNN model for TFIDF: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}, Validation Accuracy: 64.14%





In [9]:
# Example: Test Accuracy and Classification Report for TFIDF
print("\n--- TFIDF Vectorizer Test Set Evaluation ---")
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
report_tfidf = classification_report(y_test, y_pred_tfidf)

print(f"Test Accuracy for TFIDF: {accuracy_tfidf * 100:.2f}%")
print("Classification Report:")
print(report_tfidf)


--- TFIDF Vectorizer Test Set Evaluation ---
Test Accuracy for TFIDF: 76.10%
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.60      0.72       172
           1       0.68      0.90      0.78       266
           2       0.80      0.71      0.75       285
           3       0.78      0.78      0.78       277

    accuracy                           0.76      1000
   macro avg       0.79      0.75      0.76      1000
weighted avg       0.78      0.76      0.76      1000



In [10]:

# Run the hypertuning process for Word2Vec
best_xg_w2v, best_score_w2v, best_params_w2v, y_pred_w2v = tune_and_run_xgb_model(
    X_train_w2v, y_train_w2v, X_val_w2v, y_val_w2v, X_test_w2v, "Word2Vec"
)

print(f"Best KNN model for Word2Vec: {best_params_w2v}, Validation Accuracy: {best_score_w2v * 100:.2f}%")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Tuning XGBoost for Word2Vec: 100%|██████████| 18/18 [18:43<00:00, 62.42s/combinati


Best model for Word2Vec:
Best Validation Accuracy: 72.57%
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
Best KNN model for Word2Vec: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}, Validation Accuracy: 72.57%





In [11]:

# Example: Test Accuracy and Classification Report for Word2Vec
print("\n--- Word2Vec Test Set Evaluation ---")
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
report_w2v = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for Word2Vec: {accuracy_w2v * 100:.2f}%")
print("Classification Report:")
print(report_w2v)


--- Word2Vec Test Set Evaluation ---
Test Accuracy for Word2Vec: 88.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       172
           1       0.88      0.92      0.90       266
           2       0.89      0.84      0.86       285
           3       0.87      0.90      0.89       277

    accuracy                           0.88      1000
   macro avg       0.88      0.88      0.88      1000
weighted avg       0.88      0.88      0.88      1000



This time, we can see word2vec embedding outperformed!!! then let's do more hypertuning because every time we have 'max_depth': 7, 'n_estimators': 300 as our best model,
So I think we need to increase the depth and the number of the trees

In [14]:
xgb_clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=400,
        max_depth=8,
        objective='multi:softmax',
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Train the model


xgb_clf.fit(X_train_w2v, y_train_w2v)

# Make predictions on validation and test sets
y_val_pred = xgb_clf.predict(X_val_w2v)
y_pred_w2v = xgb_clf.predict(X_test_w2v)

# Calculate validation accuracy
val_accuracy = accuracy_score(y_val_w2v, y_val_pred)
report_cv = classification_report(y_val_w2v, y_val_w2v)
# Print the validation accuracy
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

accuracy_cv = accuracy_score(y_test, y_pred_w2v)
report_cv = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for word2vec: {accuracy_cv * 100:.2f}%")
print("Classification Report:")
print(report_cv)


Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 76.66%
Test Accuracy for word2vec: 90.20%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.85      0.88       172
           1       0.91      0.93      0.92       266
           2       0.91      0.88      0.90       285
           3       0.89      0.92      0.91       277

    accuracy                           0.90      1000
   macro avg       0.90      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000



Finally we reached 90% in the last, and next week, Our group is going to use the neural networks for a higher value.

Looks like count vectorizer embedding is the best!