In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import joblib
import tensorflow as tf

# Load Count Vectorizer data
X_train_cv = joblib.load('/content/drive/MyDrive/X_train_cv.joblib')
X_val_cv = joblib.load('/content/drive/MyDrive/X_val_cv.joblib')
X_test_cv = joblib.load('/content/drive/MyDrive/X_test_cv.joblib')

# Check if the data is sparse, then convert it to dense using .toarray()
if hasattr(X_train_cv, "toarray"):
    X_train_cv = X_train_cv.toarray()

if hasattr(X_val_cv, "toarray"):
    X_val_cv = X_val_cv.toarray()

if hasattr(X_test_cv, "toarray"):
    X_test_cv = X_test_cv.toarray()

# Load the labels and other data
y_train_cv = joblib.load('/content/drive/MyDrive/y_train_cv.joblib')
y_val_cv = joblib.load('/content/drive/MyDrive/y_val_cv.joblib')

# Load TFIDF Vectorizer data
X_train_tfidf = joblib.load('/content/drive/MyDrive/X_train_tfidf.joblib')
X_val_tfidf = joblib.load('/content/drive/MyDrive/X_val_tfidf.joblib')
X_test_tfidf = joblib.load('/content/drive/MyDrive/X_test_tfidf.joblib')

# Convert if needed
if hasattr(X_train_tfidf, "toarray"):
    X_train_tfidf = X_train_tfidf.toarray()

if hasattr(X_val_tfidf, "toarray"):
    X_val_tfidf = X_val_tfidf.toarray()

if hasattr(X_test_tfidf, "toarray"):
    X_test_tfidf = X_test_tfidf.toarray()


y_train_tfidf = joblib.load('/content/drive/MyDrive/y_train_tfidf.joblib')
y_val_tfidf = joblib.load('/content/drive/MyDrive/y_val_tfidf.joblib')


# Load Word2Vec data
X_train_w2v = joblib.load('/content/drive/MyDrive/X_train_w2v.joblib')
X_val_w2v = joblib.load('/content/drive/MyDrive/X_val_w2v.joblib')
X_test_w2v = joblib.load('/content/drive/MyDrive/X_test_w2v.joblib')

# Word2Vec data should already be dense, but you can check and convert if needed
if hasattr(X_train_w2v, "toarray"):
    X_train_w2v = X_train_w2v.toarray()

if hasattr(X_val_w2v, "toarray"):
    X_val_w2v = X_val_w2v.toarray()

if hasattr(X_test_w2v, "toarray"):
    X_test_w2v = X_test_w2v.toarray()

y_train_w2v = joblib.load('/content/drive/MyDrive/y_train_w2v.joblib')
y_val_w2v = joblib.load('/content/drive/MyDrive/y_val_w2v.joblib')


# Load the rest of your data such as vectorizers and models as usual
cv = joblib.load('/content/drive/MyDrive/count_vectorizer.joblib')
tfidf = joblib.load('/content/drive/MyDrive/tfidf_vectorizer.joblib')
w2v = joblib.load('/content/drive/MyDrive/word2vec_model.joblib')

y_test = joblib.load('/content/drive/MyDrive/y_test.joblib')


In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


In [4]:
# tf.config.optimizer.set_jit(False)  # Disable XLA JIT compilation


In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [6]:

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


## MLP models for 3 embedings

In [7]:
## since we got the best params before we can derectly use them

In [8]:
X_train_cv.shape

(55592, 22126)

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import joblib

# Define the hyperparameter grid
param_grid = {
    'n_neighbors': [7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Function to run hypertuning for each vectorizer and return the best model
def tune_and_run_knn_model(X_train, y_train, X_val, y_val, X_test, vectorizer_name):
    param_combinations = list(ParameterGrid(param_grid))  # Get all parameter combinations
    best_score = 0
    best_params = None
    best_knn = None

    # Progress bar using tqdm to track tuning process
    for params in tqdm(param_combinations, desc=f"Tuning KNN for {vectorizer_name}", unit="combination"):
        # Initialize KNN with current hyperparameters
        knn = KNeighborsClassifier(**params)

        # Fit the KNN model
        knn.fit(X_train, y_train)

        # Make predictions on validation data
        y_pred_val = knn.predict(X_val)

        # Evaluate the model
        accuracy = accuracy_score(y_val, y_pred_val)

        # Keep track of the best model based on validation accuracy
        if accuracy > best_score:
            best_score = accuracy
            best_params = params
            best_knn = knn

    # Print the best results for the current vectorizer
    print(f"\nBest model for {vectorizer_name}:")
    print(f"Best Validation Accuracy: {best_score * 100:.2f}%")
    print(f"Best Hyperparameters: {best_params}")

    # Now use the best model to predict on the test data
    y_pred_test = best_knn.predict(X_test)

    print(f"Test predictions saved for {vectorizer_name} KNN model.\n")

    return best_knn, best_score, best_params

In [2]:

# Run the hypertuning process for Count Vectorizer
best_knn_cv, best_score_cv, best_params_cv = tune_and_run_knn_model(
    X_train_cv, y_train_cv, X_val_cv, y_val_cv, X_test_cv, "Count_Vectorizer"
)



# Print out the best results for each model
print(f"Best KNN model for Count Vectorizer: {best_params_cv}, Validation Accuracy: {best_score_cv * 100:.2f}%")


Best model for Count_Vectorizer:
Best Validation Accuracy: 89.96%
Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Test predictions saved for Count_Vectorizer KNN model.

Best KNN model for Count Vectorizer: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Validation Accuracy: 89.96%.


In [3]:

# Run the hypertuning process for TFIDF
best_knn_tfidf, best_score_tfidf, best_params_tfidf = tune_and_run_knn_model(
    X_train_tfidf, y_train_tfidf, X_val_tfidf, y_val_tfidf, X_test_tfidf, "TFIDF"
)


print(f"Best KNN model for TFIDF: {best_params_tfidf}, Validation Accuracy: {best_score_tfidf * 100:.2f}%")

Best model for TFIDF:
Best Validation Accuracy: 86.51%
Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Test predictions saved for TFIDF KNN model.

Best KNN model for TFIDF: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Validation Accuracy: 86.51%.


In [4]:

# Run the hypertuning process for Word2Vec
best_knn_w2v, best_score_w2v, best_params_w2v = tune_and_run_knn_model(
    X_train_w2v, y_train_w2v, X_val_w2v, y_val_w2v, X_test_w2v, "Word2Vec"
)

print(f"Best KNN model for Word2Vec: {best_params_w2v}, Validation Accuracy: {best_score_w2v * 100:.2f}%")

Best model for Word2Vec:
Best Validation Accuracy: 82.32%
Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Test predictions saved for Word2Vec KNN model.

Best KNN model for Word2Vec: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Validation Accuracy: 82.32%.


## XGB models for 3 embeddings

In [None]:
from sklearn.model_selection import ParameterGrid
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import joblib

# Define the hyperparameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
}

# Function to run hypertuning for each vectorizer and return the best XGBoost model
def tune_and_run_xgb_model(X_train, y_train, X_val, y_val, X_test, vectorizer_name):
    param_combinations = list(ParameterGrid(param_grid))  # Get all parameter combinations
    best_score = 0
    best_params = None
    best_xgb = None

    # Progress bar using tqdm to track tuning process
    for params in tqdm(param_combinations, desc=f"Tuning XGBoost for {vectorizer_name}", unit="combination"):
        # Initialize XGBoost with current hyperparameters
        xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, **params)

        # Fit the XGBoost model
        xgb.fit(X_train, y_train)

        # Make predictions on validation data
        y_pred_val = xgb.predict(X_val)

        # Evaluate the model
        accuracy = accuracy_score(y_val, y_pred_val)

        # Keep track of the best model based on validation accuracy
        if accuracy > best_score:
            best_score = accuracy
            best_params = params
            best_xgb = xgb

    # Print the best results for the current vectorizer
    print(f"\nBest model for {vectorizer_name}:")
    print(f"Best Validation Accuracy: {best_score * 100:.2f}%")
    print(f"Best Hyperparameters: {best_params}")

    # Now use the best model to predict on the test data
    y_pred_test = best_xgb.predict(X_test)

    # Save the best model's predictions for the test set

    return best_xgb, best_score, best_params, y_pred_test


In [5]:

# Run the hypertuning process for Count Vectorizer
best_xg_cv, best_score_cv, best_params_cv, y_pred_cv = tune_and_run_xgb_model(
    X_train_cv, y_train_cv, X_val_cv, y_val_cv, X_test_cv, "Count_Vectorizer"
)



# Print out the best results for each model
print(f"Best KNN model for Count Vectorizer: {best_params_cv}, Validation Accuracy: {best_score_cv * 100:.2f}%")


Best model for Count_Vectorizer:
Best Validation Accuracy: 65.08%
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
Best KNN model for Count Vectorizer: {{'learning_rate': {best_learning_rate_cv}, 'max_depth': {best_max_depth_cv}, 'n_estimators': {best_n_estimators_cv}}}, Validation Accuracy: {best_validation_accuracy_cv}%.


In [6]:

# Run the hypertuning process for TFIDF
best_xg_tfidf, best_score_tfidf, best_params_tfidf, y_pred_tfidf = tune_and_run_xgb_model(
    X_train_tfidf, y_train_tfidf, X_val_tfidf, y_val_tfidf, X_test_tfidf, "TFIDF"
)


print(f"Best KNN model for TFIDF: {best_params_tfidf}, Validation Accuracy: {best_score_tfidf * 100:.2f}%")

Best model for TFIDF:
Best Validation Accuracy: 64.14%
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
Best KNN model for TFIDF: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}, Validation Accuracy: 64.14%.


In [10]:
xgb_clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=400,
        max_depth=8,
        objective='multi:softmax',
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Train the model


xgb_clf.fit(X_train_w2v, y_train_w2v)

# Make predictions on validation and test sets
y_val_pred = xgb_clf.predict(X_val_w2v)
y_pred_w2v = xgb_clf.predict(X_test_w2v)

# Calculate validation accuracy
val_accuracy = accuracy_score(y_val_w2v, y_val_pred)
# Print the validation accuracy
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

accuracy_cv = accuracy_score(y_test, y_pred_w2v)
report_cv = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for word2vec: {accuracy_cv * 100:.2f}%")


Validation Accuracy: 76.66%
Test Accuracy for Word2Vec: 90.2%


## MLP models for 3 embedingsx

In [None]:
with tf.device('/GPU:0'):


  # Define the model
  model = Sequential()

  # Input layer
  model.add(Dense(1024, activation='relu', input_shape=(X_train_cv.shape[1],)))

  # Hidden layer 1
  model.add(Dense(256, activation='relu'))

  # Dropout to avoid overfitting
  model.add(Dropout(0.5))

  # Hidden layer 2
  model.add(Dense(64, activation='relu'))

  model.add(Dropout(0.5))

  model.add(Dense(16, activation='relu'))


  # Output layer (Assuming multi-class classification with softmax)
  model.add(Dense(4, activation='softmax'))

  # Compile the model
  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

  # Train the model
  history = model.fit(X_train_cv, y_train_cv,
                      epochs=5,
                      batch_size=32,
                      validation_data=(X_val_cv, y_val_cv))

  # Evaluate the model on test data
  test_loss, test_acc = model.evaluate(X_test_cv, y_test)
  print(f"Test accuracy: {test_acc * 100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 14ms/step - accuracy: 0.5630 - loss: 1.0408 - val_accuracy: 0.8719 - val_loss: 0.3739
Epoch 2/5
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9333 - loss: 0.2096 - val_accuracy: 0.9205 - val_loss: 0.2586
Epoch 3/5
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9737 - loss: 0.0828 - val_accuracy: 0.9301 - val_loss: 0.2698
Epoch 4/5
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9816 - loss: 0.0573 - val_accuracy: 0.9272 - val_loss: 0.3342
Epoch 5/5
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9847 - loss: 0.0463 - val_accuracy: 0.9214 - val_loss: 0.3953
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9631 - loss: 0.2455
Test accuracy: 96.50%


In [None]:
  # Define the model
  model = Sequential()

  # Input layer
  model.add(Dense(1024, activation='relu', input_shape=(X_train_tfidf.shape[1],)))

  # Hidden layer 1
  model.add(Dense(256, activation='relu'))

  # Dropout to avoid overfitting
  model.add(Dropout(0.5))

  # Hidden layer 2
  model.add(Dense(64, activation='relu'))

  model.add(Dropout(0.5))

  model.add(Dense(16, activation='relu'))


  # Output layer (Assuming multi-class classification with softmax)
  model.add(Dense(4, activation='softmax'))

  # Compile the model
  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

  # Train the model
  history = model.fit(X_train_tfidf, y_train_tfidf,
                      epochs=4,
                      batch_size=32,
                      validation_data=(X_val_tfidf, y_val_tfidf))

  # Evaluate the model on test data
  test_loss, test_acc = model.evaluate(X_test_tfidf, y_test)
  print(f"Test accuracy: {test_acc * 100:.2f}%")

Epoch 1/4
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 13ms/step - accuracy: 0.5639 - loss: 1.0269 - val_accuracy: 0.8755 - val_loss: 0.3518
Epoch 2/4
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9320 - loss: 0.2176 - val_accuracy: 0.9188 - val_loss: 0.2513
Epoch 3/4
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9721 - loss: 0.0845 - val_accuracy: 0.9275 - val_loss: 0.2636
Epoch 4/4
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9808 - loss: 0.0561 - val_accuracy: 0.9258 - val_loss: 0.3030
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9728 - loss: 0.1536
Test accuracy: 97.30%


In [None]:
## here is the best score until now

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model
model = Sequential()

# Input layer
model.add(Dense(256, activation='relu', input_shape=(X_train_w2v.shape[1],)))

# Dropout to avoid overfitting
model.add(Dropout(0.5))

# Hidden layer 2
model.add(Dense(64, activation='relu'))

model.add(Dropout(0.5))

model.add(Dense(16, activation='relu'))


# Output layer (Assuming multi-class classification with softmax)
model.add(Dense(4, activation='softmax'))

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_w2v, y_train_w2v,
                    epochs=100,
                    batch_size=32,
                    validation_data=(X_val_w2v, y_val_w2v))

# Evaluate the model on test data
test_loss, test_acc = model.evaluate(X_test_w2v, y_test)
print(f"Test accuracy: {test_acc * 100:.2f}%")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.4413 - loss: 1.2532 - val_accuracy: 0.5064 - val_loss: 1.1458
Epoch 2/100
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.4913 - loss: 1.1756 - val_accuracy: 0.5169 - val_loss: 1.1300
Epoch 3/100
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.5014 - loss: 1.1640 - val_accuracy: 0.5210 - val_loss: 1.1200
Epoch 4/100
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.5071 - loss: 1.1510 - val_accuracy: 0.5185 - val_loss: 1.1209
Epoch 5/100
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5065 - loss: 1.1460 - val_accuracy: 0.5241 - val_loss: 1.1101
Epoch 6/100
[1m1738/1738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5111 - loss: 1.1414 - val_accuracy: 0.5295 - val_loss: 1.1089
Epoch 7/10

# here is the best score until now
## tfidf with Multi-Perceptron Neural Network with accuracy 97.40