In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the saved joblib artifacts under
# /content/drive/MyDrive can be read (and predictions written) from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import joblib

# Single place to change if the artifacts move; every load below is relative to it.
DATA_DIR = '/content/drive/MyDrive'


def _load_artifact(filename):
    """Load one joblib artifact stored directly inside ``DATA_DIR``.

    Args:
        filename: File name (including the ``.joblib`` extension) inside ``DATA_DIR``.

    Returns:
        Whatever object was serialized into that file.
    """
    # NOTE(security): joblib.load is pickle-based and can execute arbitrary code
    # while deserializing -- only load files that you produced yourself.
    return joblib.load(f'{DATA_DIR}/{filename}')


# Load Count Vectorizer data
X_train_cv = _load_artifact('X_train_cv.joblib')
X_val_cv = _load_artifact('X_val_cv.joblib')
y_train_cv = _load_artifact('y_train_cv.joblib')
y_val_cv = _load_artifact('y_val_cv.joblib')
X_test_cv = _load_artifact('X_test_cv.joblib')
cv = _load_artifact('count_vectorizer.joblib')

# Load TFIDF Vectorizer data
X_train_tfidf = _load_artifact('X_train_tfidf.joblib')
X_val_tfidf = _load_artifact('X_val_tfidf.joblib')
y_train_tfidf = _load_artifact('y_train_tfidf.joblib')
y_val_tfidf = _load_artifact('y_val_tfidf.joblib')
X_test_tfidf = _load_artifact('X_test_tfidf.joblib')
tfidf = _load_artifact('tfidf_vectorizer.joblib')

# Load Word2Vec data
X_train_w2v = _load_artifact('X_train_w2v.joblib')
X_val_w2v = _load_artifact('X_val_w2v.joblib')
y_train_w2v = _load_artifact('y_train_w2v.joblib')
y_val_w2v = _load_artifact('y_val_w2v.joblib')
X_test_w2v = _load_artifact('X_test_w2v.joblib')
w2v = _load_artifact('word2vec_model.joblib')

# Load y_test
y_test = _load_artifact('y_test.joblib')

In [5]:
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import joblib

# Search space for the KNN grid search: 5 * 2 * 2 = 20 combinations in total.
param_grid = {
    # Number of neighbours consulted per prediction: 3, 5, 7, 9, 11
    'n_neighbors': list(range(3, 12, 2)),
    # Equal votes vs. votes weighted by inverse distance
    'weights': ['uniform', 'distance'],
    # Distance function used to rank neighbours
    'metric': ['euclidean', 'manhattan'],
}

# Function to run hypertuning for each vectorizer and return the best model
def tune_and_run_knn_model(X_train, y_train, X_val, y_val, X_test, vectorizer_name):
    """Grid-search KNN hyperparameters on a validation split and save test predictions.

    Every combination from the module-level ``param_grid`` is fitted on the
    training data and scored by accuracy on the validation data. The fitted
    model with the highest validation accuracy (the earliest one wins ties)
    then predicts ``X_test``; those predictions are written with ``joblib.dump``
    to ``/content/drive/MyDrive/knn_test_predictions_{vectorizer_name}.joblib``.

    Args:
        X_train: Training features passed to ``KNeighborsClassifier.fit``.
        y_train: Training labels passed to ``KNeighborsClassifier.fit``.
        X_val: Validation features used to score each combination.
        y_val: Validation labels compared against the validation predictions.
        X_test: Test features predicted with the best model only.
        vectorizer_name: Label used in progress/log messages and in the name
            of the saved predictions file.

    Returns:
        Tuple ``(best_knn, best_score, best_params)``: the fitted best
        classifier, its validation accuracy (a fraction, not a percentage),
        and the hyperparameter dict that produced it.

    Raises:
        ValueError: If ``param_grid`` yields no parameter combinations.
    """
    param_combinations = list(ParameterGrid(param_grid))  # Get all parameter combinations
    if not param_combinations:
        raise ValueError("param_grid produced no parameter combinations to evaluate")

    best_score = 0
    best_params = None
    best_knn = None

    # Progress bar using tqdm to track tuning process
    for params in tqdm(param_combinations, desc=f"Tuning KNN for {vectorizer_name}", unit="combination"):
        # Initialize KNN with current hyperparameters
        knn = KNeighborsClassifier(**params)

        # Fit the KNN model
        knn.fit(X_train, y_train)

        # Make predictions on validation data
        y_pred_val = knn.predict(X_val)

        # Evaluate the model
        accuracy = accuracy_score(y_val, y_pred_val)

        # Keep track of the best model based on validation accuracy.
        # The `best_knn is None` check guarantees a model is selected even when
        # every combination scores 0.0; without it best_knn would stay None and
        # the test-set prediction below would fail with AttributeError.
        if best_knn is None or accuracy > best_score:
            best_score = accuracy
            best_params = params
            best_knn = knn

    # Print the best results for the current vectorizer
    print(f"\nBest model for {vectorizer_name}:")
    print(f"Best Validation Accuracy: {best_score * 100:.2f}%")
    print(f"Best Hyperparameters: {best_params}")

    # Now use the best model to predict on the test data
    y_pred_test = best_knn.predict(X_test)

    # Save the best model's predictions for the test set
    joblib.dump(y_pred_test, f'/content/drive/MyDrive/knn_test_predictions_{vectorizer_name}.joblib')

    print(f"Test predictions saved for {vectorizer_name} KNN model.\n")

    return best_knn, best_score, best_params

In [6]:

# Grid-search KNN on the Count Vectorizer features; the call also writes the
# best model's test-set predictions to Drive.
best_knn_cv, best_score_cv, best_params_cv = tune_and_run_knn_model(
    X_train=X_train_cv,
    y_train=y_train_cv,
    X_val=X_val_cv,
    y_val=y_val_cv,
    X_test=X_test_cv,
    vectorizer_name="Count_Vectorizer",
)

# One-line summary of the winning Count Vectorizer configuration
print(f"Best KNN model for Count Vectorizer: {best_params_cv}, Validation Accuracy: {best_score_cv * 100:.2f}%")


Tuning KNN for Count_Vectorizer: 100%|██████████| 20/20 [22:15<00:00, 66.79s/combination] 



Best model for Count_Vectorizer:
Best Validation Accuracy: 89.96%
Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Test predictions saved for Count_Vectorizer KNN model.

Best KNN model for Count Vectorizer: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}, Validation Accuracy: 89.96%


In [7]:

# Grid-search KNN on the TFIDF features; the call also writes the best
# model's test-set predictions to Drive.
best_knn_tfidf, best_score_tfidf, best_params_tfidf = tune_and_run_knn_model(
    X_train=X_train_tfidf,
    y_train=y_train_tfidf,
    X_val=X_val_tfidf,
    y_val=y_val_tfidf,
    X_test=X_test_tfidf,
    vectorizer_name="TFIDF",
)

# One-line summary of the winning TFIDF configuration
print(f"Best KNN model for TFIDF: {best_params_tfidf}, Validation Accuracy: {best_score_tfidf * 100:.2f}%")

Tuning KNN for TFIDF: 100%|██████████| 20/20 [19:13<00:00, 57.70s/combination]



Best model for TFIDF:
Best Validation Accuracy: 86.51%
Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Test predictions saved for TFIDF KNN model.

Best KNN model for TFIDF: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}, Validation Accuracy: 86.51%


In [8]:

# Grid-search KNN on the Word2Vec features; the call also writes the best
# model's test-set predictions to Drive.
best_knn_w2v, best_score_w2v, best_params_w2v = tune_and_run_knn_model(
    X_train=X_train_w2v,
    y_train=y_train_w2v,
    X_val=X_val_w2v,
    y_val=y_val_w2v,
    X_test=X_test_w2v,
    vectorizer_name="Word2Vec",
)

# One-line summary of the winning Word2Vec configuration
print(f"Best KNN model for Word2Vec: {best_params_w2v}, Validation Accuracy: {best_score_w2v * 100:.2f}%")

Tuning KNN for Word2Vec: 100%|██████████| 20/20 [19:29<00:00, 58.49s/combination] 



Best model for Word2Vec:
Best Validation Accuracy: 82.32%
Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Test predictions saved for Word2Vec KNN model.

Best KNN model for Word2Vec: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}, Validation Accuracy: 82.32%


It looks like the Count Vectorizer representation works best for the KNN classifier: 89.96% validation accuracy, versus 86.51% for TF-IDF and 82.32% for Word2Vec.
Let's see how the next model compares next week!