# Import and Preprocessing

In [1]:
import pandas as pd

In [2]:
# Read the three dataset splits (train / validation / test) from CSV files
# located in the current working directory.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('df_train.csv', 'df_val.csv', 'df_test.csv')
)

In [3]:
# Discard every row that has a missing value in any column, for each split.
# The names are rebound to the cleaned frames, as before.
df_train, df_val, df_test = (
    split_frame.dropna() for split_frame in (df_train, df_val, df_test)
)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


# Pull the raw text column and the target column out of each split.
training_texts, training_labels = df_train['tweet'], df_train['label']
validation_texts, validation_labels = df_val['tweet'], df_val['label']
testing_texts, testing_labels = df_test['tweet'], df_test['label']

# The vectorizer is fitted on the training texts only; the validation and
# test texts are merely transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(training_texts)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(texts)
    for texts in (validation_texts, testing_texts)
)

# Baseline

In [5]:
# Baseline: k-nearest-neighbours with k=5 (tune k as needed), fitted on the
# TF-IDF training matrix. `fit` returns the estimator itself.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, training_labels)

# Predict on the held-out splits.
y_val_pred = knn.predict(X_val_tfidf)
y_test_pred = knn.predict(X_test_tfidf)

# Report per-class metrics: validation first, then test.
print(classification_report(validation_labels, y_val_pred))
print(classification_report(testing_labels, y_test_pred))

              precision    recall  f1-score   support

           0       0.88      0.66      0.75      1037
           1       0.63      0.86      0.73       693

    accuracy                           0.74      1730
   macro avg       0.75      0.76      0.74      1730
weighted avg       0.78      0.74      0.74      1730

              precision    recall  f1-score   support

           0       0.88      0.63      0.73      1411
           1       0.64      0.89      0.74      1051

    accuracy                           0.74      2462
   macro avg       0.76      0.76      0.74      2462
weighted avg       0.78      0.74      0.74      2462



# KNN Optimized by GWO

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

from niapy.problems import Problem
from niapy.task import OptimizationType, Task
from niapy.algorithms.basic import GreyWolfOptimizer

def get_hyperparameters(x):
    """Decode a solution vector `x` into KNeighborsClassifier keyword arguments.

    Parameters
    ----------
    x : sequence of float
        Indexable with at least four components. Each component is expected
        to lie in [0, 1] (the bounds the optimization problem is built with).

    Returns
    -------
    dict
        ``n_neighbors`` : int in 5..15, from ``x[0]``.
        ``weights``     : 'uniform' when ``x[1] < 0.5``, otherwise 'distance'.
        ``algorithm``   : one of 'ball_tree', 'kd_tree', 'brute', from ``x[2]``.
        ``leaf_size``   : int in 10..50, from ``x[3]``.

    Notes
    -----
    NOTE(review): the scikit-learn documentation states that fitting on sparse
    input overrides ``algorithm`` with brute force; if the features are sparse,
    ``algorithm`` and ``leaf_size`` may have no effect -- confirm.
    """
    algorithms = ('ball_tree', 'kd_tree', 'brute')
    n_neighbors = int(5 + x[0] * 10)
    weights = 'uniform' if x[1] < 0.5 else 'distance'

    # Split [0, 1] into equal-width buckets, one per algorithm. The previous
    # `int(x[2] * 2)` only reached the last option when x[2] was exactly 1.0.
    # Clamp both ends so x[2] == 1.0 (or a slightly out-of-range value) still
    # yields a valid, non-wrapping index.
    last_index = len(algorithms) - 1
    algorithm_index = max(0, min(int(x[2] * len(algorithms)), last_index))
    algorithm = algorithms[algorithm_index]

    leaf_size = int(10 + x[3] * 40)

    params = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'algorithm': algorithm,
        'leaf_size': leaf_size
    }
    return params


def get_classifier(x):
    """Build an unfitted KNeighborsClassifier configured from solution `x`.

    The vector `x` is decoded by `get_hyperparameters` and the resulting
    dictionary is passed to the classifier constructor as keyword arguments.
    """
    return KNeighborsClassifier(**get_hyperparameters(x))

In [13]:
class KNNHyperparameterOptimization(Problem):
    """Optimization problem whose fitness is a KNN cross-validation score.

    The search space has four dimensions, each bounded to [0, 1]; a point in
    that space is turned into a classifier by `get_classifier`.
    """

    def __init__(self, X_train_tfidf, y_train):
        """Store the training features and labels used for every evaluation."""
        super().__init__(dimension=4, lower=0, upper=1)
        self.X_train_tfidf = X_train_tfidf
        self.y_train = y_train

    def _evaluate(self, x):
        """Return the mean 2-fold cross-validation score for solution `x`."""
        classifier = get_classifier(x)
        fold_scores = cross_val_score(
            classifier,
            self.X_train_tfidf,
            self.y_train,
            cv=2,
            n_jobs=-1,
        )
        return fold_scores.mean()

In [16]:
# Build the problem from the training split. The labels live in
# `training_labels`; the previously used `y_train` is not defined anywhere in
# this notebook and only worked through leftover kernel state.
problem = KNNHyperparameterOptimization(X_train_tfidf, training_labels)

# We will be running maximization for 100 iters on `problem`
task = Task(problem, max_iters=100, optimization_type=OptimizationType.MAXIMIZATION)

# Fixed seed so the search is repeatable.
gwo = GreyWolfOptimizer(population_size=10, seed=42)
best_params, best_score = gwo.run(task)

print('Best parameters:', get_hyperparameters(best_params))

Best parameters: {'n_neighbors': 6, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 47}


In [17]:
# Compare an out-of-the-box KNN against the one built from the optimizer's
# best solution.
default_model = KNeighborsClassifier()
best_model = get_classifier(best_params)

# Use the label series defined alongside the TF-IDF matrices
# (`training_labels` / `testing_labels`); `y_train` and `y_test` are not
# defined anywhere in this notebook and fail on a fresh kernel.
default_model.fit(X_train_tfidf, training_labels)
best_model.fit(X_train_tfidf, training_labels)

# NOTE: `best_score` is rebound here and no longer holds the fitness value
# returned by the optimizer run.
default_score = default_model.score(X_test_tfidf, testing_labels)
best_score = best_model.score(X_test_tfidf, testing_labels)

print('Default model accuracy:', default_score)
print('Best model accuracy:', best_score)



Default model accuracy: 0.7384240454914703
Best model accuracy: 0.7680747359870025
