## Task 2: PCA Dimension Reduction

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the TF-IDF feature tables for the labelled and unlabelled sets.
train_df = pd.read_csv("./dataset/train_tfidf_features.csv")
test_df = pd.read_csv("./dataset/test_tfidf_features.csv")

# Separate the target from the feature columns; "id" is dropped from both
# feature matrices so only the remaining columns are fed to the model.
y_train = train_df["label"]
X_train = train_df.drop(columns=["id", "label"])
X_test = test_df.drop(columns=["id"])

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition identical across runs.
(
    X_train_split,
    X_val,
    y_train_split,
    y_val,
) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Compare a PCA -> KNN pipeline across several PCA dimensionalities: for each
# setting, report the weighted F1 on the validation split and export the
# test-set predictions to a CSV file.
components_list = [2000, 1000, 500, 100]
results = []

# Seed PCA explicitly: with svd_solver="auto" scikit-learn may choose the
# randomized solver for large inputs, which would otherwise make the scores
# and the exported predictions vary from run to run.
RANDOM_STATE = 42
N_NEIGHBORS = 2

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the (expensive) loop starts.
predictions_dir = Path("./predictions")
predictions_dir.mkdir(parents=True, exist_ok=True)

for n_components in components_list:
    # Fit the projection on the training split only, then apply the same
    # projection to the validation and test features.
    pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
    X_train_pca = pca.fit_transform(X_train_split)
    X_val_pca = pca.transform(X_val)
    X_test_pca = pca.transform(X_test)

    neigh = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
    neigh.fit(X_train_pca, y_train_split)

    y_val_pred = neigh.predict(X_val_pca)

    f1 = f1_score(y_val, y_val_pred, average='weighted')
    results.append((n_components, f1))
    print(f"Components: {n_components}, F1 Score: {f1:.4f}")

    # NOTE: the test predictions come from the model fitted on the training
    # split (X_train_split), not on the full labelled set.
    y_test_pred = neigh.predict(X_test_pca)

    output = pd.DataFrame({"id": test_df["id"], "label": y_test_pred})
    output.to_csv(
        predictions_dir / f"KNN_Predictions_{n_components}_components.csv",
        index=False,
    )

print("Results Summary:")
for n_components, f1 in results:
    print(f"Number of components: {n_components}, F1 Score: {f1}")

Components: 2000, F1 Score: 0.4858
Components: 1000, F1 Score: 0.5881
Components: 500, F1 Score: 0.5954
Components: 100, F1 Score: 0.6031
Results Summary:
Number of components: 2000, F1 Score: 0.48576472368733437
Number of components: 1000, F1 Score: 0.588069875239095
Number of components: 500, F1 Score: 0.5954067618257358
Number of components: 100, F1 Score: 0.6031241190672758
