In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import warnings

# Silence every warning category for the remainder of the session so the
# grid-search output below is not flooded with repeated messages.
warnings.simplefilter("ignore")

In [2]:
# Load the tagged, de-duplicated award descriptions. Only the two columns
# used for modelling are read: the description text and its label.
columns_needed = ["award_description", "desc_purpose"]
aw_desc = pd.read_csv("tagged_award_descriptions.csv", usecols=columns_needed)

In [3]:
# Features are the raw description text; the target is the `desc_purpose` column.
X, y = aw_desc["award_description"], aw_desc["desc_purpose"]

# Hold out a test set using scikit-learn's default split proportions; the
# fixed seed keeps the partition identical across runs.
split = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split

In [4]:
# Candidate classifiers, keyed by the pipeline step name that also prefixes
# their grid parameters below (e.g. "knn__n_neighbors", "svc__C").
model_funcs = [{"knn": KNeighborsClassifier(), "svc": SVC()}]

# Shared text-featurisation front end: token counts followed by TF-IDF
# weighting. A classifier step is added on top of these for each model.
pipe = Pipeline(
    [
        ("vect", CountVectorizer()),
        # NOTE(review): "tfdiff" looks like a typo for "tfidf"; the name is
        # kept as-is in case other cells look the step up by this key.
        ("tfdiff", TfidfTransformer()),
    ]
)

# Vectoriser settings searched for every classifier. GridSearchCV evaluates
# the full cross-product of these lists with the model-specific lists below.
base_tune_grid = [
    {
        "vect__stop_words": [None, "english"],
        # Previously [(1, 2), (1, 2), (1, 3)]: the repeated (1, 2) refit an
        # identical candidate and the unigram-only setting was never tried.
        "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
        "vect__analyzer": ["char", "word"],
        # Float values are proportions of documents; the int 1 in min_df is an
        # absolute document count.
        "vect__max_df": [1.0, 0.1],
        "vect__min_df": [1, 0.1],
        "vect__max_features": [None, 10, 50, 100],
    }
]

# k-nearest-neighbours hyper-parameters.
knn_steps = {"knn__n_neighbors": [1, 5, 10], "knn__weights": ["uniform", "distance"]}

# Support-vector classifier hyper-parameters.
# NOTE(review): per the SVC documentation `degree` is only used by the "poly"
# kernel and `gamma` is not used by "linear", so many candidates in this
# cross-product are equivalent fits — consider one grid per kernel.
svc_steps = {
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__gamma": [0.1, 1, 10, 100],
    "svc__C": [0.1, 1, 10, 100],
    "svc__degree": [0, 1, 2, 3, 4, 5],
}

# Filled in by the search loop with (best_estimator, best_cv_score) per model;
# an entry stays None if that model's search fails.
cv_results = {"knn": None, "svc": None}

In [5]:
# Run one grid search per candidate classifier and record the best refit
# pipeline together with its mean cross-validated weighted-F1 score.

# Model-specific hyper-parameters, looked up by pipeline step name. A model
# with no entry raises KeyError instead of silently reusing a stale grid.
model_steps = {"knn": knn_steps, "svc": svc_steps}

for key, val in model_funcs[0].items():
    # Combine the shared vectoriser grid with this model's own parameters.
    tune_grid = [{**base_tune_grid[0], **model_steps[key]}]

    # Build a fresh pipeline rather than appending to / popping from the
    # shared `pipe`: if a fit is interrupted, `pipe` is left untouched and the
    # cell can simply be re-run.
    model_pipe = Pipeline([*pipe.steps, (key, val)])
    clf = GridSearchCV(
        model_pipe, tune_grid, scoring="f1_weighted", refit="f1_weighted"
    )

    try:
        clf.fit(X_train, y_train)
        cv_results[key] = (clf.best_estimator_, clf.best_score_)
    except Exception as e:
        # Best effort: report which model failed and continue with the next
        # one; its cv_results entry is left unchanged.
        print(f"Error fitting {key}: {e}")

In [6]:
# Evaluate each tuned pipeline on the held-out test set, SVC first and then
# k-NN. Element 0 of each cv_results entry is the refit best estimator.
for model_name in ("svc", "knn"):
    best_pipeline = cv_results[model_name][0]
    print(classification_report(y_test, best_pipeline.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.73      0.83        26
           1       0.82      0.97      0.89        33

    accuracy                           0.86        59
   macro avg       0.89      0.85      0.86        59
weighted avg       0.88      0.86      0.86        59

              precision    recall  f1-score   support

           0       0.78      0.54      0.64        26
           1       0.71      0.88      0.78        33

    accuracy                           0.73        59
   macro avg       0.74      0.71      0.71        59
weighted avg       0.74      0.73      0.72        59

