In [81]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [35]:
# import raw award descriptions
aw_desc = pd.read_csv(
    '../data/Prime_Award_Summary_Loans_GTE1M.csv', 
    usecols=['assistance_award_unique_key', 'award_description'])

In [38]:
# export unique award descriptions for manual tagging
aw_desc['award_description'].drop_duplicates().to_csv('../data/untagged_unique_desc.csv', index=False)

In [55]:
# import tagged unique award descriptions
aw_desc = pd.read_csv(
    "../data/tagged_award_descriptions.csv",
    usecols=['award_description', 'desc_purpose'])
print(aw_desc.shape)
aw_desc.sample(5)

(235, 2)


Unnamed: 0,award_description,desc_purpose
168,C-470 EXPRESS LANE PROJECT IN DENVER COLORADO,1
1,PILOT BROADBAND LOAN GRANT COMBO - LOAN,1
110,THE I-15 EXPRESS LANES PROJECT IN RIVERSIDE CO...,1
24,"CONSTRUCTION OF A MULTI-PURPOSE, ENERGY-EFFICI...",1
189,TRAIN CONTROL IMPLEMENTATION AND RESILIENCY PR...,1


In [56]:
X = aw_desc['award_description']
y = aw_desc['desc_purpose']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [225]:
model_funcs = [{'knn': KNeighborsClassifier(), 'svc': SVC()}]

pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfdiff', TfidfTransformer()),
])

base_tune_grid = [
    {
        'vect__stop_words': [None, 'english'], 
        'vect__ngram_range': [(1,2), (1,2), (1,3)],
        'vect__max_df': [1.0, 0.1], 
        'vect__min_df': [1, 0.1],
        'vect__max_features': [None, 10, 50, 100],
    }
]

knn_steps = {
    'knn__n_neighbors': [1,5,10],
    'knn__weights': ['uniform', 'distance']
}

svc_steps = {
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__gamma': [0.1, 1, 10, 100],
    'svc__C': [0.1, 1, 10, 100],
    'svc__degree': [0, 1, 2, 3, 4, 5]
}

cv_results = {'knn': None, 'svc': None}

In [None]:
for key, val in model_funcs[0].items():
    if key == 'knn':
        tune_grid = [{**base_tune_grid[0], **knn_steps}]
    elif key == 'svc':
        tune_grid = [{**base_tune_grid[0], **svc_steps}]
    pipe.steps.append((key, val))
    clf = GridSearchCV(pipe, tune_grid, refit='f1')
    clf.fit(X_train, y_train)
    pipe.steps.pop()
    cv_results[key] = (clf.best_estimator_, clf.best_score_)

In [236]:
sum(y_test)/len(y_test)

0.5423728813559322

In [234]:
print(classification_report(y_test, cv_results['svc'][0].predict(X_test)))
print(classification_report(y_test, cv_results['knn'][0].predict(X_test)))

              precision    recall  f1-score   support

           0       0.58      0.70      0.63        27
           1       0.69      0.56      0.62        32

    accuracy                           0.63        59
   macro avg       0.63      0.63      0.63        59
weighted avg       0.64      0.63      0.63        59

              precision    recall  f1-score   support

           0       0.64      0.59      0.62        27
           1       0.68      0.72      0.70        32

    accuracy                           0.66        59
   macro avg       0.66      0.66      0.66        59
weighted avg       0.66      0.66      0.66        59

