In [None]:
#%%

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from functions import load_datasets, VectorizerWrapper, analyze_model, meta_grid_search, write_to_file, print_scores
from sklearn.model_selection import GridSearchCV

import copy

In [None]:
# %%

# Manual grid search over the vectorizer settings `max_features` and `max_df`
# for a Multinomial Naive Bayes classifier. Every combination is fitted on the
# training split, scored on train/validation/test, and the combination with the
# highest validation F1 is remembered in `best_features`.
vectorizer_name = "count"
vectorizer = VectorizerWrapper(vectorizer_name=vectorizer_name)
model = MultinomialNB(alpha=0.9)

train_data, val_data, test_data = load_datasets(vectorizer_name=vectorizer_name)
train_labels = train_data['label']
val_labels = val_data['label']
test_labels = test_data['label']

# Candidate values; the full cross product of the two lists is evaluated.
max_features_ls, max_df_ls = [1000, 5000, 7000, 12000, 20000, 30000], [0.3, 0.5, 0.6, 0.8, 0.9, 0.95]
print(vectorizer_name + ", max_features:" + str(max_features_ls) + ", max_df:" + str(max_df_ls))

# Start below any reachable score so that the first evaluated configuration is
# always recorded, even when every validation F1 is exactly 0.
highest_score = float("-inf")
best_features = {}
results_ls = []  # one [max_features, max_df, results] entry per configuration
for max_features in max_features_ls:
    for max_df in max_df_ls:
        # Work on fresh copies so no fitted state carries over between
        # configurations; the `vectorizer` / `model` templates stay unfitted.
        vect_tmp = copy.deepcopy(vectorizer)
        model_tmp = copy.deepcopy(model)

        print("fitting a model for: max_features: {} max_df: {}".format(max_features, max_df))

        # The vectorizer is fitted on the training tweets only; validation and
        # test tweets are merely transformed with it.
        X_train = vect_tmp.fit_transform(train_data['tweet'], max_features=max_features, max_df=max_df)
        X_val = vect_tmp.transform(val_data['tweet'])
        X_test = vect_tmp.transform(test_data['tweet'])
        model_tmp.fit(X_train, train_labels)

        train_pred = model_tmp.predict(X_train)
        print_scores(s="train", y_true=train_labels, y_pred=train_pred)

        # NOTE(review): the recorded output of this cell labels the test recall
        # as "Validation Recall" -- presumably a label typo inside
        # analyze_model; confirm in functions.py.
        results = analyze_model(model=model_tmp, X_val=X_val, val_labels=val_labels, X_test=X_test, test_labels=test_labels, plot_conf_mats=False)

        # Model selection uses the validation F1 only. Index directly so that a
        # missing key fails with a clear KeyError instead of a None comparison.
        cur_score = results["val_f1"]
        if cur_score > highest_score:
            highest_score = cur_score
            best_features = {'max_features': max_features, 'max_df': max_df}
        results_ls.append([max_features, max_df, results])

count, max_features:[1000, 5000, 7000, 12000, 20000, 30000], max_df:[0.3, 0.5, 0.6, 0.8, 0.9, 0.95]
fitting a model for: max_features: 1000 max_df: 0.3
Printing results for the model for train
train F1: 0.62
train Accuracy: 0.62
train Recall: 0.62
Analyzing the model:
0 = negative, 1= neutral, 2=positive
Validation F1: 0.60
Validation Accuracy: 0.60
Validation Recall: 0.60
Test F1: 0.53
Test Accuracy: 0.54
Validation Recall: 0.54
fitting a model for: max_features: 1000 max_df: 0.5
Printing results for the model for train
train F1: 0.62
train Accuracy: 0.62
train Recall: 0.62
Analyzing the model:
0 = negative, 1= neutral, 2=positive
Validation F1: 0.60
Validation Accuracy: 0.60
Validation Recall: 0.60
Test F1: 0.53
Test Accuracy: 0.54
Validation Recall: 0.54
fitting a model for: max_features: 1000 max_df: 0.6
Printing results for the model for train
train F1: 0.62
train Accuracy: 0.62
train Recall: 0.62
Analyzing the model:
0 = negative, 1= neutral, 2=positive
Validation F1: 0.60
Valida

In [None]:
#%%

# Report the outcome of the grid search: the best validation F1, the settings
# that produced it, and the raw per-configuration results, one item per line.
print(highest_score, best_features, results_ls, sep="\n")

0.630498884173135
{'max_features': 7000, 'max_df': 0.3}
[[1000, 0.3, {'test_accuracy': 0.536470205144904, 'test_f1': 0.5296099372107789, 'test_recall': 0.536470205144904, 'val_accuracy': 0.601, 'val_f1': 0.5996145482140275, 'val_recall': 0.601}], [1000, 0.5, {'test_accuracy': 0.536470205144904, 'test_f1': 0.5296099372107789, 'test_recall': 0.536470205144904, 'val_accuracy': 0.601, 'val_f1': 0.5996145482140275, 'val_recall': 0.601}], [1000, 0.6, {'test_accuracy': 0.536470205144904, 'test_f1': 0.5296099372107789, 'test_recall': 0.536470205144904, 'val_accuracy': 0.601, 'val_f1': 0.5996145482140275, 'val_recall': 0.601}], [1000, 0.8, {'test_accuracy': 0.536470205144904, 'test_f1': 0.5296099372107789, 'test_recall': 0.536470205144904, 'val_accuracy': 0.601, 'val_f1': 0.5996145482140275, 'val_recall': 0.601}], [1000, 0.9, {'test_accuracy': 0.536470205144904, 'test_f1': 0.5296099372107789, 'test_recall': 0.536470205144904, 'val_accuracy': 0.601, 'val_f1': 0.5996145482140275, 'val_recall': 0.