In [1]:
import os
import yaml
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from definitions import CONFIG_PATH
from data_reader import load_text_data

import warnings
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category for the rest of the session. Any warnings raised while fitting the
# models below (for example about solver convergence) would be hidden too --
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Parse the YAML configuration found at CONFIG_PATH; the file handle is only
# needed while parsing, so the key lookups happen after it is closed.
# NOTE(review): yaml.safe_load is presumably sufficient if the config holds
# only plain mappings/scalars -- confirm before switching loaders.
with open(CONFIG_PATH, "r") as config_file:
    config = yaml.full_load(config_file)

# Dataset locations declared in the config.
cleaned_data_path = config["cleaned_data_path"]
stemmed_data_path = config["stemmed_data_path"]

In [3]:
# Load the train / validation / test splits stored under cleaned_data_path.
# Each load_text_data call yields an (X, y) pair; the splits are loaded in the
# order train -> validation -> test and bound to the notebook-level names that
# model_summary reads.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    load_text_data(os.path.join(cleaned_data_path, split_name))
    for split_name in ("train", "validation", "test")
)

In [4]:
def _accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``."""
    # Coerce to arrays so the comparison is element-wise even when both inputs
    # are plain Python lists (list == list would yield a single bool).
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))


def model_summary(model, parameter_name, parameter_values):
    """Sweep one hyper-parameter on the validation split and report test accuracy.

    Every candidate value is set on ``model``; the model is then fitted on
    ``X_train``/``y_train`` and scored on ``X_val``/``y_val``. The earliest
    candidate with the highest validation accuracy is set again, the model is
    refitted on the training split, scored on ``X_test``/``y_test``, and a
    one-line summary is printed.

    The six data splits are read from module-level (notebook) globals at call
    time, so the result depends on whichever splits are currently bound.

    Parameters
    ----------
    model : estimator-like object
        Must provide ``set_params(**kwargs)``, ``fit(X, y)`` and ``predict(X)``.
        It is mutated: on return it carries the best parameter value and has
        been fitted on the training split.
    parameter_name : str
        Keyword forwarded to ``model.set_params``.
    parameter_values : iterable
        Candidate values, tried in iteration order.

    Returns
    -------
    None
        The summary is printed, not returned.

    Raises
    ------
    ValueError
        If ``parameter_values`` is empty, or if no candidate produced a
        validation accuracy greater than ``-inf`` (e.g. every score was NaN).
    """
    candidates = list(parameter_values)
    if not candidates:
        raise ValueError("parameter_values must contain at least one candidate")

    best_param, best_accuracy = None, float("-inf")
    # A separate flag is needed because None can itself be a valid candidate.
    found_best = False
    for parameter in candidates:
        model.set_params(**{parameter_name: parameter})
        model.fit(X_train, y_train)
        validation_accuracy = _accuracy(y_val, model.predict(X_val))

        # Strict '>' keeps the earliest candidate when accuracies tie.
        if validation_accuracy > best_accuracy:
            best_param, best_accuracy = parameter, validation_accuracy
            found_best = True

    if not found_best:
        raise ValueError(
            "no candidate produced a comparable validation accuracy"
        )

    # The model currently holds the last candidate, so restore the best one and
    # refit before touching the test split. For estimators with unseeded
    # internal randomness this refit may differ from the fit that was validated.
    model.set_params(**{parameter_name: best_param})
    model.fit(X_train, y_train)
    test_accuracy = _accuracy(y_test, model.predict(X_test))
    print(f"Best parameter = {best_param}; best validation accuracy = {best_accuracy}; test accuracy = {test_accuracy}")

In [13]:
# Two-step pipeline: TF-IDF vectoriser feeding a multinomial naive Bayes
# classifier. The step names are what the "<step>__<param>" keys refer to.
naive_bayes_clf = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

# Sweep the vectoriser's n-gram range: (1, 1), (1, 2) and (1, 3).
model_summary(
    naive_bayes_clf,
    parameter_name="tfidf__ngram_range",
    parameter_values=[(1, 1), (1, 2), (1, 3)],
)

Best parameter = (1, 3); best validation accuracy = 0.8746374476313246; test accuracy = 0.8793909120206252


In [6]:
# Two-step pipeline: TF-IDF vectoriser feeding a logistic regression
# classifier with default settings.
logistic_regression_clf = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression()),
])

# Sweep the classifier's C parameter over five orders of magnitude.
model_summary(
    logistic_regression_clf,
    parameter_name="clf__C",
    parameter_values=[0.01, 0.1, 1, 10, 100],
)

Best parameter = 10; best validation accuracy = 0.88208; test accuracy = 0.87968


In [15]:
# Two-step pipeline: TF-IDF vectoriser feeding a random forest classifier.
# The forest is seeded so that repeated runs -- and the refit that
# model_summary performs with the best parameter -- build the same trees;
# without a seed the reported accuracies change from run to run.
random_forest_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42))
])

# Sweep the number of trees in the forest.
model_summary(random_forest_clf, "clf__n_estimators", [10, 50, 100])

Best parameter = 100; best validation accuracy = 0.8320979697067354; test accuracy = 0.8361263293586851


In [16]:
# Rebind the notebook-level splits to the stemmed corpus. model_summary reads
# X_train / X_val / X_test and their labels from globals at call time, so every
# call made after this cell is evaluated on the stemmed data.
# The directory is taken from the config's stemmed_data_path (loaded next to
# cleaned_data_path) rather than a path relative to the working directory.
X_train, y_train = load_text_data(os.path.join(stemmed_data_path, "train"))
X_val, y_val = load_text_data(os.path.join(stemmed_data_path, "validation"))
X_test, y_test = load_text_data(os.path.join(stemmed_data_path, "test"))

In [21]:
# Repeat the n-gram sweep for the naive Bayes pipeline on the currently
# loaded splits.
model_summary(
    naive_bayes_clf,
    parameter_name="tfidf__ngram_range",
    parameter_values=[(1, 1), (1, 2), (1, 3)],
)

Best parameter = (1, 3); best validation accuracy = 0.8752014179825975; test accuracy = 0.8801965839510152


In [18]:
# Repeat the C sweep for the logistic regression pipeline on the currently
# loaded splits.
model_summary(
    logistic_regression_clf,
    parameter_name="clf__C",
    parameter_values=[0.01, 0.1, 1, 10, 100],
)

Best parameter = 1; best validation accuracy = 0.8796326135997422; test accuracy = 0.8785852400902353


In [19]:
# Repeat the tree-count sweep for the random forest pipeline on the currently
# loaded splits.
model_summary(
    random_forest_clf,
    parameter_name="clf__n_estimators",
    parameter_values=[10, 50, 100],
)

Best parameter = 100; best validation accuracy = 0.8336287463744763; test accuracy = 0.8370931356751531
