In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Upper bound on the number of n-gram features kept by the CountVectorizer
# (handed to get_features, which forwards it as `max_features`).
VOCAB_SIZE = 15000

In [3]:
def get_data_and_labels(tsv_path):
    """Load tweets and normalized labels from a tab-separated file.

    Args:
        tsv_path: Location of a TSV file (anything ``pandas.read_csv``
            accepts) containing a ``tweet`` column and a ``label`` column.

    Returns:
        A ``(tweets, labels)`` pair. ``tweets`` is a plain list with the
        values of the ``tweet`` column; ``labels`` is a NumPy array in which
        every label has been stripped of surrounding whitespace and
        upper-cased.
    """
    frame = pd.read_csv(tsv_path, sep='\t')
    tweets = frame['tweet'].to_list()

    # Normalize spelling so that e.g. " positive" and "POSITIVE" compare equal.
    normalized_labels = []
    for raw_label in frame['label'].to_list():
        normalized_labels.append(raw_label.strip().upper())

    return tweets, np.array(normalized_labels)

In [4]:
def identity_tokenizer(text):
    """Return ``text`` exactly as received.

    Supplied to ``CountVectorizer`` as its ``tokenizer`` so that documents
    which are already sequences of tokens are not split a second time.
    """
    return text

def get_features(corpus, vocab_size):
    """Build unigram + bigram count features for a pre-tokenized corpus.

    Args:
        corpus: Iterable of documents, each one already split into a sequence
            of token strings. No further tokenization or lowercasing is
            applied here.
        vocab_size: Upper bound on the number of n-gram features to keep
            (forwarded to ``CountVectorizer(max_features=...)``).

    Returns:
        ``(X, vectorizer)`` where ``X`` is the document-term count matrix for
        ``corpus`` and ``vectorizer`` is the fitted ``CountVectorizer``, which
        can be reused to ``transform`` other documents into the same feature
        space.
    """
    vectorizer = CountVectorizer(
        ngram_range=(1, 2),
        max_features=vocab_size,
        tokenizer=identity_tokenizer,  # already receiving tokenized text from AutoTokenizer
        lowercase=False,
        token_pattern=None,  # not used when a custom tokenizer is supplied
    )
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single pass instead of scanning the corpus twice (fit, then transform).
    X = vectorizer.fit_transform(corpus)
    return X, vectorizer

In [5]:
def evaluate_kflod(data, true_labels, tokenizer):
    """Cross-validate a Multinomial Naive Bayes classifier with 10 stratified folds.

    Each text is tokenized with ``tokenizer``; for every fold the n-gram count
    features are fitted on the training portion only and then applied to the
    held-out portion, so the held-out texts never influence which features
    are selected.

    Args:
        data: List of raw text documents.
        true_labels: Label for each document, aligned with ``data``
            (any array-like; converted to a NumPy array internally).
        tokenizer: Callable tokenizer whose result exposes ``['input_ids']``
            and which provides ``convert_ids_to_tokens``.

    Returns:
        ``(true_test_labels, predicted_test_labels)``: two lists of equal
        length containing, fold after fold, the gold labels of the held-out
        documents and the corresponding predictions.
    """
    # Index-array selection below requires an ndarray, not a plain list.
    true_labels = np.asarray(true_labels)
    skf = StratifiedKFold(n_splits=10)

    # Tokenization does not look at labels or fold membership, so it is done
    # once for the whole corpus.
    tokenized_texts_str = [
        tokenizer.convert_ids_to_tokens(ids) for ids in tokenizer(data)['input_ids']
    ]

    true_test_labels = []
    predicted_test_labels = []

    # Stratified splitting depends only on the labels; X merely has to have
    # the right number of samples, so a zero placeholder is sufficient.
    placeholder = np.zeros(len(true_labels))
    for train_index, test_index in skf.split(placeholder, true_labels):
        train_texts = [tokenized_texts_str[i] for i in train_index]
        test_texts = [tokenized_texts_str[i] for i in test_index]

        # Fit the vocabulary on the training fold only to avoid leaking
        # information from the held-out fold into feature selection.
        train_features, vectorizer = get_features(train_texts, VOCAB_SIZE)
        test_features = vectorizer.transform(test_texts)

        classifier = MultinomialNB()
        classifier.fit(train_features, true_labels[train_index])

        true_test_labels.extend(true_labels[test_index])
        predicted_test_labels.extend(classifier.predict(test_features))

    return true_test_labels, predicted_test_labels

In [6]:
def evaluate_model_and_dataset(data_set_path, tokenizer_name='Davlan/afro-xlmr-mini'):
    """Evaluate Multinomial Naive Bayes on every ``*.tsv`` file in a directory.

    Files are processed in sorted order. The language of each file is taken
    from the part of its file name before the first underscore.

    Args:
        data_set_path: Directory that contains the ``*.tsv`` data files.
        tokenizer_name: Checkpoint name or path given to
            ``AutoTokenizer.from_pretrained``. Defaults to the checkpoint
            that was previously hard-coded.

    Returns:
        A list with one dict per data file, holding the keys ``model``,
        ``language``, ``num_examples``, ``precision`` and ``recall``
        (per-label dicts), ``f1_score_macro`` and ``accuracy``. The list is
        empty when the directory has no ``*.tsv`` files.
    """
    data_files = sorted(Path(data_set_path).glob('*.tsv'))
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # These do not change between files, so define them once.
    model = "Multinomial Naive Bayes"
    label_set = ['NEGATIVE', 'POSITIVE', 'NEUTRAL']

    results = []
    for tsv_file in tqdm(data_files, total=len(data_files)):
        language = tsv_file.stem.split('_')[0]
        data, labels = get_data_and_labels(tsv_file)

        # Gold and predicted labels of all held-out folds, concatenated.
        true_labels, predicted_labels = evaluate_kflod(data, labels, tokenizer)

        # average=None yields one score per entry of label_set, in that order.
        precision = precision_score(true_labels, predicted_labels, labels=label_set, average=None)
        recall = recall_score(true_labels, predicted_labels, labels=label_set, average=None)

        results.append({
            'model': model,
            'language': language,
            'num_examples': len(data),
            'precision': dict(zip(label_set, precision)),
            'recall': dict(zip(label_set, recall)),
            'f1_score_macro': f1_score(true_labels, predicted_labels, average='macro'),
            'accuracy': accuracy_score(true_labels, predicted_labels),
        })
    return results

In [7]:
# Evaluate every training file and store the per-language metrics as a report.
data_path = Path('../data/raw/train/')
results = evaluate_model_and_dataset(data_path)
df = pd.DataFrame(results)

# to_excel does not create missing parent folders, so make sure the report
# directory exists before writing.
report_path = Path('../reports/multinomial_naive_bayes_evaluation.xlsx')
report_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(report_path)

100%|██████████| 9/9 [00:10<00:00,  1.20s/it]
