In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from transformers import AutoTokenizer
from tqdm import tqdm

In [2]:
# Cap on the number of TF-IDF n-gram features; passed to get_features() as vocab_size.
VOCAB_SIZE = 15_000

In [3]:
def get_data_and_labels(tsv_path):
    """Load texts and normalised labels from a tab-separated file.

    Every space is removed from the column names before lookup, so a header
    such as ``" label "`` still resolves to ``label``.

    Args:
        tsv_path: Location of a TSV file (anything ``pandas.read_csv``
            accepts) containing a ``text`` and a ``label`` column.

    Returns:
        tuple: ``(texts, labels)`` where ``texts`` is a list with the values
        of the ``text`` column and ``labels`` is a NumPy array of the
        ``label`` values with surrounding whitespace stripped and upper-cased.
    """
    frame = pd.read_csv(tsv_path, sep='\t')
    frame.columns = [column.replace(' ', '') for column in frame.columns]
    texts = frame['text'].to_list()
    normalised_labels = [raw.strip().upper() for raw in frame['label'].to_list()]
    return texts, np.array(normalised_labels)

In [4]:
def identity_tokenizer(text):
    """Return ``text`` exactly as received.

    Serves as a pass-through ``tokenizer`` callable for documents that are
    already sequences of tokens and must not be split again.
    """
    tokens = text
    return tokens

def get_features(corpus, vocab_size):
    """Fit a TF-IDF vectorizer on pre-tokenized documents and vectorize them.

    Args:
        corpus: Iterable of documents, each one already split into tokens
            (e.g. a list of token strings per document).
        vocab_size: Maximum number of unigram/bigram features to keep;
            forwarded to the vectorizer as ``max_features``.

    Returns:
        tuple: ``(X, vectorizer)`` where ``X`` is the TF-IDF matrix of
        ``corpus`` and ``vectorizer`` is the fitted ``TfidfVectorizer`` that
        can be reused to transform further documents.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=vocab_size,
        tokenizer=identity_tokenizer,  # documents are already tokenized (AutoTokenizer output)
        lowercase=False,  # documents are token lists, not strings, so they cannot be lower-cased
        token_pattern=None,  # not used when a custom tokenizer is supplied
    )
    # fit_transform learns the vocabulary/IDF weights and builds the matrix in
    # a single pass instead of analysing the whole corpus twice.
    X = vectorizer.fit_transform(corpus)
    return X, vectorizer

In [5]:
def _to_token_strings(texts, tokenizer):
    """Encode ``texts`` with ``tokenizer`` and map every id sequence back to token strings."""
    encoded_ids = tokenizer(texts)['input_ids']
    return [tokenizer.convert_ids_to_tokens(ids) for ids in encoded_ids]


def evaluate(train_data, train_labels, test_data, tokenizer, random_state=42):
    """Train a TF-IDF + LinearSVC classifier and predict labels for the test texts.

    Both text collections are converted to token strings with ``tokenizer``.
    The TF-IDF vocabulary (capped at ``VOCAB_SIZE`` features) is learned on the
    training tokens only and then applied to the test tokens.

    NOTE(review): the token sequences come from ``tokenizer(...)['input_ids']``,
    so any special tokens the tokenizer inserts by default become TF-IDF
    features as well — confirm this is intended.

    Args:
        train_data: List of training texts.
        train_labels: Labels aligned with ``train_data``.
        test_data: List of texts to predict labels for.
        tokenizer: Tokenizer object that is callable on a list of texts
            (returning a mapping with ``'input_ids'``) and provides
            ``convert_ids_to_tokens``.
        random_state: Seed forwarded to ``LinearSVC`` so that repeated runs
            give the same model. Defaults to 42.

    Returns:
        Predicted labels for ``test_data`` as returned by
        ``LinearSVC.predict``.
    """
    train_tokens = _to_token_strings(train_data, tokenizer)
    test_tokens = _to_token_strings(test_data, tokenizer)

    # Vocabulary and IDF weights come from the training split only.
    train_features, vectorizer = get_features(train_tokens, VOCAB_SIZE)
    test_features = vectorizer.transform(test_tokens)

    classifier = LinearSVC(C=0.2, class_weight='balanced', random_state=random_state)
    classifier.fit(train_features, train_labels)
    return classifier.predict(test_features)

In [6]:
def evaluate_model_and_dataset(data_set_path, tokenizer_name='Davlan/afro-xlmr-mini'):
    """Run the Linear SVC baseline on every language folder of a dataset.

    Each immediate sub-directory of ``data_set_path`` is treated as one
    language and must contain ``train.tsv`` and ``test.tsv``. For every
    language a classifier is trained on the train split and scored on the
    test split.

    Args:
        data_set_path: Directory whose sub-directories hold the per-language
            splits.
        tokenizer_name: Name or path handed to
            ``AutoTokenizer.from_pretrained``. Defaults to
            ``'Davlan/afro-xlmr-mini'``.

    Returns:
        list[dict]: One record per language, in sorted folder order, with
        the keys ``model``, ``language``, ``num_examples`` (number of training
        examples), ``precision`` and ``recall`` (per-label dicts),
        ``f1_score_macro`` and ``accuracy``.
    """
    language_folders = sorted(
        entry for entry in Path(data_set_path).iterdir() if entry.is_dir()
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Loop-invariant values are defined once rather than per language.
    model = "Linear SVC"
    label_set = ['NEGATIVE', 'POSITIVE', 'NEUTRAL']

    results = []
    for language_folder in tqdm(language_folders):
        # .name keeps the full directory name; .stem would drop everything
        # after the last dot of a dotted folder name.
        language = language_folder.name
        train_data, train_labels = get_data_and_labels(language_folder / 'train.tsv')
        test_data, test_labels = get_data_and_labels(language_folder / 'test.tsv')

        predicted_labels = evaluate(train_data, train_labels, test_data, tokenizer)

        accuracy = accuracy_score(test_labels, predicted_labels)
        # Per-label scores are reported in the fixed order of label_set.
        precision = precision_score(test_labels, predicted_labels, labels=label_set, average=None)
        recall = recall_score(test_labels, predicted_labels, labels=label_set, average=None)
        # Macro F1 is not restricted to label_set: no `labels` argument is passed.
        f1 = f1_score(test_labels, predicted_labels, average='macro')

        results.append({
            'model': model,
            'language': language,
            'num_examples': len(train_data),
            'precision': dict(zip(label_set, precision)),
            'recall': dict(zip(label_set, recall)),
            'f1_score_macro': f1,
            'accuracy': accuracy,
        })
    return results

In [7]:
data_path = Path('../data/raw/train/splitted-train-dev-test')
report_path = Path('../reports/linear_svm_evaluation_test_split_balanced.xlsx')

# Make sure the output directory exists before the evaluation starts, so a
# missing folder cannot make the export fail after all the work is done.
report_path.parent.mkdir(parents=True, exist_ok=True)

results = evaluate_model_and_dataset(data_path)
df = pd.DataFrame(results)
df.to_excel(report_path)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.24it/s]
