In [35]:
from tuwnlp.utils import Language, TokenType, LabelLevel
from tuwnlp.utils import read_file_as_string, read_files_to_df
from tuwnlp.utils import get_low_lvl_label_mappings, get_top_lvl_label_mappings
from tuwnlp.utils import get_file_labels_dataframe

from pathlib import Path
from enum import Enum
import pandas as pd
from pandas import DataFrame
import numpy as np

In [36]:
# Label table for the English documents at narrative granularity.
# Presumably indexed by file name, since it is joined on its index below -- confirm.
en_naratives_labels = get_file_labels_dataframe(Path("../data"), Language.EN, LabelLevel.NARATIVES)

# English texts: drop the bookkeeping columns and key the rows by file name
# so they can be matched against the label table.
en_texts_raw = read_files_to_df(Path("../data/tmp"), Language.EN)
en_texts = en_texts_raw.drop(columns=["file name", "file path"])
en_texts.index = en_texts_raw["file name"].values

# Index-on-index merge (pandas' default inner join) of texts and labels.
df = en_texts.merge(en_naratives_labels, left_index=True, right_index=True)


200it [00:00, 6627.75it/s]


### Bag of words model

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


# Split the merged frame into the raw text (features) and every other
# column (the label targets).
text_col = "text"
y_cols = [col for col in df.columns if col != text_col]
X = df[text_col]
y = df[y_cols]

# Bag of words: token counts, with the vocabulary capped at the
# `embedding_size` most frequent terms of the corpus.
embedding_size = 1024
vectorizer = CountVectorizer(max_features=embedding_size)
X = vectorizer.fit_transform(X)

# MultiOutputClassifier fits one independent naive Bayes model per label column.
model = MultiOutputClassifier(MultinomialNB())
model.fit(X, y)

# NOTE: predictions are made on the very documents the model was fitted on,
# so the scores below are in-sample and optimistic; they are a sanity check,
# not an estimate of generalisation.
y_pred = model.predict(X)

# zero_division=0 makes the handling of labels without any true/predicted
# positives explicit. It yields the same scores as the default ("warn")
# but without emitting an UndefinedMetricWarning per call.
macro, micro, weighted = (
    f1_score(y, y_pred, average=average, zero_division=0)
    for average in ("macro", "micro", "weighted")
)
macro, micro, weighted

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.5656539902103368, 0.6783754116355654, 0.7372732813970767)

### Define, train and evaluate models for every language and label level

In [52]:
from itertools import product
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
from tqdm import tqdm
LANGUAGES = [Language.BG, Language.EN, Language.HI, Language.PT]
LEVELS = [LabelLevel.NARATIVES, LabelLevel.SUBNARATIVES]
TEXT_COL = "text"
RANDOM_STATE = 42

# Unfitted templates: a fresh clone is fitted for every data set below, so no
# fitted state is carried over from one (language, level) pair to the next.
models = [
    MultiOutputClassifier(MultinomialNB()),
    # Seeded so that the forest's scores are reproducible from run to run.
    MultiOutputClassifier(RandomForestClassifier(random_state=RANDOM_STATE)),
]

# Materialised as a list so that tqdm knows the total and shows real progress.
combinations = list(product(LANGUAGES, LEVELS))

# One dict of scores per (language, level, model); turned into a table afterwards.
all_res = []

for language, level in tqdm(combinations, desc="language/level"):
    # Everything up to the vectoriser depends only on (language, level), so it
    # is done once here and shared by all models instead of once per model.
    labels = get_file_labels_dataframe(Path("../data"), language, level)

    # Read the texts and key them by file name so they line up with the labels.
    text = read_files_to_df(Path("../data/tmp"), language)
    text.index = text["file name"].values
    text = text.drop(columns=["file name", "file path"])
    df = pd.merge(text, labels, left_index=True, right_index=True)

    # Features are the raw text; targets are all remaining (label) columns.
    X = df[TEXT_COL]
    y = df.drop(columns=[TEXT_COL])
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

    # The vocabulary is learned from the training split only and then applied
    # to the test split, so no test-set information leaks into the features.
    # `embedding_size` is defined in the bag-of-words cell above.
    vectorizer = CountVectorizer(max_features=embedding_size)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    for template in models:
        model = clone(template)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # An unfitted clone is stored: it only serves to identify the model
        # (through its repr) in the results table.
        res = {
            "language": language.value,
            "level": level.value,
            "model": clone(template),
        }
        # zero_division=0 yields the same scores as the default ("warn") but
        # without an UndefinedMetricWarning for labels that have no
        # true/predicted positives in the test split.
        for average in ("macro", "micro", "weighted"):
            res[average] = f1_score(y_test, y_pred, average=average, zero_division=0)

        all_res.append(res)

211it [00:00, 8607.76it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
211it [00:00, 13816.01it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
211it [00:00, 13157.48it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
211it [00:00, 8361.19it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
200it [00:00, 9300.01it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
200it [00:00, 9397.32it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"

In [53]:
# Gather the per-run score dictionaries into a single table, one row per run.
df = pd.DataFrame(all_res)
# Display only the rows scored at the narrative level.
df[df["level"] == "NARATIVES"]

Unnamed: 0,language,level,model,macro,micro,weighted
0,BG,NARATIVES,MultiOutputClassifier(estimator=MultinomialNB()),0.336714,0.460938,0.508557
1,BG,NARATIVES,MultiOutputClassifier(estimator=RandomForestCl...,0.038433,0.142857,0.121264
4,EN,NARATIVES,MultiOutputClassifier(estimator=MultinomialNB()),0.263058,0.376147,0.433652
5,EN,NARATIVES,MultiOutputClassifier(estimator=RandomForestCl...,0.025145,0.252427,0.161056
8,HI,NARATIVES,MultiOutputClassifier(estimator=MultinomialNB()),0.217841,0.42735,0.428643
9,HI,NARATIVES,MultiOutputClassifier(estimator=RandomForestCl...,0.099206,0.246154,0.220666
12,PT,NARATIVES,MultiOutputClassifier(estimator=MultinomialNB()),0.301818,0.523659,0.545026
13,PT,NARATIVES,MultiOutputClassifier(estimator=RandomForestCl...,0.131749,0.417266,0.323978


In [54]:
# Display only the rows scored at the sub-narrative level.
df[df["level"] == "SUBNARATIVES"]

Unnamed: 0,language,level,model,macro,micro,weighted
2,BG,SUBNARATIVES,MultiOutputClassifier(estimator=MultinomialNB()),0.117377,0.240964,0.265447
3,BG,SUBNARATIVES,MultiOutputClassifier(estimator=RandomForestCl...,0.007523,0.048387,0.039486
6,EN,SUBNARATIVES,MultiOutputClassifier(estimator=MultinomialNB()),0.085146,0.258065,0.300117
7,EN,SUBNARATIVES,MultiOutputClassifier(estimator=RandomForestCl...,0.007246,0.253968,0.15534
10,HI,SUBNARATIVES,MultiOutputClassifier(estimator=MultinomialNB()),0.076058,0.369231,0.332924
11,HI,SUBNARATIVES,MultiOutputClassifier(estimator=RandomForestCl...,0.032862,0.184211,0.166283
14,PT,SUBNARATIVES,MultiOutputClassifier(estimator=MultinomialNB()),0.130295,0.367347,0.387248
15,PT,SUBNARATIVES,MultiOutputClassifier(estimator=RandomForestCl...,0.010417,0.055944,0.048309
