In [1]:
from tuwnlp.utils import Language, TokenType, LabelLevel
from tuwnlp.utils import read_file_as_string, read_files_to_df
from tuwnlp.utils import get_low_lvl_label_mappings, get_top_lvl_label_mappings
from tuwnlp.utils import get_file_labels_dataframe

from pathlib import Path
from enum import Enum
import pandas as pd
from pandas import DataFrame
import numpy as np

In [2]:
en_naratives_labels = get_file_labels_dataframe(
    Path("../data"),
    Language.EN,
    LabelLevel.NARATIVES
)

en_texts = read_files_to_df(Path("../data/tmp"), Language.EN)
en_texts.index = en_texts["file name"].values
en_texts = en_texts.drop(columns = ["file name", "file path"])
df = pd.merge(en_texts, en_naratives_labels, left_index=True, right_index=True)


200it [00:00, 6064.73it/s]


### Bag of words model

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score

#Split into features and target
text_col = "text"
y_cols = [col for col in df.columns if col != text_col]
X = df[text_col]
y = df[y_cols]

#build bag of words
embedding_size = 1024
vectorizer = CountVectorizer(max_features=embedding_size)
X = vectorizer.fit_transform(X)

#build model
model = MultiOutputClassifier(MultinomialNB())
model.fit(X,y)

#evaluate model
y_pred = model.predict(X)
macro = f1_score(y, y_pred, average="macro")
micro = f1_score(y, y_pred, average="micro")
weighted = f1_score(y, y_pred, average="weighted")
macro, micro, weighted

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(np.float64(0.5730234010388228),
 np.float64(0.6858407079646017),
 np.float64(0.7430433988640016))

### Define models

In [4]:
from itertools import product
from sklearn.model_selection import train_test_split 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from tqdm import tqdm
from copy import deepcopy

LANGUAGES = [Language.BG, Language.EN, Language.HI, Language.PT]
LEVELS = [LabelLevel.NARATIVES, LabelLevel.SUBNARATIVES]
TEXT_COL = "text"
RANDOM_STATE = 42

models = [
    MultiOutputClassifier(MultinomialNB()),
    #MultiOutputClassifier(SVC()),
    MultiOutputClassifier(RandomForestClassifier())
]

coombinations = product(LANGUAGES, LEVELS, models)

all_res = []

for language, level, model in tqdm(coombinations):
    labels = get_file_labels_dataframe(
    Path("../data"),
    language,
    level,
)
    # Read and split into train and test subsets
    text = read_files_to_df(Path("../data/tmp"), language)
    text.index = text["file name"].values
    text = text.drop(columns = ["file name", "file path"])
    df = pd.merge(text, labels, left_index=True, right_index=True)
    X = df[TEXT_COL]
    y = df.drop(columns=[TEXT_COL])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    vectorizer = CountVectorizer(max_features=embedding_size)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    res = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    res["language"] = language.value
    res["level"] = level.value
    res["model_name"] = str(model.estimators_[0])
    res["model"] = deepcopy(model)
    res["f1_macro"] = f1_score(y_test, y_pred, average="macro")
    res["f1_micro"] = f1_score(y_test, y_pred, average="micro")
    res["f1_weighted"] = f1_score(y_test, y_pred, average="weighted")
    res["recal_macro"] = recall_score(y_test, y_pred, average="macro")
    res["recal_micro"] = recall_score(y_test, y_pred, average="micro")
    res["recal_weighted"] = recall_score(y_test, y_pred, average="weighted")
    res["precision_macro"] = precision_score(y_test, y_pred, average="macro")
    res["precision_micro"] = precision_score(y_test, y_pred, average="micro")
    res["precision_weighted"] = precision_score(y_test, y_pred, average="weighted")

    all_res.append(res)

211it [00:00, 7742.22it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
211it [00:00, 9037.42it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
211it [00:00, 9345.28it/s]
  _warn_prf(average, modifier, f"{metric.ca

In [5]:
res_df = pd.DataFrame(all_res)
res_df.sort_values(["language", "level", "f1_macro"], ascending=False)

Unnamed: 0,language,level,model_name,model,f1_macro,f1_micro,f1_weighted,recal_macro,recal_micro,recal_weighted,precision_macro,precision_micro,precision_weighted
14,PT,SUBNARATIVES,MultinomialNB(),MultiOutputClassifier(estimator=MultinomialNB()),0.122684,0.353887,0.383418,0.212674,0.647059,0.647059,0.093038,0.243542,0.282804
15,PT,SUBNARATIVES,RandomForestClassifier(),MultiOutputClassifier(estimator=RandomForestCl...,0.017708,0.071429,0.052941,0.013889,0.039216,0.039216,0.03125,0.4,0.102941
12,PT,NARATIVES,MultinomialNB(),MultiOutputClassifier(estimator=MultinomialNB()),0.32632,0.536965,0.56727,0.574495,0.8625,0.8625,0.238928,0.389831,0.432858
13,PT,NARATIVES,RandomForestClassifier(),MultiOutputClassifier(estimator=RandomForestCl...,0.161975,0.421053,0.358826,0.137041,0.3,0.3,0.252273,0.705882,0.570625
10,HI,SUBNARATIVES,MultinomialNB(),MultiOutputClassifier(estimator=MultinomialNB()),0.055754,0.289157,0.241711,0.058904,0.222222,0.222222,0.067361,0.413793,0.329012
11,HI,SUBNARATIVES,RandomForestClassifier(),MultiOutputClassifier(estimator=RandomForestCl...,0.006771,0.065574,0.062037,0.004092,0.037037,0.037037,0.020833,0.285714,0.203704
8,HI,NARATIVES,MultinomialNB(),MultiOutputClassifier(estimator=MultinomialNB()),0.284848,0.415584,0.4,0.275325,0.363636,0.363636,0.338636,0.484848,0.52803
9,HI,NARATIVES,RandomForestClassifier(),MultiOutputClassifier(estimator=RandomForestCl...,0.038384,0.150943,0.134343,0.025974,0.090909,0.090909,0.075758,0.444444,0.265152
6,EN,SUBNARATIVES,MultinomialNB(),MultiOutputClassifier(estimator=MultinomialNB()),0.085612,0.215054,0.283895,0.132864,0.32967,0.32967,0.094258,0.159574,0.33769
7,EN,SUBNARATIVES,RandomForestClassifier(),MultiOutputClassifier(estimator=RandomForestCl...,0.017544,0.258621,0.160787,0.01491,0.164835,0.164835,0.027282,0.6,0.18158


In [6]:
res_df[["model_name","f1_macro", "recal_macro", "precision_macro"]].groupby("model_name").agg(["mean", "std"]).sort_values(("f1_macro", "mean"), ascending=False)

Unnamed: 0_level_0,f1_macro,f1_macro,recal_macro,recal_macro,precision_macro,precision_macro
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
MultinomialNB(),0.190844,0.110106,0.292809,0.18358,0.172694,0.102337
RandomForestClassifier(),0.044626,0.052123,0.035758,0.044361,0.079812,0.082257


In [7]:
res_df[["language","f1_macro", "recal_macro", "precision_macro"]].groupby("language").agg(["mean", "std"]).sort_values(("f1_macro", "mean"), ascending=False)

Unnamed: 0_level_0,f1_macro,f1_macro,recal_macro,recal_macro,precision_macro,precision_macro
Unnamed: 0_level_1,mean,std,mean,std,mean,std
language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
PT,0.157172,0.128158,0.234525,0.240998,0.153872,0.109017
EN,0.11043,0.116445,0.166729,0.207414,0.124298,0.094955
BG,0.106898,0.118555,0.164806,0.202239,0.101195,0.090667
HI,0.096439,0.127232,0.091074,0.124883,0.125647,0.144033


In [8]:
narratives_model = res_df.query("language == 'EN' and level == 'NARATIVES' and model_name == 'MultinomialNB()'")["model"].values[0]

labels = get_file_labels_dataframe(
    Path("../data"),
    Language.EN,
    LabelLevel.NARATIVES,
)
text = read_files_to_df(Path("../data/tmp"), Language.EN)
text.index = text["file name"].values
text = text.drop(columns = ["file name", "file path"])
df = pd.merge(text, labels, left_index=True, right_index=True)
X = df[TEXT_COL]
y = df.drop(columns=[TEXT_COL])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

200it [00:00, 4608.39it/s]


In [9]:
narratives_model.predict(X_test).shape

NotFittedError: This MultiOutputClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [22]:
y_test.columns

Index(['CC: Amplifying Climate Fears', 'CC: Climate change is beneficial',
       'CC: Controversy about green technologies',
       'CC: Criticism of climate movement',
       'CC: Criticism of climate policies',
       'CC: Criticism of institutions and authorities',
       'CC: Downplaying climate change',
       'CC: Green policies are geopolitical instruments',
       'CC: Hidden plots by secret schemes of powerful groups',
       'CC: Questioning the measurements and science', 'Other',
       'URW: Amplifying war-related fears',
       'URW: Blaming the war on others rather than the invader',
       'URW: Discrediting Ukraine', 'URW: Discrediting the West, Diplomacy',
       'URW: Distrust towards Media',
       'URW: Hidden plots by secret schemes of powerful groups',
       'URW: Negative Consequences for the West', 'URW: Overpraising the West',
       'URW: Praise of Russia', 'URW: Russia is the Victim',
       'URW: Speculating war outcomes'],
      dtype='object')

In [20]:
labels.columns

Index(['CC: Amplifying Climate Fears', 'CC: Climate change is beneficial',
       'CC: Controversy about green technologies',
       'CC: Criticism of climate movement',
       'CC: Criticism of climate policies',
       'CC: Criticism of institutions and authorities',
       'CC: Downplaying climate change',
       'CC: Green policies are geopolitical instruments',
       'CC: Hidden plots by secret schemes of powerful groups',
       'CC: Questioning the measurements and science', 'Other',
       'URW: Amplifying war-related fears',
       'URW: Blaming the war on others rather than the invader',
       'URW: Discrediting Ukraine', 'URW: Discrediting the West, Diplomacy',
       'URW: Distrust towards Media',
       'URW: Hidden plots by secret schemes of powerful groups',
       'URW: Negative Consequences for the West', 'URW: Overpraising the West',
       'URW: Praise of Russia', 'URW: Russia is the Victim',
       'URW: Speculating war outcomes'],
      dtype='object')