In [1]:
from pathlib import Path
from enum import Enum
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm

In [2]:
class Language(Enum):
    BG = "BG"
    EN = "EN"
    HI = "HI"
    PT = "PT"

class TokenType(Enum):
    DEFAULT = 1
    LEMMA = 3

class LabelLevel(Enum):
    NARATIVES = "NARATIVES"
    SUBNARATIVES = "SUBNARATIVES"

class Topic(Enum):
    CLIMATE_CHANGE = "cc"
    UKRAINE = "ua"

In [3]:
def read_file_as_string(filepath: Path, tokenType = TokenType.DEFAULT) -> str:
    lines = open(filepath).readlines()[1:-1]
    res = []
    for line in lines:
        res.append(line.split("\t")[tokenType.value])
    return " ".join(res)

filepath = Path("../data/tmp/EN/EN_CC_100000.conllu")
read_file_as_string(filepath)

'pentagon plans serve labgrown meat troops name climate change washington free beacon reported biomade publicprivate partnership received million dod responsible endeavor biomade announced website june looking new ideas reduce co emissions food production transport military sites ideas include novel cell culture methods suitable production cultivated meat protein essentially euphemism labgrown fake meat labgrown meat new technology animal muscle fat tissues grown modified animal cells special equipment process uses mix chemicals pressure temperature create meat resembles beef chicken pork though still experimental stage labgrown meat sparked debate efficiency ethics producing meat without killing animals related labmade chicken meat grown cancer cells receives fda approval ready eat tumor nuggets according study researchers university californiadavis carbon footprint labgrown beef might worse product seeks replace zoomed use highly refined purified growth media used help animal cells m

In [4]:
def read_files_to_df(basedir: Path, language: Language) -> DataFrame:
    dir = basedir / language.value
    res = []
    for filepath in tqdm(dir.iterdir()):
        res.append({
            "file path": filepath,
            "file name": filepath.name,
            "text": read_file_as_string(filepath),
            "type": filepath.name.split("_")[1]
        })
    return pd.DataFrame(data=res)
basedir = Path("../data/tmp")
df = read_files_to_df(basedir, Language.EN)
df

200it [00:00, 7068.97it/s]


Unnamed: 0,file path,file name,text,type
0,../data/tmp/EN/EN_UA_023211.conllu,EN_UA_023211.conllu,europe putin thanks us journalist tucker carls...,UA
1,../data/tmp/EN/EN_CC_100137.conllu,EN_CC_100137.conllu,letters editor korean bbq is nt going away las...,CC
2,../data/tmp/EN/EN_UA_104791.conllu,EN_UA_104791.conllu,russia could split global internet us fears ru...,UA
3,../data/tmp/EN/EN_UA_104434.conllu,EN_UA_104434.conllu,putin amassed staggering six thousand nuclear ...,UA
4,../data/tmp/EN/EN_UA_102703.conllu,EN_UA_102703.conllu,millions left aid west africa suffers worst hu...,UA
...,...,...,...,...
195,../data/tmp/EN/EN_CC_100095.conllu,EN_CC_100095.conllu,climate change hoaxer greta thunberg charged d...,CC
196,../data/tmp/EN/EN_UA_002531.conllu,EN_UA_002531.conllu,kyivs mayor says he s ready fight russians con...,UA
197,../data/tmp/EN/EN_UA_018789.conllu,EN_UA_018789.conllu,rouble weakens month low past dollar gmt roubl...,UA
198,../data/tmp/EN/EN_UA_004616.conllu,EN_UA_004616.conllu,one predict wars length zelenskiy although ukr...,UA


In [34]:
ccLabelDir = Path("../data/labels/cc-labels.txt")
uaLabelDir = Path("../data/labels/ua-labels.txt")

def get_top_lvl_label_mappings(labelDir: Path):
    lines = open(labelDir).readlines()
    top_lvl_labels = [
        line.strip()
        for line in lines
        if line[0] != "-"
    ]
    top_lvl_labels_to_index = {
        label:indx
        for indx, label in enumerate(top_lvl_labels)
    }
    top_lvl_indx_to_labels = {
        indx:label
        for indx, label in enumerate(top_lvl_labels)
    }
    return top_lvl_labels_to_index, top_lvl_indx_to_labels 

def get_low_lvl_label_mappings(labelDir: Path):
    lines = open(labelDir).readlines()
    low_lvl_labels = []
    prefix = ""
    for line in lines:
        if line.strip() == "Other":
            low_lvl_labels.append("Other")
        elif line[0] != "-":
            prefix = f"{line.strip()}: "
            low_lvl_labels.append(prefix + "Other")
        else:
            low_lvl_labels.append(prefix + line[1:].strip())

    low_lvl_labels_to_index = {
        label:indx
        for indx, label in enumerate(low_lvl_labels)
    }
    low_lvl_indx_to_labels = {
        indx:label
        for indx, label in enumerate(low_lvl_labels)
    }
    return low_lvl_labels_to_index, low_lvl_indx_to_labels

cc_top_to_index, _ = get_top_lvl_label_mappings(ccLabelDir)
ua_top_to_index, _ = get_top_lvl_label_mappings(uaLabelDir)
all_top_lvl_labels = set([f"CC: {el}" if el != "Other" else el for el in cc_top_to_index.keys() ] + [f"URW: {el}" if el != "Other" else el for el in ua_top_to_index.keys() ])
top_lvl_to_index = {label:i for i, label in enumerate(all_top_lvl_labels)}
index_to_top_lvl = {i:label for i, label in enumerate(all_top_lvl_labels)}

cc_low_to_index, _ = get_low_lvl_label_mappings(ccLabelDir)
ua_low_to_index, _ = get_low_lvl_label_mappings(uaLabelDir)
all_low_lvl_labels = set([f"CC: {el}" if el != "Other" else el for el in cc_low_to_index.keys() ] + [f"URW: {el}" if el != "Other" else el for el in ua_low_to_index.keys() ])

In [85]:
ua_low_to_index

{'Other': 0,
 'Blaming the war on others rather than the invader: Other': 1,
 'Blaming the war on others rather than the invader: Ukraine is the aggressor': 2,
 'Blaming the war on others rather than the invader: The West are the aggressors': 3,
 'Discrediting Ukraine: Other': 4,
 'Discrediting Ukraine: Rewriting Ukraine’s history': 5,
 'Discrediting Ukraine: Discrediting Ukrainian nation and society': 6,
 'Discrediting Ukraine: Discrediting Ukrainian military': 7,
 'Discrediting Ukraine: Discrediting Ukrainian government and officials and policies': 8,
 'Discrediting Ukraine: Ukraine is a puppet of the West': 9,
 'Discrediting Ukraine: Ukraine is a hub for criminal activities': 10,
 'Discrediting Ukraine: Ukraine is associated with nazism': 11,
 'Discrediting Ukraine: Situation in Ukraine is hopeless': 12,
 'Russia is the Victim: Other': 13,
 'Russia is the Victim: The West is russophobic': 14,
 'Russia is the Victim: Russia actions in Ukraine are only self-defence': 15,
 'Russia is t

In [86]:
import numpy as np
def get_file_labels_dataframe(
        data_dir: Path,
        language: Language,
        labelLevel: LabelLevel
) -> tuple[DataFrame, dict, dict]:    

    annotation_file = data_dir / "training_data_16_October_release"/ language.value /"subtask-2-annotations.txt"
    ccLabelDir = data_dir / "labels" / "cc-labels.txt"
    uaLabelDir = data_dir / "labels" / "ua-labels.txt"

    #Get label mapper
    label_to_indx = {}
    if labelLevel == LabelLevel.NARATIVES:
        labels_to_indx_cc, _ = get_top_lvl_label_mappings(ccLabelDir)
        labels_to_indx_ua, _ = get_top_lvl_label_mappings(uaLabelDir)

    else:
        labels_to_indx_cc, _ = get_low_lvl_label_mappings(ccLabelDir)
        labels_to_indx_ua, _ = get_low_lvl_label_mappings(uaLabelDir)
    
    all_labels = list(set([f"CC: {el}" if el != "Other" else el for el in labels_to_indx_cc.keys() ] + [f"URW: {el}" if el != "Other" else el for el in labels_to_indx_ua.keys() ]))
    all_labels = sorted(all_labels)
    label_to_indx = {label:i for i, label in enumerate(all_labels)}
    print(label_to_indx)

    # Read file annotations and generate labels
    lines = open(annotation_file).readlines()
    filenames = []
    label_indxs = []
    for line in lines:
        filename, naratives, subnaratives = line.strip().split("\t")
        to_labels = naratives if labelLevel == LabelLevel.NARATIVES else subnaratives
        labels = set(to_labels.split(";"))
        maped_labels = list(map(lambda x: label_to_indx.get(x, None), labels))
        filenames.append(filename.strip())
        label_indxs.append(maped_labels)

    n = len(filenames)
    m = len(all_labels)
    x = np.zeros((n,m))
    for i, entry in enumerate(label_indxs):
        x[i][entry] = 1
    df = pd.DataFrame(columns=all_labels, data=x, dtype=np.bool_)
    df.index = filenames
    return df

In [91]:
df = get_file_labels_dataframe(
    Path("../data"),
    Language.HI,
    LabelLevel.SUBNARATIVES
)

{'CC: Amplifying Climate Fears: Amplifying existing fears of global warming': 0, 'CC: Amplifying Climate Fears: Doomsday scenarios for humans': 1, 'CC: Amplifying Climate Fears: Earth will be uninhabitable soon': 2, 'CC: Amplifying Climate Fears: Other': 3, 'CC: Amplifying Climate Fears: Whatever we do it is already too late': 4, 'CC: Climate change is beneficial: CO2 is beneficial': 5, 'CC: Climate change is beneficial: Other': 6, 'CC: Climate change is beneficial: Temperature increase is beneficial': 7, 'CC: Controversy about green technologies: Nuclear energy is not climate friendly': 8, 'CC: Controversy about green technologies: Other': 9, 'CC: Controversy about green technologies: Renewable energy is costly': 10, 'CC: Controversy about green technologies: Renewable energy is dangerous': 11, 'CC: Controversy about green technologies: Renewable energy is unreliable': 12, 'CC: Criticism of climate movement: Ad hominem attacks on key activists': 13, 'CC: Criticism of climate movement:

CC: Amplifying Climate Fears: Amplifying existing fears of global warming             12
CC: Amplifying Climate Fears: Doomsday scenarios for humans                            2
CC: Amplifying Climate Fears: Earth will be uninhabitable soon                         1
CC: Amplifying Climate Fears: Other                                                    2
CC: Amplifying Climate Fears: Whatever we do it is already too late                    0
                                                                                      ..
URW: Russia is the Victim: UA is anti-RU extremists                                    2
URW: Speculating war outcomes: Other                                                   6
URW: Speculating war outcomes: Russian army is collapsing                              3
URW: Speculating war outcomes: Russian army will lose all the occupied territories     3
URW: Speculating war outcomes: Ukrainian army is collapsing                            7
Length: 96, dtype: in

In [80]:
df["Other"]

EN_UA_103861.txt     True
EN_UA_103667.txt     True
EN_UA_021270.txt    False
EN_UA_103403.txt     True
EN_CC_100145.txt     True
                    ...  
EN_UA_013257.txt    False
EN_UA_000104.txt     True
EN_UA_102958.txt    False
EN_UA_027787.txt    False
EN_CC_100139.txt     True
Name: Other, Length: 200, dtype: bool