In [6]:
import pandas as pd
import re
import spacy
from tqdm import tqdm

tqdm.pandas()

In [7]:
en_train_path = (
    "../data/processed/processed_CT24_checkworthy_english/processed_train.tsv"
)
du_train_path = "../data/processed/processed_CT24_checkworthy_dutch/processed_dutch_train.tsv"  # noqa
es_train_path = "../data/processed/processed_CT24_checkworthy_spanish/processed_spanish_train.tsv"  # noqa
ar_train_path = "../data/processed/processed_CT24_checkworthy_arabic/processed_arabic_train.tsv"  # noqa

en_train = pd.read_csv(en_train_path, sep="\t")
du_train = pd.read_csv(du_train_path, sep="\t")
es_train = pd.read_csv(es_train_path, sep="\t")
ar_train = pd.read_csv(ar_train_path, sep="\t")

In [8]:
import re


def correct_tags(text):
    pattern = r"(<s>)+|(<\/s>)+"

    def replace_func(match):
        tag = match.group(0)
        if "<s>" in tag:
            return "<s>"
        else:
            return "</s>"

    corrected_text = re.sub(pattern, replace_func, text)
    return corrected_text

In [14]:
# https://spacy.io/models
# python -m spacy download en_core_web_trf
# python -m spacy download nl_core_news_lg
# python -m spacy download es_dep_news_trf
# python -m spacy download xx_sent_ud_sm

spacy_models = {
    "en": "en_core_web_sm",
    "nl": "nl_core_news_lg",
    "es": "es_core_news_sm",
    "ar": "xx_sent_ud_sm",
}

for dataset, dataset_path in [
    (en_train, en_train_path),
    (du_train, du_train_path),
    (es_train, es_train_path),
    (ar_train, ar_train_path),
]:
    if "english" in dataset_path:
        nlp = spacy.load(spacy_models["en"])
        output_path = "../data/processed/processed_CT24_checkworthy_english/processed_train.tsv"  # noqa: E501
    elif "dutch" in dataset_path:
        nlp = spacy.load(spacy_models["nl"])
        output_path = "../data/processed/processed_CT24_checkworthy_dutch/processed_dutch_train.tsv"  # noqa: E501
    elif "spanish" in dataset_path:
        nlp = spacy.load(spacy_models["es"])
        output_path = "../data/processed/processed_CT24_checkworthy_spanish/processed_spanish_train.tsv"  # noqa: E501
    elif "arabic" in dataset_path:
        nlp = spacy.load(spacy_models["ar"])
        output_path = "../data/processed/processed_CT24_checkworthy_arabic/processed_arabic_train.tsv"  # noqa: E501
    else:
        raise ValueError("Invalid dataset")

    def add_stop_token(text: str) -> str:
        doc = nlp(text)
        sentences = "".join([f"<s>{sent.text}</s>" for sent in doc.sents])
        sentences = correct_tags(sentences)
        return sentences

    dataset["tweet_text"] = dataset["tweet_text"].progress_apply(add_stop_token)

    display(dataset)

    dataset.to_csv(output_path, sep="\t")

100%|██████████| 22494/22494 [01:39<00:00, 226.15it/s]


Unnamed: 0,Sentence_id,tweet_text,class_label,text_length,hashtags,mentions,text_length_category,class_label_encoded,hashtags_frequency,hashtags_sentiment,hashtags_topics
0,30313,<s></s><s>And so I know that this campaign has...,No,118,,,100+,0,0,0.0,0
1,19099,"<s>Now, let's balance the budget and protect M...",No,94,,,91-100,0,0,0.0,0
2,33964,<s>I'd like to mention one thing.</s>,No,30,,,21-30,0,0,0.0,0
3,16871,<s>I must remind him the Democrats have contro...,Yes,124,,,100+,1,0,0.0,0
4,13150,<s>And to take a chance - now be - and not mak...,No,160,,,100+,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
22489,29631,"<s>It would be squandered, too, believe me.</s>",No,42,,,41-50,0,0,0.0,0
22490,7136,<s>We're not allowed to vote on it.</s>,Yes,32,,,31-40,1,0,0.0,0
22491,181,<s>More Americans at work today than any time ...,Yes,96,,,91-100,1,0,0.0,0
22492,12863,<s>We indicated at that time that we were not ...,Yes,73,,,71-80,1,0,0.0,0


100%|██████████| 994/994 [00:05<00:00, 170.94it/s]


Unnamed: 0,tweet_id,tweet_text,class_label,text_length,hashtags,mentions,text_length_category,class_label_encoded,hashtags_frequency,hashtags_sentiment,hashtags_topics
0,1240603314980392966,<s>#SamenTegenCorona:</s><s>applaus voor zorgh...,No,98,SamenTegenCorona,,91-100,0,1,0.0,0
1,1238094774453833730,<s>Kabinet ondersteunt ondernemer in Corona-cr...,No,69,coronavirusNederland,,61-70,0,1,0.0,1
2,1238154603583156225,<s>Heropening van het @<USER> in #Oosterbeek</...,No,84,Oosterbeek Airborne Covid19 coronavirus,airbornemuseum,81-90,0,4,0.0,3
3,1239152526026518534,<s>Aantal restaurants in</s><s>#groningen nu o...,Yes,56,groningen blijfthuis,,51-60,1,2,0.0,2
4,1244341963480076290,<s>Nederland heeft het niet in de hand.</s><s>...,Yes,220,RIVM COVID2019NL Coronavirusnl RIVM coronapati...,,100+,1,5,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...
989,1253322004544892937,<s>Door Covid-19 zetten organisaties massaal i...,No,249,bedrijfscontinuïteit IT wayofworking,,100+,0,3,0.0,0
990,1240614429127041024,<s>Ons crisisteam</s><s>#Corona gaat locaties ...,No,257,Corona vitaleberoepen Waardering noodopvang ki...,,100+,0,7,0.0,3
991,1305431406647087106,<s>@<USER></s><s</s><s>></s><s>En die Covid-19...,Yes,214,,demandieallesk2 aajbrouwer volkskrant,100+,1,0,0.0,0
992,1268546629935804418,<s></s><s>Vandaag op onze website: nieuwe vers...,No,234,,gkv ngk_nl,100+,0,0,0.0,0


100%|██████████| 19942/19942 [01:25<00:00, 234.50it/s]


Unnamed: 0,tweet_id,tweet_text,class_label,text_length,hashtags,mentions,text_length_category,class_label_encoded,hashtags_frequency,hashtags_sentiment,hashtags_topics
0,1192517071348699136,<s>Mañana viernes no puedes perderte el gran a...,No,45,[],"['@Pablo_Iglesias_,', '@AdaColau,', '@Irene_Mo...",41-50,0,0,0.0,0
1,1220987059701190656,"<s>DIRECTO Reyes Maroto: ""Si nos centramos en ...",No,35,[],[],31-40,0,0,0.0,4
2,1226786513226280961,"<s>@<USER> diputado : ""Que los partidos catala...",No,34,[],"['@JonInarritu', '@EHBilduCongreso', '@JonInar...",31-40,0,0,0.0,1
3,1217566329823252481,<s>@<USER> vuelve a quedar en evidencia.</s><s...,Yes,43,['#CGPJ'],"['@tve_tve', '@rtve', '@Enric_Hernandez']",41-50,1,1,0.0,3
4,1172059020095610881,<s>El nuevo curso escolar en #Andalucía comien...,Yes,42,['#Andalucía'],[],41-50,1,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
19937,189627,<s>Cuando Inés Arrimadas le contó dijo a Pablo...,Yes,72,[],[],71-80,1,0,0.0,4
19938,222938,<s>Al menos yo ser presidente no tocar ese asu...,No,17,[],[],11-20,0,0,0.0,4
19939,207681,<s>En memoria de quienes durante años nos han ...,No,10,[],[],0-10,0,0,0.0,0
19940,330393,<s>Enrique López Muchas gracias por estar con ...,No,16,[],[],11-20,0,0,0.0,3


100%|██████████| 7333/7333 [00:06<00:00, 1221.74it/s]


Unnamed: 0,tweet_id,tweet_text,class_label,text_length,hashtags,mentions,text_length_category,class_label_encoded,hashtags_frequency,hashtags_sentiment,hashtags_topics
0,1221949644554587904,"<s>وأي خيانة أكبر من خيانة الدين , الوطن , اله...",No,189,صفقة_القرن القدس_عاصمة_فلسطين_الأبدية,,100+,0,2,0.0,4
1,1222030473385345024,<s>صباح الخير على فلسطين وقدسها وشعبها ومهجرين...,No,169,القدس_عاصمة_فلسطين_الأبديه تسقط_صفقة_الوهم,,100+,0,2,0.0,1
2,1222035929105338368,<s>ذنبگ أنگ جميله گ يوسف وخانگ العالم گ أاخوات...,No,98,صفقة_القرن_لن_تمر,,91-100,0,1,0.0,1
3,1222048121145962496,<s>لا يلزمك ان تكون فلسطينياً لتحب فلسطين، حُب...,No,168,صباح_الخير القدس_عاصمة_فلسطين_الأبدية,,100+,0,2,0.0,4
4,1222053294266372096,<s>#</s><s>مناصرون رحل البروفسور زيك فريد فوغل...,No,270,مناصرون صفقة_القرن_لن_تمر,,100+,0,2,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
7328,1366469668089827328,<s>وزير الصحة: ربط قبول المشاركين في موسم #الح...,No,106,الحج كورونا,,100+,0,2,0.0,2
7329,1296531196462202880,<s>#</s><s>روسيا تخطط لتصدير اللقاح ضد #كورونا...,Yes,58,روسيا كورونا,,51-60,1,2,0.0,0
7330,1293180035663241216,"<s>#</s><s>روسيا تطلق اسم ""سبوتنيك 5"" على أول ...",No,75,روسيا فيروس_كورونا كورونا,,71-80,0,3,0.0,0
7331,1242563204661809152,<s>#</s><s>عاجل برئاسة #خادم_الحرمين_الشريفين ...,No,147,عاجل خادم_الحرمين_الشريفين مجموعة_العشرين فيرو...,,100+,0,5,0.0,1
