In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import spacy
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from textblob import TextBlob
from sklearn.utils import resample

In [28]:
def _read_tsv(path):
    """Read one tab-separated CheckThat! split into a DataFrame."""
    return pd.read_csv(path, sep="\t")


# Training splits, one per language (English, Dutch, Spanish, Arabic).
en_train = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_english/processed_train.tsv"
)
du_train = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_dutch/processed_dutch_train.tsv"
)
es_train = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_spanish/processed_spanish_train.tsv"
)
ar_train = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_arabic/processed_arabic_train.tsv"
)

# The "*_test" names are loaded from the "dev" files of each language.
en_test = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_english/processed_dev.tsv"
)
du_test = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_dutch/processed_dutch_dev.tsv"
)
es_test = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_spanish/processed_spanish_dev.tsv"
)
ar_test = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_arabic/processed_arabic_dev.tsv"
)

# The "*_dev_test" names are loaded from the "dev_test" files of each language.
en_dev_test = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_english/processed_dev_test.tsv"
)
du_dev_test = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_dutch/processed_dutch_dev_test.tsv"
)
es_dev_test = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_spanish/processed_spanish_dev_test.tsv"  # noqa
)
ar_dev_test = _read_tsv(
    "../data/processed/processed_CT24_checkworthy_arabic/processed_arabic_dev_test.tsv"
)


def resample_to_fixed_number(
    df, n_samples=10000, label_col="class_label", random_state=567
):
    """Return a class-balanced sample of ``df`` with ``n_samples // 2`` rows per class.

    Rows whose ``label_col`` equals "Yes" and rows whose ``label_col`` equals
    "No" are sampled separately. A class with fewer than ``n_samples // 2``
    rows is sampled with replacement (over-sampling, so rows are duplicated);
    otherwise it is sampled without replacement (under-sampling). Rows with any
    other label value are dropped.

    The original class sizes are printed as "<n_yes> <n_no>" before sampling.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``label_col``.
    n_samples : int, default 10000
        Target total size. Each class receives ``n_samples // 2`` rows, so an
        odd value yields ``n_samples - 1`` rows in total.
    label_col : str, default "class_label"
        Name of the column holding the "Yes" / "No" labels.
    random_state : int, default 567
        Seed forwarded to ``sklearn.utils.resample`` for both classes.

    Returns
    -------
    pandas.DataFrame
        All sampled "Yes" rows followed by all sampled "No" rows. The original
        index labels are kept, so they may repeat after over-sampling.

    Raises
    ------
    ValueError
        If one of the classes has no rows while ``n_samples // 2`` is positive;
        an empty class cannot be over-sampled.
    """
    per_class = n_samples // 2
    # Insertion order matters: "Yes" rows come first in the returned frame.
    subsets = {label: df[df[label_col] == label] for label in ("Yes", "No")}
    print(len(subsets["Yes"]), len(subsets["No"]))

    # Fail with an actionable message instead of the opaque error raised when
    # sampling with replacement from zero rows.
    for label, subset in subsets.items():
        if subset.empty and per_class > 0:
            raise ValueError(
                f"Cannot draw {per_class} rows for class {label!r}: "
                f"no rows with {label_col} == {label!r} in the input."
            )

    return pd.concat(
        [
            resample(
                subset,
                # Over-sample only when the class is too small to fill its quota.
                replace=len(subset) < per_class,
                n_samples=per_class,
                random_state=random_state,
            )
            for subset in subsets.values()
        ]
    )


# Balance every split to a fixed size: 10000 rows per language for training,
# 500 rows per language for the two evaluation splits. The calls run in the
# order en, du, es, ar within each group, so the printed class counts keep
# that order.
# NOTE(review): the evaluation splits are balanced (and over-sampled when a
# class is small) just like the training data, so scores measured on them will
# not reflect the natural class distribution -- confirm this is intended.
en_train, du_train, es_train, ar_train = (
    resample_to_fixed_number(frame, 10000)
    for frame in (en_train, du_train, es_train, ar_train)
)

en_test, du_test, es_test, ar_test = (
    resample_to_fixed_number(frame, 500)
    for frame in (en_test, du_test, es_test, ar_test)
)

en_dev_test, du_dev_test, es_dev_test, ar_dev_test = (
    resample_to_fixed_number(frame, 500)
    for frame in (en_dev_test, du_dev_test, es_dev_test, ar_dev_test)
)

5413 17081
404 590
3120 16822
2243 5090
238 794
102 150
704 4296
411 682
108 210
316 350
509 4491
377 123


In [29]:
# Show the (rows, columns) shape of each resampled training frame.
display(en_train.shape, du_train.shape, es_train.shape, ar_train.shape)

(10000, 11)

(10000, 11)

(10000, 11)

(10000, 11)

In [30]:
# Sanity check: tally "Yes" versus non-"Yes" labels over all training frames.
# One vectorised comparison per frame replaces a per-row `.iloc` lookup.
ones = []
zeroes = []
for df in [en_train, du_train, es_train, ar_train]:
    # int(...) keeps the list repetition below a plain Python operation.
    n_yes = int((df["class_label"] == "Yes").sum())
    ones.extend([1] * n_yes)
    # Anything that is not exactly "Yes" counts as a zero.
    zeroes.extend([0] * (len(df) - n_yes))

print("Ones:", len(ones))
print("Zeroes:", len(zeroes))

Ones: 20000
Zeroes: 20000


In [31]:
trains = [en_train, du_train, es_train, ar_train]
tests = [en_test, du_test, es_test, ar_test]
dev_tests = [en_dev_test, du_dev_test, es_dev_test, ar_dev_test]

# Columns retained in the merged files; every other column is dropped.
cols_to_keep = [
    "tweet_text",
    "Text",
    "class_label",
    "hashtags",
    "mentions",
    "text_length",
    "text_length_category",
    "hashtags_frequency",
    "hashtags_sentiment",
    "hashtags_topics",
]

# Seed for the row shuffles below, so the exported files are identical on
# every run (same value as the seed used for resampling).
SHUFFLE_SEED = 567


def _merge_shuffle_encode(frames, seed=SHUFFLE_SEED):
    """Concatenate ``frames``, shuffle the rows reproducibly, encode labels as 0/1."""
    merged = (
        pd.concat(frames).sample(frac=1, random_state=seed).reset_index(drop=True)
    )
    # "Yes" -> 1, anything else -> 0.
    merged["label"] = (merged["label"] == "Yes").astype("int64")
    return merged


# Harmonise the schema of every frame in place, so the per-language variables
# end up with the same pruned and renamed columns as the merged frames.
for frame in trains + tests + dev_tests:
    cols_to_drop = [col for col in frame.columns if col not in cols_to_keep]
    frame.drop(columns=cols_to_drop, inplace=True)
    # NOTE(review): both "Text" and "tweet_text" map to "text"; this assumes no
    # single frame carries both columns -- confirm against the input files.
    frame.rename(
        columns={"Text": "text", "class_label": "label", "tweet_text": "text"},
        inplace=True,
    )

merged_train = _merge_shuffle_encode(trains)
merged_test = _merge_shuffle_encode(tests)
merged_dev_test = _merge_shuffle_encode(dev_tests)

merged_train.to_csv("../data/processed/merged_train.tsv", sep="\t", index=False)
merged_test.to_csv("../data/processed/merged_test.tsv", sep="\t", index=False)
merged_dev_test.to_csv("../data/processed/merged_dev_test.tsv", sep="\t", index=False)