# TXT to CSV conversion

Data aggregation scripts to assemble CSV datasets. 

In [2]:
import pandas as pd
import numpy as np
import os

## TEXT DATA

In [None]:
# TRAIN DATA language subfolders
data = []

output_directory = '/Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TRAIN'
root_directory = '/Users/aahepburn/Development/Data/Train Data'
raw_folder = 'raw-documents'

for lang_folder in os.listdir(root_directory):
    language_path = os.path.join(root_directory, lang_folder)
    raw_documents_path = os.path.join(language_path, raw_folder)
    if not os.path.isdir(language_path):
        continue 
    for file in os.listdir(raw_documents_path):
        file_path = os.path.join(raw_documents_path, file)
        with open(file_path,'r', encoding='utf-8') as file:
            content = file.read()
            data.append({
                        "Filename": os.path.basename(file_path),
                        "Language": lang_folder, 
                        "Content": content
                    })

df1 = pd.DataFrame(data, header=None)

# TRAIN DATA - extra Russian articles
data = []

root_directory = '/Users/aahepburn/Development/Data/More Train Data RU/raw-documents'

for file in os.listdir(root_directory):
    file_path = os.path.join(root_directory, file)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        data.append({
                    "Filename": os.path.basename(file_path),
                    "Language": 'RU', 
                    "Content": content
                })
        
# Save combined CSV file        
df2 = pd.DataFrame(data, header=None)
df = pd.concat([df1,df2])
df.to_csv(output_directory + '/train_raw_text.csv')


In [None]:
# DEV SET - language subfolders - subtask 1
data = []

output_directory = '/Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data'
root_directory = '/Users/aahepburn/Development/Data/Dev Set Labels'
raw_folder = 'subtask-2-documents'

for lang_folder in os.listdir(root_directory):
    language_path = os.path.join(root_directory, lang_folder)
    raw_documents_path = os.path.join(language_path, raw_folder)
    if not os.path.isdir(language_path):
        continue 
    for file in os.listdir(raw_documents_path):
        file_path = os.path.join(raw_documents_path, file)
        with open(file_path,'r', encoding='utf-8') as file:
            content = file.read()
            data.append({
                        "Filename": os.path.basename(file_path),
                        "Language": lang_folder, 
                        "Content": content
                    })

dev_s1 = pd.DataFrame(data)
dev_s1.to_csv(output_directory + '/dev_s1_nolabels.csv')

In [None]:
# DEV SET - language subfolders - subtask 2
data = []

output_directory = '/Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data'
root_directory = '/Users/aahepburn/Development/Data/Dev Set Labels'
raw_folder = 'subtask-2-documents'

for lang_folder in os.listdir(root_directory):
    language_path = os.path.join(root_directory, lang_folder)
    raw_documents_path = os.path.join(language_path, raw_folder)
    if not os.path.isdir(language_path):
        continue 
    for file in os.listdir(raw_documents_path):
        file_path = os.path.join(raw_documents_path, file)
        with open(file_path,'r', encoding='utf-8') as file:
            content = file.read()
            data.append({
                        "Filename": os.path.basename(file_path),
                        "Language": lang_folder, 
                        "Content": content
                    })

dev_s2 = pd.DataFrame(data)
dev_s2.to_csv(output_directory + '/dev_s2_nolabels.csv')

## LABELS

In [None]:
# TRAIN labels ---> read TXT ---> write CSV

# Define root and output directories
root_directory = '/Users/aahepburn/Development/Data/Train Data'
subtask1 = 'subtask-1-annotations.txt'
subtask2 = 'subtask-2-annotations.txt'
output_directory = '/Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data'
os.makedirs(output_directory, exist_ok=True)

for lang_folder in os.listdir(root_directory):
    language_path = os.path.join(root_directory, lang_folder)
    if not os.path.isdir(language_path): 
        continue 

    # subtask-1-annotations.txt
    file_path = os.path.join(language_path, subtask1)
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
            data = [line.strip().split('\t') for line in lines]
            max_cols = max(len(row) for row in data)
            column_names = ['File', 'Entity', 'Start', 'End'] + [f'Label{i}' for i in range(1, max_cols - 3)]
            subtask1_df = pd.DataFrame(data, columns=column_names)

            output_file = os.path.join(output_directory, f'subtask1_{lang_folder}.csv')
            subtask1_df.to_csv(output_file, index=False)
            print(f"Saved: {output_file}")

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    # subtask-2-annotations.txt
    file_path = os.path.join(language_path, subtask2)
    if os.path.exists(file_path):
        try:
            subtask2_df = pd.read_csv(file_path, sep='\t', header=None, engine='python')
            output_file = os.path.join(output_directory, f'subtask2_{lang_folder}.csv')
            
            subtask2_df.to_csv(output_file, index=False)
            print(f"Saved: {output_file}")

        except Exception as e:
            print(f"Error processing {file_path}: {e}")


Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask1_RU.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask2_RU.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask1_PT.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask2_PT.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask1_BG.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask2_BG.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask1_HI.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask2_HI.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask1_EN.csv
Saved: /Users/aahepburn/Deve

In [57]:
output_directory = '/Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data'

with open('/Users/aahepburn/Development/Data/More Train Data RU/subtask-1-annotations.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
data = [line.strip().split('\t') for line in lines]
max_cols = max(len(row) for row in data)
column_names = ['File', 'Entity', 'Start', 'End'] + [f'Label{i}' for i in range(1, max_cols - 3)]
subtask1_df = pd.DataFrame(data, columns=column_names)

output_file = os.path.join(output_directory, f'subtask1moreRU_{lang_folder}.csv')
subtask1_df.to_csv(output_file, index=False)
print(f"Saved: {output_file}")

Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask1moreRU_EN.csv


In [58]:

subtask2_df = pd.read_csv('/Users/aahepburn/Development/Data/More Train Data RU/subtask-2-annotations.txt', sep='\t', header=None, engine='python')
output_file = os.path.join(output_directory, f'subtask2moreRU_{lang_folder}.csv')
subtask2_df.to_csv(output_file, index=False)
print(f"Saved: {output_file}")

Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/subtask2moreRU_EN.csv


In [3]:
# DEV labels ---> read TXT ---> write CSV

# Define root and output directories
root_directory = '/Users/aahepburn/Development/Data/Dev Set Labels'
subtask1 = 'subtask-1-annotations.txt'
subtask2 = 'subtask-2-annotations.txt'
output_directory = '/Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST'
os.makedirs(output_directory, exist_ok=True)

for lang_folder in os.listdir(root_directory):
    language_path = os.path.join(root_directory, lang_folder)
    if not os.path.isdir(language_path): 
        continue 

    # subtask-1-annotations.txt
    file_path = os.path.join(language_path, subtask1)
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
            data = [line.strip().split('\t') for line in lines]
            max_cols = max(len(row) for row in data)
            column_names = ['File', 'Entity', 'Start', 'End'] + [f'Label{i}' for i in range(1, max_cols - 3)]
            subtask1_df = pd.DataFrame(data, columns=column_names)

            output_file = os.path.join(output_directory, f'subtask1_{lang_folder}.csv')
            subtask1_df.to_csv(output_file, index=False)
            print(f"Saved: {output_file}")

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    # subtask-2-annotations.txt
    file_path = os.path.join(language_path, subtask2)
    if os.path.exists(file_path):
        try:
            subtask2_df = pd.read_csv(file_path, sep='\t', header=None, engine='python')
            output_file = os.path.join(output_directory, f'subtask2_{lang_folder}.csv')
            
            subtask2_df.to_csv(output_file, index=False)
            print(f"Saved: {output_file}")

        except Exception as e:
            print(f"Error processing {file_path}: {e}")


Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST/subtask1_RU.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST/subtask2_RU.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST/subtask1_PT.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST/subtask2_PT.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST/subtask1_BG.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST/subtask2_BG.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST/subtask1_HI.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST/subtask2_HI.csv
Saved: /Users/aahepburn/Development/Multi-task-Propaganda-Detection-MSc-Thesis-2025-AH/Data/TEST