In [2]:
import os
import re
from tqdm import tqdm

# Input locations of the original corpus
train_folder = "../data/dataset_originale/training/"
test_folder = "../data/dataset_originale/test/"

# Output locations for the extracted texts
base_output_dir = "../data/texts/"
train_output = os.path.join(base_output_dir, "training_texts")
test_output = os.path.join(base_output_dir, "test_texts")

# Genres expected in the corpus
genres = ["children", "diary", "journalism", "twitter", "youtube"]

# Make sure one sub-directory per genre exists under each output root
for genre in genres:
    for output_dir in (train_output, test_output):
        genre_dir = os.path.join(output_dir, genre)
        os.makedirs(genre_dir, exist_ok=True)

# Pattern capturing (id, genre, gender, body) of every <doc> block;
# DOTALL lets the body span multiple lines.
doc_pattern = re.compile(
    r'<doc id="(\d+)" genre="(.*?)" gender="(.*?)">(.*?)</doc>',
    flags=re.DOTALL,
)

def extract_and_save_texts(input_folder: str, output_folder: str, dataset_type: str) -> None:
    """Split every ``.txt`` file in *input_folder* into one file per ``<doc>`` block.

    Each input file is read as UTF-8 and scanned with the module-level
    ``doc_pattern``. Documents whose genre is not listed in the module-level
    ``genres`` are skipped. Every other document is written (UTF-8, newlines
    replaced by spaces) to
    ``<output_folder>/<genre>/<dataset_type>#<doc_id>#<genre>#<gender>.txt``,
    where the gender label is ``M``, ``F`` or ``unknown``. A document that maps
    to an already existing file name overwrites it.

    Args:
        input_folder: Directory containing the source ``.txt`` files.
        output_folder: Root directory receiving one sub-directory per genre.
        dataset_type: Label shown in the progress bar and used as file-name prefix.
    """
    # Create the genre sub-directories here so the function also works for an
    # output folder that was not prepared beforehand.
    for genre_name in genres:
        os.makedirs(os.path.join(output_folder, genre_name), exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting keeps runs
    # reproducible (relevant when two documents map to the same file name).
    files = sorted(f for f in os.listdir(input_folder) if f.endswith(".txt"))

    for file_name in tqdm(files, desc=f"Processing {dataset_type}"):
        file_path = os.path.join(input_folder, file_name)

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        for match in doc_pattern.finditer(content):
            doc_id, genre, gender, text = match.groups()

            if genre not in genres:
                continue  # Skip unexpected genres before doing any work on the text

            # Flatten the document body onto a single line
            text = text.strip().replace("\n", " ")

            # Build the output path; any gender other than M/F is labelled "unknown"
            gender_label = gender if gender in ("M", "F") else "unknown"
            output_path = os.path.join(output_folder, genre)
            filename = f"{dataset_type}#{doc_id}#{genre}#{gender_label}.txt"

            with open(os.path.join(output_path, filename), "w", encoding="utf-8") as out_file:
                out_file.write(text)

# Run the extraction for both splits when executed as a script
if __name__ == "__main__":
    extract_and_save_texts(train_folder, train_output, "training")
    extract_and_save_texts(test_folder, test_output, "test")
    # Report the directory actually written to instead of a hard-coded path
    # that had drifted away from base_output_dir.
    print(f"✅ Testi estratti e salvati in {base_output_dir}")

Processing training: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:20<00:00,  4.05s/it]
Processing test: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:24<00:00,  4.97s/it]

✅ Testi estratti e salvati in ../data/texts/



