## Train Model on 22k AllSides Data

Clean/Prep/Zip Merge AllSides Data

Create train/validation/test datasets (split separately within each label):
train_ratio = 0.8
val_ratio   = 0.1
test_ratio  = 0.1

In [4]:
import os
import random
import csv
import unicodedata
import re
from collections import defaultdict
from ftfy import fix_text

# -------------- Minimal Cleaning Function -------------- #
def minimal_clean(text: str) -> str:
    """
    Apply minimal, punctuation-preserving cleanup to a training document.

    Steps, in order:
      1. Repair encoding damage (mojibake) with ftfy's ``fix_text``.
      2. Apply Unicode NFKC normalization.
      3. Turn tab / newline / carriage return / vertical tab / form feed
         into spaces, then delete the remaining ASCII control characters.
      4. Collapse every run of whitespace into one space and strip the ends.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text as a single line.
    """
    # 1) Fix text encoding issues (mojibake)
    text = fix_text(text)

    # 2) Normalize unicode (NFKC) to handle half-width forms, etc.
    text = unicodedata.normalize("NFKC", text)

    # 3) Whitespace-like control characters separate words, so they must
    #    become spaces BEFORE the control-character purge below. The purge
    #    range includes \t, \n and \r; deleting them outright would glue
    #    "end\nStart" into "endStart".
    text = re.sub(r"[\t\n\r\x0b\x0c]", " ", text)
    text = re.sub(r"[\x00-\x1F\x7F]", "", text)

    # 4) Collapse multiple whitespace characters into a single space
    return re.sub(r"\s+", " ", text).strip()
# Root data folder and its three per-label sub-folders
DATA_DIR = "/Users/bryce/Desktop/INLS697/INLS697_proj/allsides_data"
LEFT_DIR, CENTER_DIR, RIGHT_DIR = (
    os.path.join(DATA_DIR, bias) for bias in ("left", "center", "right")
)

# Output CSV files (relative paths, resolved against the working directory)
TRAIN_CSV, VAL_CSV, TEST_CSV = "train.csv", "val.csv", "test.csv"

# Train / validation / test split ratios
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

def get_filepaths_and_labels(folder_path, label):
    """
    Pair every ``.txt`` entry found directly inside *folder_path* with *label*.

    Entries are matched by a case-sensitive ``.txt`` suffix and returned in
    the order ``os.listdir`` yields them; sub-folders are not searched.

    Returns:
        A list of ``(full_path, label)`` tuples.
    """
    return [
        (os.path.join(folder_path, entry), label)
        for entry in os.listdir(folder_path)
        if entry.endswith(".txt")
    ]

# Collect filepaths from each label folder
left_files   = get_filepaths_and_labels(LEFT_DIR, "left")
center_files = get_filepaths_and_labels(CENTER_DIR, "center")
right_files  = get_filepaths_and_labels(RIGHT_DIR, "right")

# Fixed seed on a private generator: the same set of files always yields the
# same train/val/test membership, and the global `random` state is untouched.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# Combine into one list. Sort first because os.listdir returns entries in
# arbitrary order; without a stable starting order the seed alone would not
# make the shuffles reproducible.
all_files = sorted(left_files + center_files + right_files)

# Shuffle to randomize
split_rng.shuffle(all_files)

# Group paths by label so the split below is label-stratified.
label_map = defaultdict(list)
for filepath, label in all_files:
    label_map[label].append(filepath)


def _load_examples(paths, label):
    """Read and clean each file in *paths*; return a list of (cleaned_text, label)."""
    examples = []
    for p in paths:
        # errors="ignore" drops undecodable bytes instead of raising
        with open(p, "r", encoding="utf-8", errors="ignore") as f:
            raw_text = f.read()
        examples.append((minimal_clean(raw_text), label))
    return examples


# Now do a split for each label
train_data = []
val_data   = []
test_data  = []

for label, paths in label_map.items():
    split_rng.shuffle(paths)
    total_count = len(paths)
    train_end  = int(total_count * train_ratio)
    val_end    = int(total_count * (train_ratio + val_ratio))

    # The test split takes whatever remains after train and validation,
    # so test_ratio is implied rather than read here.
    train_data.extend(_load_examples(paths[:train_end], label))
    val_data.extend(_load_examples(paths[train_end:val_end], label))
    test_data.extend(_load_examples(paths[val_end:], label))

# Shuffle each split again so examples are not grouped by label
split_rng.shuffle(train_data)
split_rng.shuffle(val_data)
split_rng.shuffle(test_data)

def write_to_csv(data, out_csv):
    """
    Write ``(text, label)`` pairs to *out_csv* as UTF-8 CSV.

    The file is created or overwritten and starts with a ``text,label``
    header row, followed by one row per pair in *data*.
    """
    with open(out_csv, "w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(["text", "label"])  # CSV header
        # Unpacking inside the generator keeps the "exactly two fields" check.
        out.writerows([text, label] for (text, label) in data)

# Finally, write each split to its CSV file (train, then val, then test)
for split_rows, csv_path in (
    (train_data, TRAIN_CSV),
    (val_data, VAL_CSV),
    (test_data, TEST_CSV),
):
    write_to_csv(split_rows, csv_path)

print("Done! Created train.csv, val.csv, test.csv (with minimal cleaning).")


Done! Created train.csv, val.csv, test.csv (with minimal cleaning).
