--- Data Extraction from PDF and Data Prep---

In [2]:
import fitz  # PyMuPDF
import pandas as pd
import re
import os
from datetime import datetime

In [3]:
# --- 1. CONFIGURATION ---
pdf_file_path = "/Users/berkeelibol/anaconda_projects/3cd054a8-6e0f-4baf-88d9-78b83253cb18/thoreau_complete_journals.pdf"
output_csv = "thoreau_journals_v18.csv"

MAX_WORD_COUNT = 250
MIN_WORD_COUNT = 6   

In [4]:
# --- 2. REGEX PATTERNS ---

# A. DATE SPLITTER
# Matches: "March 21", "Mar. 21", "April 5", "Apr 5", "Sept. 21"
# added [a-z]* back to allow full month names.
date_split_pattern = (
    r'(?:\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept?|Oct|Nov|Dec)' 
    r'[a-z]*'           # ALLOWS extra letters (e.g. 'uary' in January, 'ch' in March)
    r'\.?'              # Optional dot
    r'\s*'              # Optional space
    r'[\'’]?\s*'        # Optional apostrophe
    r'\d{1,2}'          # Digits (Day)
    r'(?:st|nd|rd|th)?\.?)' # Optional ordinal/dot
)

# B. EDITORIAL REMOVAL
# Cleans headers, editorial notes in brackets etc.
editorial_bracket_pattern = re.compile(r'\[.*?\]', re.DOTALL)
citation_pattern = re.compile(r'(?:Week,\s*p\.[^;]*;?|Riv\.[^.]*\.?|Vol\.\s+[IVX]+)', re.IGNORECASE)

# C. HEADER DATE REMOVER
# Headers included dates which caused unintended breaks in the text and sometimes did not match with the date of the text 
header_date_pattern = re.compile(
    r'\[' r'\s*' 
    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept?|Oct|Nov|Dec)[a-z]*' 
    r'\.?' r'\s*' r'\d{1,2}', 
    re.IGNORECASE
)

# D. NOISE CLEANING
time_marker_pattern = re.compile(r'\b(?:\d{1,2}\s*)?[AP]\.?\s*M\.?\s*[-—–]?\s*', re.IGNORECASE)

# --- 3. HELPER FUNCTIONS ---

def get_quadrant_sorted_text(page):
# Each page consisted of 4 pages which when read mixed the order of the pages
    """Sorts 4-up layout correctly (Top-Left -> Top-Right -> Bottom-Left -> Bottom-Right)."""
    blocks = page.get_text("blocks")
    if not blocks: return ""
    mid_x, mid_y = page.rect.width / 2, page.rect.height / 2
    tl, tr, bl, br = [], [], [], []
    for b in blocks:
        x0, y0 = b[0], b[1]
        if y0 < mid_y:
            if x0 < mid_x: tl.append(b)
            else: tr.append(b)
        else:
            if x0 < mid_x: bl.append(b)
            else: br.append(b)
    for q in [tl, tr, bl, br]: q.sort(key=lambda b: b[1])
    return "\n".join([b[4] for b in tl + tr + bl + br])

def clean_text_block(text):
#Cleans interfering text in headers/footers etc that identified before
    """Cleans text BEFORE splitting."""
    # 1. Remove Header Dates (e.g. "[March 21")
    text = header_date_pattern.sub(' ', text)
    
    # 2. Remove Editorial Brackets (e.g. "[Week, p. 34]")
    text = editorial_bracket_pattern.sub(' ', text)
    
    # 3. Remove Citation Noise (e.g. "Riv. 12.")
    text = citation_pattern.sub(' ', text)
    
    # 4. Remove All-Caps Titles
    lines = text.split('\n')
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped: continue
        # Filter: Long, All-Caps, No Lowercase
        if len(stripped) > 5 and not re.search(r'[a-z]', stripped): 
             continue 
        if re.match(r'^\d+$', stripped): continue # Page numbers
        clean_lines.append(line)
        
    return "\n".join(clean_lines)

def post_process_content(text):
    """Cleans the specific entry content."""
    text = time_marker_pattern.sub('', text) 
    text = re.sub(r'\d+', ' ', text)         
    text = re.sub(r'[\[\]]', ' ', text)      
    text = re.sub(r'¬\s*', '', text)         
    text = re.sub(r'(\w)-\s+(\w)', r'\1\2', text) 
    text = re.sub(r'\s+', ' ', text).strip() 
    return text

def parse_date_mm_dd(date_str):
#The dates throughout the tex included dates as text, converts text dates into numerical format
    """Converts 'March 21' -> '03-21'"""
    if not date_str: return None
    # Normalize
    clean_str = date_str.replace('.', ' ').replace("'", ' ').replace('’', ' ')
    clean_str = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', clean_str)
    parts = clean_str.split()
    if len(parts) < 2: return None
    
    # Handle full month names by taking just first 3 chars
    # "March" -> "Mar", "April" -> "Apr"
    month_str = parts[0][:3].title()
    day_str = re.sub(r'\D', '', parts[1]) 
    
    try:
        m = datetime.strptime(month_str, "%b").month
        d = int(day_str)
        return f"{m:02d}-{d:02d}"
    except:
        return None

def sentence_aware_chunking(text, limit):
#Chunks the text respecting to sentence completeness
    words = text.split()
    if len(words) <= limit: return [text]
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, curr, clen = [], [], 0
    for s in sentences:
        slen = len(s.split())
        if clen + slen > limit:
            if curr: chunks.append(" ".join(curr))
            curr, clen = [s], slen
        else:
            curr.append(s)
            clen += slen
    if curr: chunks.append(" ".join(curr))
    return chunks

# --- 4. MAIN EXECUTION ---

print(f"Reading single file: {pdf_file_path}")
master_data = []

if os.path.exists(pdf_file_path):
    try:
        doc = fitz.open(pdf_file_path)
        full_text = ""
        
        # A. EXTRACT
        for page in doc:
            full_text += get_quadrant_sorted_text(page) + "\n"
        
        # B. CLEAN
        cleaned_text = clean_text_block(full_text)
        
        # C. SPLIT 
        chunks = re.split(f'({date_split_pattern})', cleaned_text, flags=re.IGNORECASE)
        print(f"Found {len(chunks)//2} entries based on dates.")
        
        last_mm_dd = None 
        
        for i in range(1, len(chunks), 2):
            separator = chunks[i].strip()
            content = chunks[i+1] if i+1 < len(chunks) else ""
            
            # PARSE DATE
            mm_dd = parse_date_mm_dd(separator)
            if mm_dd:
                last_mm_dd = mm_dd
            else:
                mm_dd = last_mm_dd

            # CLEAN CONTENT
            final_content = post_process_content(content)
            
            # FILTER
            if len(final_content.split()) < MIN_WORD_COUNT: 
                continue
            
            # CHUNK
            final_chunks = sentence_aware_chunking(final_content, MAX_WORD_COUNT)
            
            for idx, chunk in enumerate(final_chunks):
                master_data.append({
                    'Date_MM_DD': mm_dd,
                    'Content': chunk,
                    'Vertical_Selftranscendence': None,
                    'Horizontal_Selftranscendence': None,
                    'Self_Actualization': None,
                    'Order': None,
                    'Well_Being': None,
                    'Valence_Pos': None,
                    'Valence_Neg': None
                })

    except Exception as e:
        print(f"Error: {e}")

    # --- 5. SAVE ---
    if master_data:
        df = pd.DataFrame(master_data)
        df.to_csv(output_csv, index=False)
        print(f"\nSUCCESS. Saved {len(df)} rows to: {output_csv}")
        print(df['Date_MM_DD'].sample(10).tolist())
    else:
        print("No data extracted.")

else:
    print("File not found.")

Reading single file: /Users/berkeelibol/anaconda_projects/3cd054a8-6e0f-4baf-88d9-78b83253cb18/thoreau_complete_journals.pdf
Found 4276 entries based on dates.

SUCCESS. Saved 9679 rows to: thoreau_journals_v18.csv
['12-20', '09-13', '07-10', '07-02', '08-14', '04-03', '08-06', '12-17', '10-17', '05-16']


At this stage I did some manual cleaning and prepping before moving on.

In [5]:
# --- CONFIGURATION ---
input_csv = "/Users/berkeelibol/anaconda_projects/3cd054a8-6e0f-4baf-88d9-78b83253cb18/thoreau_complete_clean.csv"
gold_standard_file = "thoreau_gold_standard_200.csv"
training_seed_file = "thoreau_training_seed_50.csv"

# --- MAIN EXECUTION ---
df = pd.read_csv(input_csv)
print(f"Loaded dataset with {len(df)} rows.")

# 1. THE GOLD STANDARD (Test Set)
test_set = df.sample(n=200, random_state=42)

# 2. THE TRAINING SEED -> Still 50 (Small start for Active Learning)
# We drop the test_set indices first so no data leaks from Test to Train
remaining_pool = df.drop(test_set.index)
train_seed = remaining_pool.sample(n=50, random_state=42)

# 3. SAVE
test_set.to_csv(gold_standard_file, index=False)
train_seed.to_csv(training_seed_file, index=False)

print("-" * 30)
print(f"SUCCESS!")
print(f"1. '{gold_standard_file}' created with {len(test_set)} rows.")
print(f"2. '{training_seed_file}' created with {len(train_seed)} rows.")
print("-" * 30)

Loaded dataset with 9639 rows.
------------------------------
SUCCESS!
1. 'thoreau_gold_standard_200.csv' created with 200 rows.
2. 'thoreau_training_seed_50.csv' created with 50 rows.
------------------------------


I created one initial training set and one gold standart set to test accuracy later on

In [None]:
!pip install setfit pandas numpy torch transformers

In [None]:
import pandas as pd
import numpy as np
from setfit import SetFitModel, SetFitTrainer
from datasets import Dataset
import torch

In [7]:
# --- CONFIGURATION ---
SEED_FILE = "seed_training_labeled.csv"
FULL_DATA_FILE = "thoreau_complete_clean.csv"
OUTPUT_NEXT_ROUND = "round_2_to_label.csv"
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" 

# The columns to train on
TARGET_COLS = [
    'Vertical_Selftranscendence', 
    'Horizontal_Selftranscendence', 
    'Self_Actualization', 
    'Order', 
    'Well_Being'
]

# Keywords to find missing Horizontal examples, it was the only category that had no examples in the training set so i added this to the logic.
HORIZONTAL_KEYWORDS = ["infinite", "universe", "eternal", "all", "connection", "whole", "boundless", "cosmos"]

def train_active_learning():
    print("--- 1. LOADING DATA ---")
    df_seed = pd.read_csv(SEED_FILE)
    df_full = pd.read_csv(FULL_DATA_FILE)
    
    # Filter out labeled rows from the full pool
    labeled_content = set(df_seed['Content'].tolist())
    df_unlabeled = df_full[~df_full['Content'].isin(labeled_content)].copy()
    
    print(f"Labeled Seeds: {len(df_seed)}")
    print(f"Unlabeled Pool: {len(df_unlabeled)}")

    # --- 2. CHECKING CLASS BALANCE ---
    trainable_cols = []
    skipped_cols = []
    
    for col in TARGET_COLS:
        # Check if we have at least one '1' for this category
        if df_seed[col].sum() > 0:
            trainable_cols.append(col)
        else:
            skipped_cols.append(col)
            
    print(f"\nTraining on: {trainable_cols}")
    if skipped_cols:
        print(f"(!) SKIPPING: {skipped_cols} (0 positive examples found)")

    # --- 3. TRAINING SETFIT ---
    
    # Create 'label' column as list of [0, 1, 0...] for the trainable columns
    df_seed['label'] = df_seed[trainable_cols].values.tolist()
    
    # Convert to Hugging Face Dataset 
    train_dataset = Dataset.from_pandas(df_seed)

    print("\n--- 2. TRAINING MODEL ---")
    # Load SetFit Model (Multi-label enabled by one-vs-rest strategy)
    model = SetFitModel.from_pretrained(
        MODEL_NAME,
        multi_target_strategy="one-vs-rest"
    )
    
    # Train
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_dataset,
        column_mapping={"Content": "text", "label": "label"},
        num_iterations=20, 
        batch_size=4,
        num_epochs=1
    )
    
    trainer.train()
    print("Training complete.")

    # --- 4. INFERENCE ON UNLABELED POOL ---
    print("\n--- 3. RUNNING INFERENCE ---")
    
    # Predict probabilities, returns a matrix of probs
    probs = model.predict_proba(df_unlabeled['Content'].tolist())
    
    # Convert to DataFrame
    df_probs = pd.DataFrame(probs, columns=trainable_cols, index=df_unlabeled.index)
    
    # --- 5. ACTIVE LEARNING STRATEGY ---
    print("\n--- 4. SELECTING NEXT BATCH ---")
    
    # A. UNCERTAINTY SAMPLING
    uncertainty_scores = 1 - (df_probs - 0.5).abs() * 2
    df_unlabeled['max_uncertainty'] = uncertainty_scores.max(axis=1)
    
    # Select Top 30 Most Uncertain
    uncertain_candidates = df_unlabeled.nlargest(30, 'max_uncertainty')
    uncertain_candidates = uncertain_candidates.copy()
    uncertain_candidates['Reason'] = 'Uncertainty'
    
    # B. KEYWORD DISCOVERY (RESTORED THIS BLOCK)
    # Looks for rows containing keywords but NOT already in the uncertain set
    keyword_mask = df_unlabeled['Content'].str.contains('|'.join(HORIZONTAL_KEYWORDS), case=False, na=False)
    keyword_candidates = df_unlabeled[keyword_mask].drop(uncertain_candidates.index, errors='ignore').head(20)
    keyword_candidates = keyword_candidates.copy()
    keyword_candidates['Reason'] = 'Keyword_Rescue'
    
    # Combine
    next_batch = pd.concat([uncertain_candidates, keyword_candidates])
    
    # Clean up for export
    for col in TARGET_COLS:
        next_batch[col] = "" # Empty columns for manual labeling
        
    output_cols = ['Year', 'Date_MM_DD', 'Content', 'Reason'] + TARGET_COLS
    next_batch = next_batch[output_cols]
    
    # --- 6. SAVE ---
    next_batch.to_csv(OUTPUT_NEXT_ROUND, index=False)
    print("-" * 30)
    print(f"Saved '{OUTPUT_NEXT_ROUND}' with {len(next_batch)} rows.")
    print("-" * 30)

if __name__ == "__main__":
    train_active_learning()

--- 1. LOADING DATA ---
Labeled Seeds: 50
Unlabeled Pool: 9589

Training on: ['Vertical_Selftranscendence', 'Self_Actualization', 'Order', 'Well_Being']
(!) SKIPPING: ['Horizontal_Selftranscendence'] (0 positive examples found)

--- 2. TRAINING MODEL ---


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Applying column mapping to the training dataset


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 2000
  Batch size = 4
  Num epochs = 1


Step,Training Loss
1,0.1802
50,0.1772
100,0.1385
150,0.0497
200,0.0441
250,0.0402
300,0.0507
350,0.0301
400,0.0459
450,0.0611


Training complete.

--- 3. RUNNING INFERENCE ---

--- 4. SELECTING NEXT BATCH ---
------------------------------
Saved 'round_2_to_label.csv' with 50 rows.
------------------------------


At this point I label the round 2 data manually and import it as a new dataset.

In [10]:
import pandas as pd
import numpy as np
from setfit import SetFitModel, SetFitTrainer
from datasets import Dataset
import torch
import os

# --- CONFIGURATION (UPDATE THIS EVERY ROUND) ---
LABELED_FILES = [
    "seed_training_labeled.csv",
    "round_2_labeled.csv"
]

FULL_DATA_FILE = "thoreau_complete_clean.csv"
OUTPUT_NEXT_ROUND = "round_3_to_label.csv" #for Round 3, i change the number each iteraiton
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" 

# Columns
TARGET_COLS = [
    'Vertical_Selftranscendence', 
    'Horizontal_Selftranscendence', 
    'Self_Actualization', 
    'Order', 
    'Well_Being'
]

def train_next_iteration():
    print(f"--- 1. MERGING {len(LABELED_FILES)} LABELED FILES ---")
    
    # Load and combine files
    df_list = []
    for f in LABELED_FILES:
        if os.path.exists(f):
            d = pd.read_csv(f)
            # Force content to be string to avoid any other type errors
            d['Content'] = d['Content'].astype(str)
            df_list.append(d)
            print(f"  -> Loaded {f} ({len(d)} rows)")
        else:
            print(f"  (!) ERROR: File not found: {f}")
            return

    df_train = pd.concat(df_list, ignore_index=True)
    df_full = pd.read_csv(FULL_DATA_FILE)

    #Fill NaN with 0
    for col in TARGET_COLS:
        if col in df_train.columns:
            df_train[col] = pd.to_numeric(df_train[col], errors='coerce').fillna(0).astype(int)
        else:
            df_train[col] = 0 
    
    # Filter out labeled rows
    labeled_content = set(df_train['Content'].tolist())
    df_unlabeled = df_full[~df_full['Content'].isin(labeled_content)].copy()
    
    print(f"Total Training Examples: {len(df_train)}")
    print(f"Unlabeled Pool Remaining: {len(df_unlabeled)}")

    # --- 2. CHECKING CLASS BALANCE ---
    trainable_cols = []
    skipped_cols = []
    
    for col in TARGET_COLS:
        # Ensure numeric type for summation
        count = pd.to_numeric(df_train[col], errors='coerce').fillna(0).sum()
        if count > 0:
            trainable_cols.append(col)
        else:
            skipped_cols.append(col)
            
    print(f"Training on: {trainable_cols}")
    if skipped_cols:
        print(f"(!) SKIPPING: {skipped_cols} (Still 0 positive examples)")

    # --- 3. TRAINING ---
    # Prepare labels
    df_train['label'] = df_train[trainable_cols].values.tolist()
    
    # Kept getting errors about type for the year coloumn, decided to exclude them from the training as they arent necessary anyway
    training_data_clean = df_train[['Content', 'label']].copy()
    train_dataset = Dataset.from_pandas(training_data_clean)

    print("\n--- 2. TRAINING MODEL (ITERATION) ---")
    model = SetFitModel.from_pretrained(MODEL_NAME, multi_target_strategy="one-vs-rest")
    
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_dataset,
        column_mapping={"Content": "text", "label": "label"},
        num_iterations=20, 
        batch_size=4,
        num_epochs=1
    )
    
    trainer.train()
    print("Training complete.")

    # --- 4. INFERENCE ---
    print("\n--- 3. RUNNING INFERENCE ---")
    probs = model.predict_proba(df_unlabeled['Content'].tolist())
    df_probs = pd.DataFrame(probs, columns=trainable_cols, index=df_unlabeled.index)
    
    # --- 5. SELECTION STRATEGY ---
    print("\n--- 4. SELECTING NEXT BATCH ---")
    
    # Uncertainty Formula
    uncertainty_scores = 1 - (df_probs - 0.5).abs() * 2
    df_unlabeled['max_uncertainty'] = uncertainty_scores.max(axis=1)
    
    # Select Top 50 Most Uncertain
    next_batch = df_unlabeled.nlargest(50, 'max_uncertainty').copy()
    next_batch['Reason'] = 'Uncertainty'
    
    # Format output
    for col in TARGET_COLS:
        next_batch[col] = ""
        
    output_cols = ['Year', 'Date_MM_DD', 'Content', 'Reason'] + TARGET_COLS
    
    # Ensure all columns exist before saving, as year, date etc. was excluded from training
    for c in output_cols:
        if c not in next_batch.columns:
            next_batch[c] = ""
            
    next_batch = next_batch[output_cols]
    
    next_batch.to_csv(OUTPUT_NEXT_ROUND, index=False)
    print("-" * 30)
    print(f"Saved '{OUTPUT_NEXT_ROUND}' with {len(next_batch)} rows.")
    print("-" * 30)

if __name__ == "__main__":
    train_next_iteration()

--- 1. MERGING 2 LABELED FILES ---
  -> Loaded seed_training_labeled.csv (50 rows)
  -> Loaded round_2_labeled.csv (50 rows)
Total Training Examples: 100
Unlabeled Pool Remaining: 9539
Training on: ['Vertical_Selftranscendence', 'Horizontal_Selftranscendence', 'Self_Actualization', 'Order', 'Well_Being']

--- 2. TRAINING MODEL (ITERATION) ---


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Applying column mapping to the training dataset


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 4000
  Batch size = 4
  Num epochs = 1


Step,Training Loss
1,0.3602
50,0.2569
100,0.2053
150,0.1639
200,0.1175
250,0.129
300,0.0696
350,0.0946
400,0.1066
450,0.0695


Training complete.

--- 3. RUNNING INFERENCE ---

--- 4. SELECTING NEXT BATCH ---
------------------------------
Saved 'round_3_to_label.csv' with 50 rows.
------------------------------


I repeat this process until satisfied with the outcome and rerun it one last time where I save the full dataframe.