In [19]:
# =============================================================================
#  Python Script for Pre-processing English Central Bank Texts
# =============================================================================

# --- Environment Setup and Library Imports ---
import fitz  # PyMuPDF
import pandas as pd
import re
from pathlib import Path
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from typing import List, Dict, Set
import nltk
nltk.download('punkt')
import nltk
nltk.download('stopwords')
import nltk
nltk.download('wordnet')
import nltk
nltk.download('omw-1.4')
import nltk
nltk.download('averaged_perceptron_tagger')
import nltk
nltk.download('names')
import nltk
nltk.download('popular')
import nltk
nltk.download('punkt_tab')
# --- Download NLTK Data (Robust Method) ---
# This ensures all required packages are present before the script runs.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK 'punkt' package...")
    nltk.download('punkt')
try:
    stopwords.words('english')
except LookupError:
    print("Downloading NLTK 'stopwords' package...")
    nltk.download('stopwords')

# --- 1. Configuration and Constants ---

DOMAIN_SPECIFIC_STOPWORDS: Set[str] = {
    'bank', 'japan', 'committee', 'policy', 'rate', 'fomc', 'federal',
    'meeting', 'statement', 'economic', 'financial', 'market'
}
MIN_TOKEN_LENGTH: int = 3

# --- 2. Core Processing Functions ---

def extract_text_from_pdfs(pdf_directory: str) -> pd.DataFrame:
    # (This function remains the same)
    pdf_path = Path(pdf_directory)
    if not pdf_path.is_dir():
        raise FileNotFoundError(f"Directory does not exist: {pdf_directory}")
    documents: List[Dict[str, str]] = []
    for pdf_file in sorted(pdf_path.glob("*.pdf")):
        try:
            with fitz.open(pdf_file) as doc:
                raw_text = "".join(page.get_text("text") for page in doc)
                if raw_text.strip():
                    documents.append({"filename": pdf_file.name, "raw_text": raw_text})
        except Exception as e:
            print(f"Could not process file {pdf_file.name}: {e}")
    if not documents:
        print(f"Warning: No text extracted from PDFs in {pdf_directory}.")
        return pd.DataFrame(columns=["filename", "raw_text"])
    df = pd.DataFrame(documents)
    print(f"Successfully extracted text from {len(df)} PDF documents.")
    return df

def create_english_preprocessor():
    # (This function remains the same)
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    stemmed_domain_stopwords = {stemmer.stem(word) for word in DOMAIN_SPECIFIC_STOPWORDS}
    stop_words.update(stemmed_domain_stopwords)
    def preprocess_document(text: str) -> List[str]:
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        tokens = word_tokenize(text)
        processed_tokens = [
            stemmer.stem(token) for token in tokens 
            if token not in stop_words and len(token) >= MIN_TOKEN_LENGTH
        ]
        return processed_tokens
    return preprocess_document

# --- 3. Main Execution Pipeline ---

def main_pipeline(pdf_directory: str, output_csv_path: str = "cleaned_english_corpus.csv"):
    print("--- Starting English Text Pre-processing Pipeline ---")

    # Step 1: Extract text from PDFs
    df = extract_text_from_pdfs(pdf_directory)
    if df.empty:
        print("Pipeline halted as no data was extracted.")
        return

    # Step 2: Create the preprocessor function
    english_preprocessor = create_english_preprocessor()

    # Step 3: Apply the preprocessing to all documents
    print("Processing all documents (Cleaning, Tokenizing, Stemming)...")
    
    # --- MODIFICATION START ---
    # This block will catch the error and print the full message
    try:
        df['cleaned_tokens'] = df['raw_text'].apply(english_preprocessor)
    except LookupError as e:
        print("\n--- NLTK Error Caught ---")
        print("A specific NLTK resource is still missing.")
        print(f"FULL ERROR MESSAGE: {e}")
        print("---------------------------\n")
        # Stop the pipeline after getting the error
        return
    # --- MODIFICATION END ---

    # Step 4: Save results
    final_df = df[['filename', 'cleaned_tokens']]
    final_df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    print(f"--- Pipeline Complete. Cleaned corpus saved to {output_csv_path} ---")

    # Display a sample of the output
    print("\nSample of cleaned tokens from the first document:")
    if not final_df.empty and final_df.iloc[0]['cleaned_tokens']:
        print(final_df.iloc[0]['cleaned_tokens'][:20])
    else:
        print("No tokens were generated for the first document.")
# --- 4. Script Execution ---

if __name__ == '__main__':
    # (This section remains the same)
    PDF_SOURCE_DIRECTORY = "boj_statements_english"
    Path(PDF_SOURCE_DIRECTORY).mkdir(exist_ok=True)
    if not any(Path(PDF_SOURCE_DIRECTORY).iterdir()):
        print(f"Error: The directory '{PDF_SOURCE_DIRECTORY}' is empty.")
        print("Please add your Bank of Japan PDF files to this directory and run the script again.")
    else:
        main_pipeline(pdf_directory=PDF_SOURCE_DIRECTORY)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tnk20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tnk20\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tnk20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tnk20\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tnk20\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\tnk20\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data]

--- Starting English Text Pre-processing Pipeline ---
Successfully extracted text from 39 PDF documents.
Processing all documents (Cleaning, Tokenizing, Stemming)...
--- Pipeline Complete. Cleaned corpus saved to cleaned_english_corpus.csv ---

Sample of cleaned tokens from the first document:
['januari', 'monetari', 'polici', 'monetari', 'polici', 'meet', 'held', 'today', 'polici', 'board', 'decid', 'upon', 'follow', 'yield', 'curv', 'control', 'decid', 'major', 'vote', 'set']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tnk20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True