In [7]:
import fitz  # PyMuPDF
import pandas as pd
import glob
import os
import re

# Extract the auction table from each G-Sec press-release PDF, stack the
# per-file tables, and write one consolidated CSV.

# Define the path to your raw data folder
raw_data_path = '../data/raw/'
output_path = '../data/processed/consolidated_gsec_auctions.csv'

# Column names assigned to a parsed table; a table is accepted only when its
# width matches this list exactly.
TABLE_HEADERS = [
    "name_of_the_security", "date_of_issue", "date_of_maturity",
    "coupon_rate", "notified_amount", "competitive_bids", "non_competitive_bids"
]

# Cells are taken to be separated by runs of two or more whitespace
# characters. Compiled once because it is applied to every candidate line.
FIELD_SEPARATOR = re.compile(r'\s{2,}')


def read_first_page_lines(pdf_path):
    """Return the plain-text lines of the first page of *pdf_path*.

    The document is always closed, including when extraction raises.
    A PDF without pages raises from the ``doc[0]`` lookup; the caller's
    per-file error handler reports it.
    """
    doc = fitz.open(pdf_path)
    try:
        return doc[0].get_text("text").split('\n')
    finally:
        doc.close()


def find_table_bounds(lines):
    """Locate the slice of *lines* that holds the table body.

    The body starts on the line after a header line (one containing
    "Security", "Amount" and "Maturity") and ends just before the first
    subsequent line containing "auction will be conducted" (case-insensitive).
    If several header lines precede the end marker, the last one wins.

    Returns:
        ``(start, end)`` suitable for ``lines[start:end]``, or ``None`` when
        either marker is missing.
    """
    # NOTE(review): PyMuPDF's plain "text" mode often emits each table cell on
    # its own line. If these PDFs do that, the three header keywords never
    # share a line and no table is found - confirm against a sample PDF and
    # consider page.find_tables() or get_text("words") instead.
    start = -1
    for index, line in enumerate(lines):
        if "Security" in line and "Amount" in line and "Maturity" in line:
            start = index + 1
        # The end marker only counts once a header has been seen.
        if start != -1 and "auction will be conducted" in line.lower():
            return start, index
    return None


def parse_table_rows(table_lines):
    """Split each line into fields and keep lines with more than 3 fields."""
    candidates = (FIELD_SEPARATOR.split(line.strip()) for line in table_lines)
    return [fields for fields in candidates if len(fields) > 3]


def build_auction_frame(rows, source_name):
    """Turn parsed *rows* into a labelled DataFrame, or ``None`` on failure.

    Args:
        rows: list of field lists as produced by ``parse_table_rows``.
        source_name: value stored in the added ``source_file`` column.

    Returns:
        A DataFrame with ``TABLE_HEADERS`` plus ``source_file`` as columns,
        or ``None`` (after printing a warning) when there are no rows or the
        widest row does not have exactly ``len(TABLE_HEADERS)`` fields.
    """
    if not rows:
        print("--> Warning: No data rows were parsed from the located table.")
        return None

    df = pd.DataFrame(rows)

    # Ensure correct number of columns before assigning headers
    if len(df.columns) != len(TABLE_HEADERS):
        print(f"--> Warning: Column count mismatch. Found {len(df.columns)}, expected {len(TABLE_HEADERS)}.")
        return None

    df.columns = list(TABLE_HEADERS)
    df['source_file'] = source_name

    # pandas pads short rows with missing values; report it instead of
    # accepting possibly misaligned rows silently.
    short_rows = sum(len(fields) < len(TABLE_HEADERS) for fields in rows)
    if short_rows:
        print(f"--> Warning: {short_rows} row(s) had fewer than {len(TABLE_HEADERS)} fields and were padded with missing values.")

    print(f"Successfully extracted and parsed table with {len(df)} rows.")
    return df


def extract_auction_table(pdf_path):
    """Read *pdf_path* and return its auction table, or ``None`` if not found."""
    lines = read_first_page_lines(pdf_path)
    bounds = find_table_bounds(lines)
    if bounds is None:
        print("--> Warning: Could not locate table boundaries.")
        return None
    start, end = bounds
    rows = parse_table_rows(lines[start:end])
    return build_auction_frame(rows, os.path.basename(pdf_path))


# Sorted so the row order of the consolidated CSV is reproducible; glob makes
# no ordering guarantee.
press_release_files = sorted(
    glob.glob(os.path.join(raw_data_path, 'Press Release for auction of G-Sec-*.pdf'))
)

print(f"Found {len(press_release_files)} press release files to process.")

all_tables_as_df = []

# Loop through each PDF file
for file in press_release_files:
    print(f"\nProcessing file: {os.path.basename(file)}...")
    try:
        table = extract_auction_table(file)
    except Exception as e:
        # Deliberately broad: one unreadable PDF must not abort the batch.
        print(f"--> An error occurred: {e}")
        continue
    if table is not None:
        all_tables_as_df.append(table)

# --- CONSOLIDATE AND SAVE ---
if all_tables_as_df:
    consolidated_df = pd.concat(all_tables_as_df, ignore_index=True)

    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    consolidated_df.to_csv(output_path, index=False)

    print("\nSuccessfully consolidated data from all files into:")
    print(output_path)

    print("\n--- Preview of Consolidated Data ---")
    print(consolidated_df.head())
else:
    print("\nNo data was extracted to consolidate.")

Found 3 press release files to process.

Processing file: Press Release for auction of G-Sec- 11.08.2025.pdf...

Processing file: Press Release for auction of G-Sec- 21.07.2025.pdf...

Processing file: Press Release for auction of G-Sec- 28.07.2025.pdf...

No data was extracted to consolidate.
