In [1]:
import pandas as pd
import glob
import os

# Folder that holds the per-release CSV extracts and receives the consolidated file.
processed_data_path = '../data/processed/'

# Words that must all appear in a row for it to be treated as the table header.
HEADER_KEYWORDS = ('Security', 'Amount', 'Maturity')

# File name of the consolidated output, written inside `processed_data_path`.
OUTPUT_FILENAME = 'consolidated_gsec_auctions_final.csv'


def find_header_row(raw_table, keywords=HEADER_KEYWORDS):
    """Return the position of the first row that mentions every keyword.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        Table read with ``header=None`` so that every line is a data row.
    keywords : iterable of str
        Substrings that must all occur somewhere in the row's cells.

    Returns
    -------
    int or None
        Zero-based row position of the header, or ``None`` if no row matches.
    """
    for position, cells in enumerate(raw_table.itertuples(index=False, name=None)):
        row_text = ' '.join(str(cell) for cell in cells)
        if all(keyword in row_text for keyword in keywords):
            return position
    return None


def clean_column_name(column):
    """Normalise one header label: single line, trimmed, lower-case, snake_case."""
    # NOTE(review): for headers such as 'Notified Amount (₹ crore)' the final
    # replacement does not match, because the space before '(' has already
    # become '_'. It is kept unchanged so existing column names stay stable --
    # confirm whether '_(₹_crore)' was the intended pattern.
    return (
        str(column)
        .replace('\n', ' ')
        .strip()
        .lower()
        .replace(' ', '_')
        .replace('-(₹_crore)', '_in_cr')
    )


def tidy_auction_table(table, source_name):
    """Drop empty rows, normalise headers and tag rows with their source file.

    Empty rows are removed *before* ``source_file`` is added; once that column
    is filled in, no row is entirely empty and ``dropna(how='all')`` would
    never remove anything. Headers are normalised per file so that files whose
    headers differ only in whitespace or case line up when concatenated.

    Parameters
    ----------
    table : pandas.DataFrame
        Table read with its real header row.
    source_name : str
        Value stored in the ``source_file`` column of every row.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``table`` itself is not modified.
    """
    return (
        table
        .dropna(how='all')
        .rename(columns=clean_column_name)
        .assign(source_file=source_name)
    )


def load_auction_table(csv_path):
    """Read one press-release CSV and return its tidy table.

    The file is first read without a header to locate the header row, then
    re-read from that row so pandas infers column types from the table only.

    Returns
    -------
    pandas.DataFrame or None
        The tidy table, or ``None`` when no header row could be found.
    """
    raw_table = pd.read_csv(csv_path, header=None)
    header_row = find_header_row(raw_table)
    if header_row is None:
        return None
    table = pd.read_csv(csv_path, header=header_row)
    return tidy_auction_table(table, os.path.basename(csv_path))


# glob returns files in arbitrary order; sort so the consolidated rows come
# out in the same order on every run and platform.
files = sorted(glob.glob(os.path.join(processed_data_path, 'Press Release*.csv')))

if not files:
    print(f"Error: No CSV files starting with 'Press Release' were found in the '{processed_data_path}' folder.")
else:
    print(f"Found {len(files)} files to process.")
    all_dfs = []
    for file in files:
        try:
            df = load_auction_table(file)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Could not read file: {file}. Error: {e}")
            continue

        if df is None:
            print(f"--> Warning: Could not find a suitable header row in {file}")
        else:
            all_dfs.append(df)
            print(f"Successfully read and processed table from {file}")

    # Consolidate and save if any data was successfully read
    if all_dfs:
        consolidated_df = pd.concat(all_dfs, ignore_index=True)

        print("\n--- Preview of Consolidated Data ---")
        print(consolidated_df.head())

        output_path = os.path.join(processed_data_path, OUTPUT_FILENAME)
        consolidated_df.to_csv(output_path, index=False)
        print(f"\nSuccessfully saved consolidated data to {output_path}")
    else:
        print("\nNo dataframes were created to consolidate.")

Found 3 files to process.
Successfully read and processed table from ../data/processed\Press Release for auction of G-Sec- 11.08.2025.csv
Successfully read and processed table from ../data/processed\Press Release for auction of G-Sec- 21.07.2025.csv
Successfully read and processed table from ../data/processed\Press Release for auction of G-Sec- 28.07.2025.csv

--- Preview of Consolidated Data ---
             name_of_the_security    date_of_issue      date_of_maturity  \
0  6.01% Government Security 2030  August 14, 2025         June 22, 2030   
1    New Government Security 2055  August 14, 2025  To be notified later   
2  7.35% Government Security 2029    July 22, 2025         June 22, 2029   
3  7.26% Government Security 2033    July 22, 2025         July 22, 2033   
4    New Government Security 2035    July 22, 2025  To be notified later   

            coupon_rate notified_amount_(₹_crore)  \
0                 6.01%                    15,000   
1  To be notified later              