# In [3]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import re

# Regular expression for the first-column value that marks the start of a new
# logical record: digits followed by ".0" (e.g. 66.0, 1.0, 123.0).
pattern = re.compile(r"^\d+\.0$")


def merge_continuation_rows(df, start_pattern=pattern):
    """Merge physical rows of *df* into logical records.

    A row whose first cell, rendered with ``str()``, fully matches
    *start_pattern* begins a new record. Every following row that does not
    match is a continuation: each of its non-missing cells is appended, after
    a single space, to the cell in the same column of the current record.

    Args:
        df: DataFrame holding the raw rows; columns are handled by position.
        start_pattern: Compiled regex identifying a record-start first cell.

    Returns:
        A list of records, each a list of cell values in column order. Cells
        that received continuation text are strings; untouched cells keep the
        value they had in the record's first row.
    """
    merged_rows = []
    current_row = None  # Record currently being assembled

    for _, row in df.iterrows():
        # tolist() gives positional access, independent of the column labels.
        values = row.tolist()
        starts_record = (
            bool(values)
            and start_pattern.fullmatch(str(values[0])) is not None
        )
        has_content = any(pd.notna(value) for value in values)

        # Content that shows up before the first record marker opens a record
        # of its own; otherwise there is nothing to attach it to and the
        # merge below would fail on the missing record.
        if starts_record or (current_row is None and has_content):
            if current_row is not None:
                merged_rows.append(current_row)
            current_row = values
            continue

        if current_row is None:
            # Entirely blank line ahead of any record: nothing to merge.
            continue

        for i, value in enumerate(values):
            if pd.isna(value):
                continue
            existing = current_row[i]
            # Only a genuinely empty cell is overwritten. A plain truthiness
            # test would treat NaN as content (yielding "nan ...") and would
            # discard legitimate falsy values such as 0.
            if pd.isna(existing) or (isinstance(existing, str) and existing == ""):
                current_row[i] = str(value)
            else:
                current_row[i] = f"{existing} {value}"

    # Flush the record that was still being assembled when the input ended.
    if current_row is not None:
        merged_rows.append(current_row)

    return merged_rows


# The file I/O runs only when this is executed as a script or notebook cell
# (where __name__ is "__main__"), so the merge logic can be imported without
# touching the filesystem. No wrapper function is used, which keeps df,
# fixed_rows and fixed_df available at module scope.
if __name__ == "__main__":
    # Load the CSV without assuming it's correctly structured
    df = pd.read_csv("src/data/pdf_out.csv", header=None, skip_blank_lines=False)

    # Rebuild the logical rows
    fixed_rows = merge_continuation_rows(df)

    # Convert the list of fixed rows into a DataFrame
    fixed_df = pd.DataFrame(fixed_rows)

    # Save as TSV (tab-separated) so merged text cells need no comma quoting
    fixed_df.to_csv("corrected_output.tsv", index=False, sep="\t")
