In [6]:
import pandas as pd
input_path = r'/cluster/project/sachan/pmlr/grounding-vlms/eval/valid_results/claude-3.5-haiku/TallyQA_results_processed.csv'


def process_csv(input_path, output_path):
    # Manual parsing to handle commas in raw_result field
    data = []
    with open(input_path, 'r', encoding='utf-8') as file:
        # Skip header line but save it
        header = next(file).strip()
        
        for line in file:
            line = line.strip()
            if not line:  # Skip empty lines
                continue
                
            # Split by comma, but only take first 2 fields
            parts = line.split(',', 2)
            
            # Ensure we have at least 3 parts
            while len(parts) < 3:
                parts.append('')
                
            # Create a row dict with the first two fields and everything else as raw_result
            row = {
                'idx': parts[0],
                'result': parts[1],
                'raw_result': parts[2]
            }
            data.append(row)
    
    # Rest of the function remains the same
    df = pd.DataFrame(data)
    
    # Apply the same processing logic
    def process_row(row):
        try:
            float(row['raw_result'])
            return row  # Third column is a number, keep as is
        except (ValueError, TypeError):
            sentence = str(row['raw_result']).lower()
            if " is no " in sentence or " not show " in sentence or " are no " in sentence or "not see" in sentence or "not a":
                return {
                    'idx': row['idx'],
                    'result': '0',
                    'raw_result': '0 sentence'
                }
            else:
                return row

    processed = df.apply(process_row, axis=1, result_type='expand')
    processed.to_csv(output_path, index=False)
    
    return processed
    

process_csv(input_path, input_path.replace('.csv', '.csv'))
print(f"Processed CSV saved to {input_path.replace('.csv', '_processed.csv')}")



Processed CSV saved to /cluster/project/sachan/pmlr/grounding-vlms/eval/valid_results/claude-3.5-haiku/TallyQA_results_processed_processed.csv


In [9]:
def remove_long_lines_from_csv(path, maxChar):
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Keep lines that are shorter than or equal to maxChar
    filtered_lines = [line for line in lines if len(line) <= maxChar]

    # Overwrite the file with filtered lines
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(filtered_lines)
remove_long_lines_from_csv(input_path, 20)

In [28]:
def double_check_csv(path):
    df = pd.read_csv(path)
    lastIndex = -1
    for i, series in df.iterrows():
        try:
            idx, result, raw_result = series.values
        except ValueError:
            print(f"Row {series} has an issue.{i}")
            continue
        if idx != lastIndex + 1:
            print(f"Row {series} has an issue.{i}; expected {lastIndex + 1} but got {idx}\n---\n")
            lastIndex -=1
        lastIndex += 1
        try:
            float(raw_result)
        except ValueError:
            if not "sentence" in raw_result:
                print(f"Row {series} has an issue.{i}; expected a number but got {raw_result}\n---\n")
            else:
                continue
        try:
            float(result)
        except ValueError:
            print(f"Row {series} has an issue.{i}; expected a number but got {result}\n---\n")
            continue
        
double_check_csv(input_path)