In [2]:
# This block is for converting the provided CSV into a form more suitable for machine learning.

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

def clean_and_encode_csv(input_file, output_file, columns, new_column_names):
    """
    Select columns from a CSV, rename them, multi-hot encode the label column,
    and save the result to a new CSV.

    The label column is the second selected column (``new_column_names[1]``).
    Each of its cells holds zero or more label names separated by ``|``
    (commas are accepted as separators as well). A cell equal to "No Finding",
    equal to "0", or missing altogether means "no labels" and is encoded as an
    all-zero vector.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the CSV to write (written without the row index).
        columns: Names of the columns to keep from the input, in order.
        new_column_names: Replacement names for ``columns``; same length and
            order, with the label column in position 1.

    Returns:
        None. The label column of the written file contains one 0/1 vector per
        row with one position per distinct label, ordered by sorted label
        name. Vectors are stored as objects, so the CSV holds their NumPy
        text form (for example ``[0 1 0]``).

    Any error is reported with a printed message instead of being raised, so
    callers must not rely on an exception to detect a failed conversion.
    """
    # "0" stays in this list because this function has always used 0 as its
    # placeholder for "No Finding"; inputs that already contain it keep working.
    no_label_markers = ("No Finding", "0")

    try:
        # Read and clean the CSV
        df = pd.read_csv(input_file)
        df_cleaned = df[columns].copy()
        df_cleaned.columns = new_column_names

        # Address the label column through its *new* name rather than a
        # hard-coded 'labels', so any choice of new_column_names works.
        label_col = new_column_names[1]
        raw_labels = df_cleaned[label_col]
        label_text = raw_labels.astype(str)

        # Missing cells must not turn into a literal "nan" label.
        has_labels = raw_labels.notna() & ~label_text.isin(no_label_markers)
        label_text = label_text.where(has_labels, "").str.replace('|', ',', regex=False)

        # Encode labels: one indicator column per distinct label (sorted);
        # empty cells and empty tokens yield all-zero rows.
        indicators = label_text.str.get_dummies(sep=',')
        df_cleaned[label_col] = list(indicators.to_numpy(dtype=int))

        # Save the new DataFrame
        df_cleaned.to_csv(output_file, index=False)
        print(f"Encoded and cleaned CSV saved to {output_file}")

    except Exception as e:
        # Deliberate best-effort behaviour: report the problem and carry on.
        print(f"An error occurred: {e}")

# Example usage: the source columns to keep and the names they get in the output.
columns = ['Image Index', 'Finding Labels']
new_column_names = ['index', 'labels']

# Input and output locations, relative to the current working directory.
input_file = './sample_labels.csv'
output_file = './encoded_labels.csv'

clean_and_encode_csv(
    input_file=input_file,
    output_file=output_file,
    columns=columns,
    new_column_names=new_column_names,
)


Encoded and cleaned CSV saved to ./encoded_labels.csv
