In [None]:
import pandas as pd

# Load new_processed_data.csv into `df`, the frame that the cells below modify.
df = pd.read_csv("new_processed_data.csv")


In [None]:
# Relabel every "MRI-Liver" procedure as "CT-Liver", then mark all "CT-Liver"
# rows (freshly relabelled and pre-existing alike) with Contrast = True.
df["Procedure"] = df["Procedure"].replace(to_replace="MRI-Liver", value="CT-Liver")
is_ct_liver = df["Procedure"].eq("CT-Liver")
df.loc[is_ct_liver, "Contrast"] = True

In [None]:

# Eliminate the "CT-Brain Helical" procedure: each such row is overwritten with
# the Procedure/Condition/Gender/Age/Contrast values of a randomly drawn row
# whose procedure is one of the allowed alternatives.
reassign_procedures = ["CT-Thorax", "X-Ray Thorax", "CT-Liver", "X-Ray LWK"]
copied_fields = ("Procedure", "Condition", "Gender", "Age", "Contrast")

ct_brain_helical_idx = df.loc[df["Procedure"].eq("CT-Brain Helical")].index
valid_pool = df.loc[df["Procedure"].isin(reassign_procedures)]

# The donor pool is selected once, before any row is rewritten. The position of
# the target row in the index list is used as the random seed, so every draw is
# reproducible and each target uses a different seed.
for seed, target_idx in enumerate(ct_brain_helical_idx):
    donor = valid_pool.sample(n=1, random_state=seed).iloc[0]
    for field in copied_fields:
        df.loc[target_idx, field] = donor[field]


In [None]:
# Thin out the rarer conditions: for each listed condition, randomly drop at
# most three of its rows (fewer if fewer exist). The dropped rows are kept in
# `removed_data` and counted in `total_removed`.
conditions_to_remove = ["Uterine Fibroids", "Foot Injury","Prostate Check"]
removed_data = []
total_removed = 0

for condition in conditions_to_remove:
    matching_rows = df.loc[df["Condition"] == condition]
    n_drop = min(3, len(matching_rows))
    dropped = matching_rows.sample(n=n_drop, random_state=42)
    removed_data.append(dropped)
    total_removed += len(dropped)
    df.drop(index=dropped.index, inplace=True)

In [None]:
# Top the dataset back up by `total_removed` rows: sample that many rows whose
# Condition is one of the common conditions below and append them to `df`.
# NOTE(review): the appended rows are copies of rows already in `df`, not the
# rows collected in `removed_data` — confirm duplicating records is intended.
# NOTE(review): "CT-Liver Tear" looks like a procedure name fused with a
# condition; `isin` silently ignores values that never occur — confirm the label.
redistribute_conditions = ["Metastasis", "Thoracic Tumor", "Thoracic Injury", "ACL Tear", "CT-Liver Tear"]
available_redistribution = df[df["Condition"].isin(redistribute_conditions)]

if total_removed > 0 and available_redistribution.empty:
    raise ValueError(
        "No rows match redistribute_conditions; cannot sample replacement rows."
    )

# Sampling without replacement raises ValueError when n exceeds the pool size,
# so fall back to sampling with replacement only in that case. When the pool is
# large enough the draw is identical to the default (replace=False) behaviour.
needs_replacement = total_removed > len(available_redistribution)
replacements = available_redistribution.sample(
    n=total_removed, replace=needs_replacement, random_state=1
)
# ignore_index=True renumbers the rows so the combined frame has unique labels.
df = pd.concat([df, replacements], ignore_index=True)

# Drop the first two MRI-Uterus rows (only when at least two exist), then split
# the remaining ones by position: the first half gets Contrast = True and the
# rest Contrast = False. With an odd count the extra row lands in the False group.
uterus_rows = df.loc[df["Procedure"].eq("MRI-Uterus")].index
if len(uterus_rows) >= 2:
    df.drop(index=uterus_rows[:2], inplace=True)

# Re-select after the drop so the split only covers rows that still exist.
mri_uterus_idx = df.loc[df["Procedure"].eq("MRI-Uterus")].index
half = len(mri_uterus_idx) // 2
with_contrast, without_contrast = mri_uterus_idx[:half], mri_uterus_idx[half:]
df.loc[with_contrast, "Contrast"] = True
df.loc[without_contrast, "Contrast"] = False

In [None]:
# Use the first non-null Condition and Age found for each (Procedure, Gender)
# pair as that pair's reference values, then overwrite both fields on every row
# whose Condition differs from the reference or whose Age is outside 0-120.
# NOTE(review): a row with a valid Age still has its Age replaced when only its
# Condition differs from the reference — confirm that is intended.
pair_key = ["Procedure", "Gender"]
valid_pairs = df.groupby(pair_key)[["Condition", "Age"]].first().reset_index()
valid_lookup = valid_pairs.set_index(pair_key).T.to_dict()

for row_label, record in df.iterrows():
    reference = valid_lookup.get((record["Procedure"], record["Gender"]))
    if reference is None:
        # No reference exists for this pair; leave the row untouched.
        continue
    # Rows that already match the reference condition and have a plausible age
    # are kept as they are.
    if record["Condition"] == reference["Condition"] and 0 <= record["Age"] <= 120:
        continue
    df.at[row_label, "Condition"] = reference["Condition"]
    df.at[row_label, "Age"] = reference["Age"]

# Write the cleaned frame to final_cleaned_data.csv; index=False keeps the
# row index out of the file.
df.to_csv("final_cleaned_data.csv", index=False)

In [23]:
# Sanity check: print how many rows the cleaned frame holds for each Gender value.
print(df["Gender"].value_counts())

Gender
Male      40
Female    38
Name: count, dtype: int64
