In [None]:
import pandas as pd
from sklearn.utils import resample
from google.colab import files
import io

# 1. Upload the per-language CSV files through the Colab upload widget.
# `uploaded` maps each stored filename to that file's raw content, which is
# wrapped in a BytesIO below so pandas can parse it.
print("Upload 8 CSV files (amh.csv, arb.csv, etc.)")
uploaded = files.upload()

# 2. Parse every uploaded CSV into its own DataFrame.
# The extension check is case-insensitive so a file such as "AMH.CSV" is not
# silently ignored, and every skipped file is reported instead of vanishing.
dfs = []
for filename, raw_bytes in uploaded.items():
    if not filename.lower().endswith('.csv'):
        print(f"Skipping {filename} (not a .csv file).")
        continue

    print(f"Reading {filename}...")
    try:
        # Wrap the in-memory content in a file-like object for read_csv
        df = pd.read_csv(io.BytesIO(raw_bytes))
    except pd.errors.EmptyDataError:
        # A zero-byte upload should not abort the whole run
        print(f"Skipping {filename} (file is empty).")
        continue
    dfs.append(df)

# Combine the per-file frames, balance the binary target label by random
# oversampling of the smaller class, then save and download the result.
if not dfs:
    print("No CSV files were found! Please run the cell again and upload your files.")
else:
    full_df = pd.concat(dfs, ignore_index=True)
    print(f"Successfully combined {len(full_df)} rows from {len(dfs)} files.")

    # 3. Define the target label to balance
    # We identified 'dehumanization' as the minority class previously
    target_label = 'dehumanization'

    if target_label not in full_df.columns:
        print(f"Error: The column '{target_label}' was not found in your data.")
    else:
        # 4. Separate the two classes; label 0 is expected to be the
        # majority and label 1 the minority.
        df_majority = full_df[full_df[target_label] == 0]
        df_minority = full_df[full_df[target_label] == 1]

        # Rows whose label is neither 0 nor 1 (e.g. a missing value) land in
        # neither subset and therefore never reach the output file. Report
        # them so the loss is visible rather than silent.
        n_unlabelled = len(full_df) - len(df_majority) - len(df_minority)
        if n_unlabelled:
            print(f"Warning: {n_unlabelled} rows have a '{target_label}' value "
                  "other than 0 or 1 and will be excluded.")

        # If label 1 turns out to be the larger class, swap the roles so that
        # the smaller class is always the one being oversampled; otherwise
        # resample() would shrink the larger class with replacement.
        if len(df_minority) > len(df_majority):
            df_majority, df_minority = df_minority, df_majority

        print(f"Original counts -> Majority: {len(df_majority)}, Minority: {len(df_minority)}")

        if df_minority.empty:
            # resample() cannot draw from an empty frame; fail with a clear
            # message instead of an opaque ValueError.
            print(f"Error: '{target_label}' needs rows for both classes "
                  "(0 and 1) to be oversampled.")
        else:
            # 5. Oversample the minority class
            # We sample with replacement to match the number of majority
            # samples. Already-balanced data is left untouched, because
            # bootstrapping it would only replace unique rows by duplicates.
            if len(df_minority) < len(df_majority):
                df_minority_upsampled = resample(df_minority,
                                                 replace=True,
                                                 n_samples=len(df_majority),
                                                 random_state=42)
            else:
                df_minority_upsampled = df_minority

            # 6. Combine back together
            df_upsampled = pd.concat([df_majority, df_minority_upsampled])

            # 7. Shuffle the dataset so labels are mixed
            df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

            print(f"New counts -> Majority: {len(df_majority)}, Minority (Upsampled): {len(df_minority_upsampled)}")
            print(f"Total rows: {len(df_upsampled)}")

            # NOTE(review): the output contains exact duplicates of minority
            # rows. If this file is later split into train/validation/test
            # sets, those duplicates can end up on both sides of the split;
            # presumably the split should happen before oversampling - confirm
            # against the downstream training code.

            # 8. Save and Download
            output_filename = 'oversampled_data.csv'
            df_upsampled.to_csv(output_filename, index=False)

            print(f"Downloading {output_filename}...")
            files.download(output_filename)

Upload your 8 CSV files (amh.csv, arb.csv, etc.)


Saving amh.csv to amh (1).csv
Saving arb.csv to arb (1).csv
Saving deu.csv to deu (1).csv
Saving eng.csv to eng (1).csv
Saving hau.csv to hau (1).csv
Saving spa.csv to spa (1).csv
Saving urd.csv to urd (1).csv
Saving zho.csv to zho (1).csv
Reading amh (1).csv...
Reading arb (1).csv...
Reading deu (1).csv...
Reading eng (1).csv...
Reading hau (1).csv...
Reading spa (1).csv...
Reading urd (1).csv...
Reading zho (1).csv...
Successfully combined 26653 rows from 8 files.
Original counts -> Majority: 22850, Minority: 3803
New counts -> Majority: 22850, Minority (Upsampled): 22850
Total rows: 45700
Downloading oversampled_data.csv...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>