In [1]:
import os
from itertools import combinations

import numpy as np
import pandas as pd

In [3]:
# Dataset selection: root directory of the CSV exports and the dataset sub-folder.
output_loc = "/domino/datasets/local/smote_msfb/microarray_data/microarray-data/csv/"
folder = "su"

# Input files live at <output_loc>/<folder>/<folder>_inputs.csv and
# <output_loc>/<folder>/<folder>_outputs.csv.
dataset_dir = os.path.join(output_loc, folder)
inputs_path = os.path.join(dataset_dir, f"{folder}_inputs.csv")
outputs_path = os.path.join(dataset_dir, f"{folder}_outputs.csv")

In [4]:
# Read files (no headers)
# X: one row per sample, one integer-labelled column per feature.
# y: first column of the outputs file, used as the class label of each sample.
X = pd.read_csv(inputs_path, header=None)
y = pd.read_csv(outputs_path, header=None).iloc[:, 0]

# Fail fast on a row-count mismatch: features and target are later combined with
# pd.concat(..., axis=1), which aligns on the index and would silently pad the
# shorter side with NaN instead of raising.
if len(X) != len(y):
    raise ValueError(
        f"Row count mismatch: {inputs_path} has {len(X)} rows, "
        f"{outputs_path} has {len(y)} rows."
    )

In [5]:
# 1. Feature processing
#
# Every feature becomes a 0/1 indicator:
#   * numeric columns   -> 1 when the value is strictly above the column's own
#                          BINARIZE_QUANTILE quantile, else 0 (NaN compares False -> 0)
#   * non-numeric cols  -> one-hot encoded, one 0/1 column per level

# Quantile used as the per-column binarization threshold.
BINARIZE_QUANTILE = 0.65

# Detect numeric vs. categorical
is_numeric = X.apply(lambda col: pd.api.types.is_numeric_dtype(col))
continuous_cols = X.columns[is_numeric]
categorical_cols = X.columns[~is_numeric]

# a) Binarize continuous
# One vectorized quantile + comparison instead of a Python loop over thousands of
# columns. The guard avoids calling quantile() on a frame with no columns.
X_bin = X.copy()
if len(continuous_cols) > 0:
    continuous = X[continuous_cols]
    # Series of cutoffs indexed by column label (numeric_only=False: every selected
    # column is already numeric, so nothing may be silently dropped).
    cutoffs = continuous.quantile(BINARIZE_QUANTILE, numeric_only=False)
    # axis="columns" matches each cutoff to the column with the same label.
    X_bin[continuous_cols] = continuous.gt(cutoffs, axis="columns").astype(int)

# b) One-hot encode categorical
if len(categorical_cols) > 0:
    # dtype=int keeps the dummies as 0/1 like the binarized columns; without it,
    # pandas >= 2.0 returns bool columns that would be written as True/False.
    X_bin = pd.get_dummies(X_bin, columns=categorical_cols, drop_first=False, dtype=int)
    X_bin = X_bin.reset_index(drop=True)

In [6]:
# c) Rename columns as f_*
# Positional names f_0 ... f_{n-1} replace whatever labels the binarization and
# one-hot encoding steps left behind.
X_bin.columns = ["f_" + str(position) for position in range(len(X_bin.columns))]

In [7]:
# Preview the first three rows of the processed feature frame.
X_bin.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_5555,f_5556,f_5557,f_5558,f_5559,f_5560,f_5561,f_5562,f_5563,f_5564
0,1,1,0,1,0,0,1,1,1,0,...,0,0,1,1,1,0,1,1,0,0
1,1,0,1,1,0,0,1,0,1,0,...,0,1,0,0,1,1,0,1,0,0
2,1,0,0,0,0,0,1,0,1,1,...,1,1,1,1,1,0,0,1,0,0


In [8]:
# 2. Response processing

# Show how many samples fall into each response level, ordered by level.
print("Unique value counts in response:", y.value_counts().sort_index(), sep="\n")

# Initial state for the binary-split search over groupings of response levels.
# NOTE(review): the original comment here described a "10%-45% minority rule",
# but the search cell accepts 0.10 <= ratio <= 0.25 -- confirm the intended bound.
unique_classes = sorted(y.unique())
best_split, best_ratio = None, 0

Unique value counts in response:
1.0    25
2.0    26
3.0    28
4.0    23
Name: 0, dtype: int64


In [9]:
# Try all combinations of unique levels as minority
#
# Every non-empty proper subset of the response levels is tried as the "minority"
# (positive) class; a subset qualifies when the share of samples it covers lies in
# [MINORITY_RATIO_MIN, MINORITY_RATIO_MAX].
MINORITY_RATIO_MIN = 0.10
MINORITY_RATIO_MAX = 0.25

# Reset the search state here so that re-running this cell on its own can never
# report a split left over from an earlier run.
best_split = None
best_ratio = 0

for r in range(1, len(unique_classes)):
    for minority_classes in combinations(unique_classes, r):
        minority_mask = y.isin(minority_classes)
        ratio = minority_mask.sum() / len(minority_mask)
        if MINORITY_RATIO_MIN <= ratio <= MINORITY_RATIO_MAX:
            # NOTE(review): the last qualifying grouping wins; ratios are not
            # compared against best_ratio. Kept as-is so the chosen split does not
            # change -- confirm whether a "best" criterion was intended.
            best_split = minority_classes
            best_ratio = ratio

if best_split is None:
    # ValueError subclasses Exception, so existing `except Exception` handlers still work.
    raise ValueError("No class grouping found to achieve the desired minority class ratio.")

print(f"\nGrouping classes {best_split} as minority. Minority ratio: {best_ratio:.2%}")


Grouping classes (4.0,) as minority. Minority ratio: 22.55%


In [10]:
# Binary response: 1 for samples whose level is in the chosen minority grouping,
# 0 for all others; named "target" so it becomes that column when combined.
minority_mask = y.isin(best_split)
target = minority_mask.astype(int)
target.name = "target"

In [11]:
# 3. Combine and save
# Column-wise concatenation: feature columns first, the binary target last.
final_df = pd.concat(objs=[X_bin, target], axis="columns")

In [12]:
# (rows, columns) of the combined frame: the feature columns plus the target column.
final_df.shape

(102, 5566)

In [13]:
# Preview the first three rows of the combined frame, including the target column.
final_df.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_5556,f_5557,f_5558,f_5559,f_5560,f_5561,f_5562,f_5563,f_5564,target
0,1,1,0,1,0,0,1,1,1,0,...,0,1,1,1,0,1,1,0,0,0
1,1,0,1,1,0,0,1,0,1,0,...,1,0,0,1,1,0,1,0,0,0
2,1,0,0,0,0,0,1,0,1,1,...,1,1,1,1,0,0,1,0,0,0


In [14]:
# Write the combined frame into the dataset folder as a zip archive whose single
# member is final_data.csv; the row index is not written.
# The path is built with os.path.join, like the input paths, rather than by string
# concatenation, which produced a doubled separator ("csv//su") because output_loc
# already ends in "/".
final_path = os.path.join(output_loc, folder, "final_data.csv.zip")
final_df.to_csv(
    final_path,
    index=False,
    compression={"method": "zip", "archive_name": "final_data.csv"},
)