In [None]:
! pip install ucimlrepo

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory that receives the exported dataset archive.
# NOTE(review): hard-coded absolute path — adjust it for the machine running the notebook.
output_loc = "/repos/smote_msfb/public_datasets/ecoli/"

In [3]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 39 in the UCI ML Repository
ecoli = fetch_ucirepo(id=39)

# Feature frame and label frame (both pandas DataFrames)
X, y = ecoli.data.features, ecoli.data.targets

In [4]:
# Inspect the dtype of every feature column.
X.dtypes

mcg     float64
gvh     float64
lip     float64
chg     float64
aac     float64
alm1    float64
alm2    float64
dtype: object

In [5]:
# Preview the first three rows of the raw feature frame.
X.head(3)

Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35
1,0.07,0.4,0.48,0.5,0.54,0.35,0.44
2,0.56,0.4,0.48,0.5,0.49,0.37,0.46


In [6]:
# Split the feature columns by dtype: numeric columns are thresholded,
# object/category columns are one-hot encoded.
continuous_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Quantile used as the per-column cut-off for binarization.
BINARIZE_QUANTILE = 0.65

# 1. Binarize continuous variables: 1 when the value is strictly above the
#    column's cut-off quantile, 0 otherwise. Work on a copy so X stays intact.
X_bin = X.copy()
for col in continuous_cols:
    cutoff = X[col].quantile(BINARIZE_QUANTILE)
    X_bin[col] = (X[col] > cutoff).astype(int)

# 2. One-hot encode categorical columns. dtype=int is pinned because the
#    default dummy dtype differs between pandas versions (uint8 before 2.0,
#    bool from 2.0 on); pinning keeps every feature a 0/1 integer.
X_bin = pd.get_dummies(X_bin, columns=categorical_cols, drop_first=False, dtype=int)

X_bin.head(3)

Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2
0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0


In [8]:
# Step 3: Replace the column labels with positional names f_0, f_1, ...
X_bin.columns = [f'f_{i}' for i, _ in enumerate(X_bin.columns)]

In [9]:
# Preview the first rows of the binarized feature frame.
X_bin.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6
0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0


In [10]:
# Count the rows per class label to see how the classes are distributed.
y.value_counts()

class
cp       143
im        77
pp        52
imU       35
om        20
omL        5
imL        2
imS        2
dtype: int64

In [13]:
# Binary response: 1 where the class label is 'imU', 0 for every other class
target = (
    y['class']
    .eq('imU')
    .astype(int)
    .rename('target')
)

# Print the class balance of the new target to confirm the imbalance
print(target.value_counts())

# Put features and target side by side; both indexes are reset so the rows
# are paired by position rather than by index label.
final_df = pd.concat(
    [part.reset_index(drop=True) for part in (X_bin, target)],
    axis=1,
)

0    301
1     35
Name: target, dtype: int64


In [14]:
# Preview the combined feature/target table.
final_df.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,target
0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0


In [15]:
# Export the feature/target table as a zip archive holding a single CSV member.
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs(output_loc, exist_ok=True)

# os.path.join keeps the path valid whether or not output_loc ends with a separator.
final_df.to_csv(
    os.path.join(output_loc, "final_data.csv.zip"),
    index=False,
    compression={'method': 'zip', 'archive_name': 'final_data.csv'},
)