In [1]:
! pip install ucimlrepo



In [2]:
from ucimlrepo import fetch_ucirepo

# Pull dataset id=19 from the UCI repository
car_evaluation = fetch_ucirepo(id=19)

# Unpack the feature and target frames from the fetched bundle
X, y = car_evaluation.data.features, car_evaluation.data.targets

In [None]:
# Print the dataset metadata first, then the variable information
for section in (car_evaluation.metadata, car_evaluation.variables):
    print(section)

In [13]:
# Directory that receives the exported CSV archive. The file name is appended
# by plain string concatenation when saving, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path — confirm it exists before running.
output_loc = "/repos/smote_msfb/public_datasets/careval/"

In [3]:
# Preview the first three rows of the feature frame
X.head(3)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high


In [8]:
import pandas as pd
import numpy as np

# Continuous features are taken to be every column of X with a numeric dtype
numeric_view = X.select_dtypes(include="number")
continuous_cols = numeric_view.columns.tolist()

In [9]:
# Step 1: Binarize continuous columns at the 65th percentile.
# Work on a copy so the fetched frame X is never mutated: the cell then gives
# the same result on every re-run and cannot accumulate extra '*_bin' columns.
X_binned = X.copy()
for col in continuous_cols:
    cutoff = X_binned[col].quantile(0.65)
    # Flag is 1 when the value is strictly above the column's cutoff, else 0
    X_binned[col + '_bin'] = (X_binned[col] > cutoff).astype(int)
# NOTE(review): the raw numeric columns are kept next to their '*_bin' versions
# (same as before this change) — confirm whether they should be dropped instead.

# Step 2: One-hot encode categorical variables (excluding binarized columns).
# dtype=int pins the indicators to 0/1; otherwise recent pandas versions emit
# booleans and the exported CSV would contain True/False.
categorical_cols = X_binned.select_dtypes(include=['object', 'category']).columns.tolist()
X_encoded = pd.get_dummies(X_binned, columns=categorical_cols, drop_first=False, dtype=int)

# Step 3: Rename all columns positionally as f_0, f_1, ...
X_encoded.columns = [f'f_{i}' for i in range(X_encoded.shape[1])]

In [10]:
# Inspect the first rows of the encoded feature frame
X_encoded.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20
0,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
1,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,0,1
2,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,1,1,0,0
3,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
4,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,1


In [6]:
# Tally the rows per class label to see how imbalanced the target is
y.value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
dtype: int64

In [11]:
# y is expected to expose a 'class' column; X_encoded is the processed feature frame

# Binary target: 1 when the class label is 'good' or 'vgood', 0 for everything else
positive_labels = ['good', 'vgood']
is_positive = y['class'].isin(positive_labels)
target = is_positive.astype(int).rename('target')

# Place features and target side by side on fresh positional indexes, target last
final_df = pd.concat(
    [X_encoded.reset_index(drop=True), target.reset_index(drop=True)],
    axis=1,
)

In [12]:
# Check the first rows of the combined frame; 'target' is concatenated after the features
final_df.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,target
0,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,0
1,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,0,1,0
2,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,1,0,0,0
3,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
4,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,0


In [16]:
# Write the combined frame, without the index, as a zip archive holding a single CSV member
archive_path = output_loc + "final_data.csv.zip"
zip_options = {'method': 'zip', 'archive_name': 'final_data.csv'}
final_df.to_csv(archive_path, index=False, compression=zip_options)