In [None]:
import pandas as pd
from scipy.io import arff


In [None]:
# Location of the ARFF export, relative to the notebook's working directory.
ARFF_PATH = './census-income-full-nominal.arff'

# Keep the full loadarff result around; its first element holds the records
# that become the rows of the DataFrame.
arff_file = arff.loadarff(ARFF_PATH)
df = pd.DataFrame(arff_file[0])

In [None]:
# Show the column labels of the freshly loaded frame as a quick sanity check.
print(df.columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

def _decode_bytes(value):
    """Return UTF-8 text for a bytes value; pass any other value through unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


# Decode every bytes cell to str. DataFrame.applymap is deprecated since
# pandas 2.1 (renamed to DataFrame.map), so map column by column with
# Series.map instead, which behaves the same on old and new pandas.
df_cleaned = df.apply(lambda column: column.map(_decode_bytes))

# Replace '_' with '=' in the column names (literal match, not a regex)
df_cleaned.columns = df_cleaned.columns.str.replace('_', '=', regex=False)

# Map the 'class' column to 0 and 1: rows equal to NEGATIVE_CLASS_LABEL become
# 0, every other row becomes 1.
NEGATIVE_CLASS_LABEL = '--50000.'
is_negative = df_cleaned['class'] == NEGATIVE_CLASS_LABEL
if not is_negative.any():
    # Without this guard a label mismatch would silently turn the whole target
    # column into 1s.
    raise ValueError(
        f"No row has class label {NEGATIVE_CLASS_LABEL!r}; "
        f"labels present: {df_cleaned['class'].unique().tolist()}"
    )
df_cleaned['class'] = (~is_negative).astype('int64')

# Identify categorical columns excluding 'class'
# ('class' is numeric by now, so the difference() is only a safety net;
# note that Index.difference returns the names sorted).
categorical_columns = df_cleaned.select_dtypes(include=['object']).columns.difference(['class'])

# Initialize OneHotEncoder (dense output so it can go straight into a DataFrame)
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform the data
encoded_data = encoder.fit_transform(df_cleaned[categorical_columns])

# Create a DataFrame with the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns))

# Drop the original categorical columns and concatenate the encoded columns with the 'class' column.
# Both frames carry a fresh RangeIndex, so the column-wise concat lines rows up
# by position; the non-categorical columns (including 'class') stay in front.
df_encoded = df_cleaned.drop(columns=categorical_columns).reset_index(drop=True)
df_encoded = pd.concat([df_encoded, encoded_df], axis=1)

print(df_encoded.head())

In [None]:
# Count the distinct values in the last column of df_encoded, looking only at
# the first 1000 rows.
# NOTE(review): the encoded columns are concatenated to the right of the
# remaining original columns, so the last column is presumably a one-hot
# indicator rather than 'class' — confirm this is the column meant to be inspected.
df_encoded.iloc[:1000, -1].value_counts()

In [None]:
# Swap every remaining '_' in the column labels for '=' and show the result.
# Running this cell again is harmless: once no '_' is left the labels stay put.
relabelled_columns = df_encoded.columns.str.replace('_', '=')
df_encoded.columns = relabelled_columns
print(df_encoded.columns)

In [None]:

# Persist the encoded frame as CSV in the working directory, leaving out the
# row index.
ENCODED_CSV_PATH = 'census_encoded.csv'
df_encoded.to_csv(ENCODED_CSV_PATH, index=False)