In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


In [13]:

# Read the raw Adult Income CSV from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="adult income.csv")
# Preview the first five rows as the cell output.
df.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,native.country,income
0,17,Private,148522,11th,7.0,Never-married,occupation,Own-child,White,Male,United-States,<=50K
1,17,Private,93235,12th,8.0,Never-married,occupation,Own-child,White,Female,United-States,<=50K
2,17,Private,184924,9th,5.0,Never-married,occupation,Own-child,White,Male,United-States,<=50K
3,17,Private,116626,11th,7.0,Never-married,occupation,Own-child,White,Male,United-States,<=50K
4,17,Private,209949,11th,7.0,Never-married,occupation,Own-child,White,Female,United-States,<=50K


In [14]:
# Inspect the exact column labels; the feature lists defined further down
# must match these spellings character for character.
df.columns


Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'native.country', 'income'],
      dtype='object')

In [15]:
# Numeric columns to be standardised. Names must match df.columns exactly:
# the dataset spells this column 'education.num' (dot), not 'education-num'.
numerical_features = [
    'age',
    'fnlwgt',
    'education.num'
]


In [16]:
# Categorical columns to be one-hot encoded. Names must match df.columns
# exactly: the dataset uses dots ('marital.status', 'native.country'),
# not hyphens.
categorical_features = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country'
]


In [17]:
# Name of the label column that the features are meant to predict.
target = "income"


In [18]:
# Treat the '?' placeholder as a missing value, then drop every row that
# contains any missing value. Written as a chained expression that rebinds
# `df` instead of mutating it in place, so the cell is safe to re-run.
df = df.replace('?', np.nan).dropna()


In [19]:
# Learn the label vocabulary first, then overwrite the target column with
# the corresponding integer codes. `le` keeps the fitted mapping
# (le.classes_) so the codes can be decoded later.
le = LabelEncoder()
le.fit(df[target])
df[target] = le.transform(df[target])


In [21]:
# Separate the label column (y) from the predictor columns (X).
y = df[target]
X = df.drop(target, axis=1)


In [26]:

# Show the column labels as the cell's output value rather than via print().
df.columns


Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'native.country', 'income'],
      dtype='object')


In [34]:
# Feature groups, spelled exactly as the labels appear in df.columns.
# Numeric columns are standardised; categorical columns are one-hot encoded.
numerical_features = ["age", "fnlwgt", "education.num"]

categorical_features = [
    "workclass", "education", "marital.status", "occupation",
    "relationship", "race", "sex", "native.country",
]

# Label column to predict.
target = "income"


In [28]:
# Treat the '?' placeholder as a missing value, then drop every row that
# contains any missing value. Rebinding `df` (rather than inplace=True)
# keeps the cell re-runnable: once the frame holds no '?' or NaN, running
# this again leaves it unchanged.
df = df.replace('?', np.nan).dropna()


In [29]:
from sklearn.preprocessing import LabelEncoder

# Encode the income labels as integers only while the column still holds the
# raw labels. Re-fitting a new encoder on an already-encoded (numeric) column
# would overwrite `le.classes_` with the integer codes and lose the mapping
# back to the original label strings.
if not pd.api.types.is_numeric_dtype(df[target]):
    le = LabelEncoder()
    df[target] = le.fit_transform(df[target])


In [35]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing:
#   'num' -> standardise the numeric columns
#   'cat' -> dense one-hot encoding of the categorical columns; categories
#            not seen during fit are ignored at transform time
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            categorical_features,
        ),
    ],
)


In [36]:
# Split the cleaned frame into label (y) and predictors (X), then fit the
# preprocessor on all rows of X and produce the transformed feature matrix.
y = df[target]
X = df.drop(target, axis=1)

X_processed = preprocessor.fit_transform(X)


In [37]:
# Fits the preprocessor on X again and recomputes X_processed.
# NOTE(review): this is the same call the previous cell already ends with,
# so this cell looks redundant — confirm and consider removing it.
X_processed = preprocessor.fit_transform(X)


In [38]:
# Column labels in the order the ColumnTransformer emits them: the numeric
# columns first ('num'), then the one-hot columns generated by the fitted
# 'cat' encoder.
feature_names = (
    numerical_features +
    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
)

# Reuse X's index so every processed row keeps the same label as its entry in
# `y`. Rows were removed by dropna() earlier, so a fresh 0..n-1 index would no
# longer match y's index and any label-based alignment would silently break.
X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)
X_processed_df.head()


Unnamed: 0,age,fnlwgt,education.num,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
0,-1.880923,-0.422648,-1.308352,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.880923,-0.926304,-0.886472,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.880923,-0.091032,-2.15211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.880923,-0.713216,-1.308352,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.880923,0.136942,-1.308352,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [39]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): the preprocessor was fitted on the full dataset before this
# split, so the scaling statistics already include the test rows — consider
# splitting first and fitting on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed_df,
    y,
    random_state=42,
    test_size=0.2,
)


In [40]:
# Build the export frame: a copy of the processed features with the encoded
# label appended as an 'income' column. `.values` attaches the labels by
# position rather than by index label.
processed_df = X_processed_df.assign(income=y.values)

# Write the result to the working directory without the index column.
processed_df.to_csv("adult_preprocessed.csv", index=False)


In [41]:
# Offer the exported CSV as a browser download when running inside Google
# Colab. In any other environment the `google.colab` package cannot be
# imported, so skip the download instead of failing the notebook.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of adult_preprocessed.csv")
else:
    files.download("adult_preprocessed.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>