# 2. Encoding Categorical Variables in a Car Evaluation Dataset

Importing the libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Loading the dataset

In [8]:
# The file is read with header=None, so its first line is treated as data
# rather than as column names; the names are attached explicitly afterwards.
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
car_df = pd.read_csv('./datasets/car_evaluation.csv', header=None)
car_df.columns = column_names

Display the first few rows of the dataset

In [9]:
# Preview the first five rows (the default for head) as the cell's rich output.
car_df.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


Identifying categorical columns

In [10]:
# Treat every object-dtype column as categorical.
# NOTE(review): the recorded outputs further down suggest that all seven
# columns -- including `class` -- are object dtype and therefore end up in
# this selection; confirm that encoding `class` alongside the others is intended.
categorical_cols = car_df.select_dtypes(include="object").columns

# One-Hot Encoding

In [12]:
# One-hot encode the categorical columns. drop='first' removes the first level
# of each column, so a column with k levels yields k-1 indicator columns.
#
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2, and the old name was removed in 1.4. Try the current name
# first and fall back to the legacy one so the cell runs on either side of
# the rename.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')

one_hot_encoded = one_hot_encoder.fit_transform(car_df[categorical_cols])

# Build the encoded frame on car_df's index so a later index-based join lines
# the rows up one-to-one with the original data.
one_hot_encoded_df = pd.DataFrame(
    one_hot_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_cols),
    index=car_df.index,
)



Dropping the original categorical columns and concatenating the one-hot encoded columns

In [13]:
# Remove the raw categorical columns, then attach their one-hot replacements.
# DataFrame.join aligns on the index, which both frames share.
remaining_df = car_df.drop(columns=categorical_cols)
car_df_one_hot = remaining_df.join(one_hot_encoded_df)

one_hot_preview = car_df_one_hot.head()
print("\nDataset after One-Hot Encoding:\n", one_hot_preview)


Dataset after One-Hot Encoding:
    buying_low  buying_med  buying_vhigh  maint_low  maint_med  maint_vhigh  \
0         0.0         0.0           1.0        0.0        0.0          1.0   
1         0.0         0.0           1.0        0.0        0.0          1.0   
2         0.0         0.0           1.0        0.0        0.0          1.0   
3         0.0         0.0           1.0        0.0        0.0          1.0   
4         0.0         0.0           1.0        0.0        0.0          1.0   

   doors_3  doors_4  doors_5more  persons_4  persons_more  lug_boot_med  \
0      0.0      0.0          0.0        0.0           0.0           0.0   
1      0.0      0.0          0.0        0.0           0.0           0.0   
2      0.0      0.0          0.0        0.0           0.0           0.0   
3      0.0      0.0          0.0        0.0           0.0           1.0   
4      0.0      0.0          0.0        0.0           0.0           1.0   

   lug_boot_small  safety_low  safety_med  cla

# Label Encoding

In [14]:
# Label-encode each categorical column into integer codes.
#
# A separate LabelEncoder is fitted per column and kept in `label_encoders`
# (keyed by column name). Refitting one shared encoder would overwrite its
# learned classes on every iteration, leaving only the last column's mapping
# available for inspection or inverse_transform.
#
# NOTE: judging by the recorded output below (safety: high -> 0, low -> 1,
# med -> 2), the codes follow the sorted order of the labels, not the natural
# low < med < high ranking -- do not read them as ordinal magnitudes.
label_encoders = {}
car_df_label_encoded = car_df.copy()
for col in categorical_cols:
    # `label_encoder` is rebound each pass; after the loop it still refers to
    # the encoder fitted on the last column, as it did before this change.
    label_encoder = LabelEncoder()
    car_df_label_encoded[col] = label_encoder.fit_transform(car_df[col])
    label_encoders[col] = label_encoder

print("\nDataset after Label Encoding:\n", car_df_label_encoded.head())



Dataset after Label Encoding:
    buying  maint  doors  persons  lug_boot  safety  class
0       3      3      0        0         2       1      2
1       3      3      0        0         2       2      2
2       3      3      0        0         2       0      2
3       3      3      0        0         1       1      2
4       3      3      0        0         1       2      2


# Comparing the results

In [15]:
# Compare the (rows, columns) footprint of the two encodings: one-hot widens
# the frame, label encoding keeps the original column count.
one_hot_shape = car_df_one_hot.shape
label_encoded_shape = car_df_label_encoded.shape
print("\nOne-Hot Encoding Shape:", one_hot_shape)
print("Label Encoding Shape:", label_encoded_shape)


One-Hot Encoding Shape: (1728, 18)
Label Encoding Shape: (1728, 7)
