## Label Encoding of Nominal Data

In [1]:
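# LabelEncoder from sklearn.preprocessing converts categorical labels into integer codes (0 .. n_classes-1)
# It is meant for encoding target variables (y) in classification. Classes are sorted before being numbered,
# so any real-world order of the categories is NOT preserved; for input features prefer
# OrdinalEncoder (ordinal data) or OneHotEncoder (nominal data).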

# If numbers represent names, labels, or IDs → it's nominal
# If numbers represent counts, measurements, or order → it’s numeric or ordinal
from sklearn.preprocessing import LabelEncoder

# Method	              What it does	                                                  When to use
# .fit()	              Learns the encoding (e.g., maps categories to numbers)	        First time you train on data
# .transform()	        Applies the learned encoding to new data	                      Reuses the learned mapping
# .fit_transform()     	Does both .fit() and .transform() together	                    Use on training data to fit and encode

# Steps 1 + 2: create the encoder and fit it in one chained call.
# fit() hands back the encoder itself, so `x` ends up bound to the fitted instance.
#
# What fit() does with [1, 1, 2, 6]:
#   - collects the distinct values in sorted order -> [1, 2, 6]
#   - uses each value's position in that sorted list as its code:
#       1 -> 0
#       2 -> 1
#       6 -> 2
# transform() reuses this mapping later, e.g. x.transform([1, 2, 6]) gives [0, 1, 2].
x = LabelEncoder().fit([1, 1, 2, 6])

# Step 3: show the learned classes.
# The position of a class in this array is its encoded value
# (class 1 -> 0, class 2 -> 1, class 6 -> 2).
x.classes_

array([1, 2, 6])

In [3]:
# Encode raw labels into their integer codes, using the mapping `x` learned in fit().
labels_to_encode = [1, 1, 2, 6]
x.transform(labels_to_encode)

array([0, 0, 1, 2])

In [5]:
# Decode integer codes back into the original labels with inverse_transform().
codes_to_decode = [0, 0, 1, 2]
x.inverse_transform(codes_to_decode)

array([1, 1, 2, 6])

## Label Encoding of Categorical Data

In [10]:
# Create a second encoder and fit it on string labels in one chained call
# (fit() returns the encoder itself, so `y` is the fitted instance).
y = LabelEncoder().fit(["paris", "paris", "tokyo", "amsterdam"])

# Show the learned classes as a plain Python list.
learned_city_classes = list(y.classes_)
learned_city_classes

[np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')]

In [12]:
# Encode city names into their integer codes, using the mapping `y` learned in fit().
cities_to_encode = ["tokyo", "tokyo", "paris"]
y.transform(cities_to_encode)

array([2, 2, 1])

In [14]:
# Decode integer codes back into city names and show them as a plain Python list.
city_codes_to_decode = [2, 2, 1]
decoded_cities = y.inverse_transform(city_codes_to_decode)
list(decoded_cities)

[np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')]

## Gender Identification

In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Toy dataset: two numeric features, three categorical features, and the Gender target.
data = {
    'Height': [180.3, 170.0, 178.5, 163.4, 175.2222, 165.0],
    'Weight': [196, 120, 200, 110, 220, 150],
    'Hair': ['Bald', 'Long', 'Short', 'Medium', 'Short', 'Medium'],
    'Beard': ['Yes', 'No', 'No', 'No', 'Yes', 'No'],
    'Scarf': ['No', 'No', 'No', 'Yes', 'No', 'Yes'],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
}
train_dataset = pd.DataFrame(data)

# Snapshot the untouched frame twice BEFORE any encoded_* columns are added:
# one copy stays as the pristine original, the other will receive the integer codes.
train_dataset_original = train_dataset.copy()
train_dataset_encoded = train_dataset.copy()

# One encoder per categorical column, so each keeps its own label <-> code mapping.
gender_label_encoder = LabelEncoder()
scarf_label_encoder = LabelEncoder()
beard_label_encoder = LabelEncoder()
hair_label_encoder = LabelEncoder()

# (source column, encoded column, encoder), listed in the order the encoded_* columns
# are appended to train_dataset. Height and Weight are already numeric and need no encoder.
encoding_plan = [
    ('Gender', "encoded_gender", gender_label_encoder),
    ('Scarf', "encoded_scarf", scarf_label_encoder),
    ('Beard', "encoded_beard", beard_label_encoder),
    ('Hair', "encoded_hair", hair_label_encoder),
]

# Fit each encoder on its column and store the resulting codes next to the raw values.
for source_column, encoded_column, column_encoder in encoding_plan:
    train_dataset[encoded_column] = column_encoder.fit_transform(train_dataset[source_column])

# Final encoded dataset: overwrite each categorical column with its integer codes.
# The columns already exist in train_dataset_encoded, so the column order is unchanged.
for source_column, encoded_column, column_encoder in encoding_plan:
    train_dataset_encoded[source_column] = train_dataset[encoded_column]

# Save the encoded dataset as CSV (without the row index).
encoded_csv_path = "train_dataset_encoded.csv"
train_dataset_encoded.to_csv(encoded_csv_path, index=False)

print("Original Data:\n", train_dataset_original)
print("\nEncoded Data:\n", train_dataset_encoded)
print(f"\nCSV file saved as: {encoded_csv_path}")


Original Data:
      Height  Weight    Hair Beard Scarf  Gender
0  180.3000     196    Bald   Yes    No    Male
1  170.0000     120    Long    No    No  Female
2  178.5000     200   Short    No    No    Male
3  163.4000     110  Medium    No   Yes  Female
4  175.2222     220   Short   Yes    No    Male
5  165.0000     150  Medium    No   Yes  Female

Encoded Data:
      Height  Weight  Hair  Beard  Scarf  Gender
0  180.3000     196     0      1      0       1
1  170.0000     120     1      0      0       0
2  178.5000     200     3      0      0       1
3  163.4000     110     2      0      1       0
4  175.2222     220     3      1      0       1
5  165.0000     150     2      0      1       0

CSV file saved as: train_dataset_encoded.csv
