In [1]:
import pandas as pd
from sklearn.datasets import load_iris





In [2]:
# --- When to Use One-Hot Encoding ---
# One-hot encoding converts a categorical variable into a set of binary (0/1)
# columns, a numerical format that machine learning algorithms can consume.
#
# It should be used when:
# 1. The categorical data is nominal, meaning the categories do not have a natural order
#    or rank (e.g., 'Red', 'Green', 'Blue').
# 2. You are using a model that does not inherently handle categorical data or might
#    misinterpret ordered integers (e.g., Linear Regression, Logistic Regression, SVMs).
#
# --- Requirements ---
# The input data for one-hot encoding should be a categorical feature, typically
# represented as strings or integers that stand for different categories.



In [3]:
# 1. Load a Sample Dataset
# We'll use the famous Iris dataset from scikit-learn. It's a simple dataset
# containing measurements for three species of iris flowers.
iris = load_iris()
# Convert the measurements to a pandas DataFrame for easier manipulation.
# `iris.target` holds the species of each flower as an integer (0, 1, 2);
# it is stored below in the 'species_id' column.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species_id'] = iris.target

# For demonstration purposes, map the integer target to string names to make
# the categorical nature more explicit. The id -> name lookup is built from
# `iris.target_names` (position i is the name of class i) instead of being
# typed out by hand, so it cannot drift from the dataset's own labels.
# `str(...)` turns the NumPy string scalars into plain Python strings.
species_map = {class_id: str(name) for class_id, name in enumerate(iris.target_names)}
iris_df['species_name'] = iris_df['species_id'].map(species_map)

# `Series.map` with a dict silently produces NaN for ids that are missing from
# the lookup, so fail loudly here rather than one-hot encode an incomplete column.
assert iris_df['species_name'].notna().all(), "unmapped species_id values found"

print("--- Original DataFrame (Head) ---")
print(iris_df.head())
print("\nValue counts of the original 'species_name' column:")
print(iris_df['species_name'].value_counts())




--- Original DataFrame (Head) ---
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species_id species_name  
0           0       setosa  
1           0       setosa  
2           0       setosa  
3           0       setosa  
4           0       setosa  

Value counts of the original 'species_name' column:
species_name
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64


In [4]:
# 2. Select the Feature to Encode
# We will apply one-hot encoding to the 'species_name' column. This column is
# nominal categorical data, making it a perfect candidate for this technique.
# --- Why not just use the 'species_id' column? ---
# Although 'species_id' contains numbers (0, 1, 2), using it directly can be misleading.
# Many models would incorrectly assume that the species have a mathematical relationship
# or order (e.g., that 'virginica' (2) is twice the value of 'versicolor' (1)).
# This is not true; the categories are nominal (no inherent order).
# One-hot encoding avoids this problem by converting each category into a separate
# binary feature, removing any false sense of order and allowing the model to treat
# each species independently.


In [5]:
# 3. Apply One-Hot Encoding
# pandas' `get_dummies()` performs the encoding in a single call: every unique
# value found in the listed column becomes its own indicator column, and the
# source column is removed from the result.
#   - `columns` restricts the encoding to 'species_name'; all other columns
#     (including 'species_id') pass through unchanged.
#   - `prefix` controls the new column names ('species_<category>').
#   - `dtype=int` stores the indicators as 0/1 integers.
one_hot_encoded_df = pd.get_dummies(
    iris_df,
    columns=['species_name'],
    prefix='species',
    dtype=int,
)



In [6]:
# 4. Show the Result
# 'species_name' no longer appears in the frame; in its place are the three
# indicator columns 'species_setosa', 'species_versicolor' and
# 'species_virginica'. Exactly one of them is 1 in any given row (the species
# of that flower) and the other two are 0.
#
# Both ends of the frame are printed: the head and the tail, so that rows
# where other species are encoded are shown as well.
encoded_previews = (
    ("\n\n--- DataFrame after One-Hot Encoding (Head) ---",
     one_hot_encoded_df.head()),
    ("\n\n--- DataFrame after One-Hot Encoding (Tail, to show other species) ---",
     one_hot_encoded_df.tail()),
)
for heading, frame_slice in encoded_previews:
    print(heading)
    print(frame_slice)





--- DataFrame after One-Hot Encoding (Head) ---
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species_id  species_setosa  species_versicolor  species_virginica  
0           0               1                   0                  0  
1           0               1                   0                  0  
2           0               1                   0                  0  
3           0               1                   0                  0  
4           0               1                   0                  0  


--- DataFrame after One-Hot Encoding (Tail, to 