**technique of one hot encoding.**

- Understanding Categorical Variables

    - Categorical variables take values from a discrete set of categories rather than a continuous range — for example, whether an ad does or does not include a call to action.
    - These variables need to be represented numerically for analysis.

- One Hot Encoding Explained

    - One hot encoding transforms a single categorical variable into multiple binary variables.
    - For example, an ad with a call to action is represented as 1, while one without it is represented as 0.

- Mathematical Representation

    - The equation for predicting website clicks incorporates these binary variables, allowing for analysis of their impact.
    - Additional categorical variables, like the streaming service, require more binary variables to capture all possibilities.

- Importance of Binary Variables

    - Using binary variables helps to convey all necessary information about categorical variables without losing data.
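    - The final equation reflects the influence of each variable on the outcome. One binary variable per categorical variable is left out as redundant, because a variable with k categories is fully described by k − 1 binary indicators (the omitted category is the case where all of them are 0).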

- Next Steps

    - The content sets the stage for using Python to implement one hot encoding and further explores regression modeling techniques.

-----

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the 'iris' example dataset through seaborn and preview its first rows.
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# One-hot encode 'species' with pandas: the column is replaced by one boolean
# indicator column per category (species_setosa, species_versicolor,
# species_virginica); the numeric columns are left as they are.
df_onehot = pd.get_dummies(iris,columns=['species'])
# NOTE(review): this bare expression is not the last line of the cell, so a
# notebook presumably will not display it; wrap it in print() to see the dtype.
df_onehot['species_setosa'].dtype
df_onehot

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,True,False,False
1,4.9,3.0,1.4,0.2,True,False,False
2,4.7,3.2,1.3,0.2,True,False,False
3,4.6,3.1,1.5,0.2,True,False,False
4,5.0,3.6,1.4,0.2,True,False,False
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,False,False,True
146,6.3,2.5,5.0,1.9,False,False,True
147,6.5,3.0,5.2,2.0,False,False,True
148,6.2,3.4,5.4,2.3,False,False,True


In [6]:
# Load the 'penguins' example dataset through seaborn and display it.
# Unlike iris, it has rows with missing measurements and sex (e.g. row 3).
penguins = sns.load_dataset('penguins')
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [8]:
# One-hot encode the penguins' 'species' column; the new indicator columns
# (species_Adelie, species_Chinstrap, species_Gentoo) are appended after the
# remaining columns. This rebinds df_onehot from the earlier iris result.
df_onehot = pd.get_dummies(penguins,columns=['species'])
# NOTE(review): this bare expression is not the last line of the cell, so a
# notebook presumably will not display it; wrap it in print() to see the dtype.
df_onehot['species_Adelie'].dtype
df_onehot

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo
0,Torgersen,39.1,18.7,181.0,3750.0,Male,True,False,False
1,Torgersen,39.5,17.4,186.0,3800.0,Female,True,False,False
2,Torgersen,40.3,18.0,195.0,3250.0,Female,True,False,False
3,Torgersen,,,,,,True,False,False
4,Torgersen,36.7,19.3,193.0,3450.0,Female,True,False,False
...,...,...,...,...,...,...,...,...,...
339,Biscoe,,,,,,False,False,True
340,Biscoe,46.8,14.3,215.0,4850.0,Female,False,False,True
341,Biscoe,50.4,15.7,222.0,5750.0,Male,False,False,True
342,Biscoe,45.2,14.8,212.0,5200.0,Female,False,False,True


In [9]:
# One-hot encode the penguins' 'island' column instead of 'species'. The
# encoding starts again from the original `penguins` frame, so 'species' is
# kept as a plain column in this result.
df_onehot = pd.get_dummies(penguins,columns=['island'])
# NOTE(review): this bare expression is not the last line of the cell, so a
# notebook presumably will not display it; wrap it in print() to see the dtype.
df_onehot['island_Torgersen'].dtype
df_onehot

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,island_Biscoe,island_Dream,island_Torgersen
0,Adelie,39.1,18.7,181.0,3750.0,Male,False,False,True
1,Adelie,39.5,17.4,186.0,3800.0,Female,False,False,True
2,Adelie,40.3,18.0,195.0,3250.0,Female,False,False,True
3,Adelie,,,,,,False,False,True
4,Adelie,36.7,19.3,193.0,3450.0,Female,False,False,True
...,...,...,...,...,...,...,...,...,...
339,Gentoo,,,,,,True,False,False
340,Gentoo,46.8,14.3,215.0,4850.0,Female,True,False,False
341,Gentoo,50.4,15.7,222.0,5750.0,Male,True,False,False
342,Gentoo,45.2,14.8,212.0,5200.0,Female,True,False,False


By default, `pd.get_dummies` ignores missing values: a row whose category is NaN simply gets False (0) in every dummy column.

Passing `dummy_na=True` adds one extra indicator column (e.g. `species_nan`) that is True for the rows where the value is missing.

In [10]:
# Print a summary of the columns: non-null counts and dtypes. 'species' is the
# only non-numeric column (dtype object).
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Creating the encoder
enc = OneHotEncoder(handle_unknown='ignore')
# handle_unknown='ignore' is about categories that were NOT seen during fit:
# at transform time they are encoded as an all-zero row instead of raising an
# error. It is not a setting for missing values.

# Sample data
X = [['Red'], ['Green'], ['Blue']]

# Fitting the encoder to the data
enc.fit(X)

# Transforming new data
# transform() returns a sparse matrix by default; .toarray() makes it dense.
result = enc.transform([['Red']]).toarray()

# Displaying the encoded result
# The fitted categories are ordered Blue, Green, Red, so 'Red' is the last column.
print(result)

[[0. 0. 1.]]


In [51]:
# Work on a copy so that adding/dropping columns later does not modify `iris`.
iris_onehot = iris.copy()

In [52]:
# sparse_output=False makes the encoder return a dense NumPy array instead of
# a sparse matrix; handle_unknown='ignore' encodes categories unseen during
# fit as all zeros at transform time rather than raising an error.
encoder = OneHotEncoder( sparse_output=False, handle_unknown='ignore')
# Names of the categorical columns to one-hot encode.
cate_one = ['species']


In [53]:
# Learn the categories of 'species' and encode it in one step. The result is a
# float array with one row per sample and one 0/1 column per category.
iris_coded_v = encoder.fit_transform(iris_onehot[cate_one])
iris_coded_v

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

`species` has dtype `object` (strings).

Drop this original column — once its one-hot columns have been added — before fitting the data into a model.


In [43]:
# Display the list of column names that were passed to the encoder.
cate_one

['species']

In [44]:
# Ask the fitted encoder for the names of its output columns; they have the
# form '<input column>_<category>', in the same order as the encoded array.
coded_cols = encoder.get_feature_names_out(cate_one)
coded_cols


array(['species_setosa', 'species_versicolor', 'species_virginica'],
      dtype=object)

In [54]:
# Adding the columns to the dataframe
# Each encoded array column is stored under the matching name in coded_cols.
iris_onehot[coded_cols] = iris_coded_v
# Remove the original string column now that it is represented numerically;
# inplace=True modifies iris_onehot directly instead of returning a new frame.
iris_onehot.drop(columns= ['species'],inplace = True)
iris_onehot

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,1.0,0.0,0.0
1,4.9,3.0,1.4,0.2,1.0,0.0,0.0
2,4.7,3.2,1.3,0.2,1.0,0.0,0.0
3,4.6,3.1,1.5,0.2,1.0,0.0,0.0
4,5.0,3.6,1.4,0.2,1.0,0.0,0.0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0.0,0.0,1.0
146,6.3,2.5,5.0,1.9,0.0,0.0,1.0
147,6.5,3.0,5.2,2.0,0.0,0.0,1.0
148,6.2,3.4,5.4,2.3,0.0,0.0,1.0


In [32]:
# Work on a copy so the original `penguins` frame is left unchanged.
df_penguins = penguins.copy()

In [33]:
# Fresh encoder for the penguins data (rebinds the `encoder` name): dense array
# output, and categories unseen during fit are encoded as all zeros at
# transform time rather than raising an error.
encoder = OneHotEncoder( sparse_output=False, handle_unknown='ignore')
# First categorical column to encode.
cate_penguins1 = ['species']

In [34]:
# Fit on 'species' and encode it: one row per penguin, one 0/1 column per species.
penguins_onehot = encoder.fit_transform(df_penguins[cate_penguins1])
penguins_onehot


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], shape=(344, 3))

In [35]:
# Second categorical column to encode.
cate_penguins2 = ['island']
# NOTE: this refits the same `encoder` object, so from here on its learned
# categories (and get_feature_names_out) describe 'island', not 'species'.
penguins_onehot_island = encoder.fit_transform(df_penguins[cate_penguins2])
penguins_onehot_island

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], shape=(344, 3))