In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder



In [2]:
# Toy data set: one numeric column (input1) and two categorical
# columns (input2, input3) that will be one-hot encoded below.
X = pd.DataFrame(
    {
        "input1": [1, 2, 3, 4, 5],
        "input2": ["A", "A", "B", "B", "C"],
        "input3": ["X", "X", "X", "Y", "Y"],
    }
)

In [3]:
# Show the raw frame before any encoding is applied.
X

Unnamed: 0,input1,input2,input3
0,1,A,X
1,2,A,X
2,3,B,X
3,4,B,Y
4,5,C,Y


In [4]:
# Names of the columns to one-hot encode; input1 is numeric and is left as-is.
categorical_vars = ["input2", "input3"]

In [5]:
# Instantiate the one-hot encoder.
# sparse_output=False makes fit_transform return a dense NumPy array instead of
# a SciPy sparse matrix. (The older `sparse=` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, so it fails on current releases.)
one_hot_encoder = OneHotEncoder(sparse_output=False)

In [6]:
# Learn the categories of each categorical column and encode them in one step.
categorical_frame = X[categorical_vars]
encoder_vars_array = one_hot_encoder.fit_transform(categorical_frame)



In [7]:
# Inspect the encoded array: one row per row of X, one column per category.
encoder_vars_array

array([[1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.]])

In [8]:
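# The array holds the binary (one-hot) representations of input2 and input3.
# input2 has 3 unique values and input3 has 2, which gives 5 columns in total.
# However, the array alone does not tell us what each column represents,
# so next we ask the encoder for the feature names.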

In [9]:
# Ask the fitted encoder for the output column names, prefixed with the
# name of the source column each category came from.
encoder_feature_names = one_hot_encoder.get_feature_names_out(
    input_features=categorical_vars
)

In [10]:
# Inspect the generated names; their order matches the columns of encoder_vars_array.
encoder_feature_names

array(['input2_A', 'input2_B', 'input2_C', 'input3_X', 'input3_Y'],
      dtype=object)

In [11]:
# Wrap the encoded array in a data frame so every column carries its feature name.
encoder_vars_df = pd.DataFrame(
    data=encoder_vars_array,
    columns=encoder_feature_names,
)

In [12]:
# We now have a data frame containing the encoded columns.
encoder_vars_df

Unnamed: 0,input2_A,input2_B,input2_C,input3_X,input3_Y
0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0


In [13]:
# Concatenate the new data frame back onto the original data frame

In [14]:
# Reset both indexes first so that rows are matched by position; if the
# indexes differed, concat would misalign rows and introduce missing values.
original_frame = X.reset_index(drop=True)
encoded_frame = encoder_vars_df.reset_index(drop=True)

# axis=1 tells pandas to concatenate columns (side by side), not rows.
X_new = pd.concat([original_frame, encoded_frame], axis=1)

In [15]:
# Original columns followed by the one-hot encoded columns.
X_new

Unnamed: 0,input1,input2,input3,input2_A,input2_B,input2_C,input3_X,input3_Y
0,1,A,X,1.0,0.0,0.0,1.0,0.0
1,2,A,X,1.0,0.0,0.0,1.0,0.0
2,3,B,X,0.0,1.0,0.0,1.0,0.0
3,4,B,Y,0.0,1.0,0.0,0.0,1.0
4,5,C,Y,0.0,0.0,1.0,0.0,1.0


In [16]:
# Drop the original input2 and input3 columns now that their encoded versions exist.
# DataFrame.drop returns a NEW frame and leaves X_new untouched, so the result
# must be stored; previously it was only displayed and then discarded.
# A new name is used instead of inplace=True so the cell can be re-run safely.
X_encoded = X_new.drop(columns=categorical_vars)
X_encoded

Unnamed: 0,input1,input2_A,input2_B,input2_C,input3_X,input3_Y
0,1,1.0,0.0,0.0,1.0,0.0
1,2,1.0,0.0,0.0,1.0,0.0
2,3,0.0,1.0,0.0,1.0,0.0
3,4,0.0,1.0,0.0,0.0,1.0
4,5,0.0,0.0,1.0,0.0,1.0


In [18]:
# One-hot encoding every category makes the encoded columns of a feature sum
# to 1, i.e. perfectly collinear -- the "dummy variable trap".
# drop="first" removes the first category of each feature to avoid this.
# sparse_output=False returns a dense array (the older `sparse=` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4).
one_hot_encoder = OneHotEncoder(sparse_output=False, drop="first")

In [22]:
# Re-run the full encoding pipeline with the current one_hot_encoder.
encoder_vars_array = one_hot_encoder.fit_transform(X[categorical_vars])
encoder_feature_names = one_hot_encoder.get_feature_names_out(categorical_vars)
encoder_vars_df = pd.DataFrame(encoder_vars_array, columns=encoder_feature_names)

# Reset indexes so rows line up by position; axis=1 concatenates columns.
X_new = pd.concat(
    [X.reset_index(drop=True), encoder_vars_df.reset_index(drop=True)],
    axis=1,
)

# drop() returns a new frame rather than modifying X_new, so keep the result
# under its own name and display it as the cell output.
X_encoded = X_new.drop(columns=categorical_vars)
X_encoded



Unnamed: 0,input1,input2_B,input2_C,input3_Y
0,1,0.0,0.0,0.0
1,2,0.0,0.0,0.0
2,3,1.0,0.0,0.0
3,4,1.0,0.0,1.0
4,5,0.0,1.0,1.0


In [None]:
# We now have three encoded columns: one category was dropped for input2 and one for input3