## One Hot Encoding --

1. Using Pandas (get_dummies)
2. Using Sklearn library (from sklearn.preprocessing import OneHotEncoder)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv("D:\OneDrive - Northeastern University\Jupyter Notebook\Machine Learning Algorithms\Datasets/Clean_data")

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,208500,0,0,0
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,181500,0,0,0
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,223500,0,0,0
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,140000,0,0,0
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,250000,0,0,0


In [4]:
#split the data to avoid over fitting
X_train, X_test, y_train, y_test = train_test_split(train, train["SalePrice"], test_size=0.3, random_state=42)
X_train.shape , X_test.shape

((1022, 84), (438, 84))

In [5]:
#creating a small subset of the data to make it more understandable
X_train = X_train[["Alley","LotShape","GarageCond","MasVnrArea","SalePrice"]]
X_train

Unnamed: 0,Alley,LotShape,GarageCond,MasVnrArea,SalePrice
135,Missing,Reg,TA,288.0,174000
1452,Missing,Reg,TA,80.0,145000
762,Missing,Reg,TA,0.0,215200
932,Missing,IR1,TA,302.0,320000
435,Missing,IR2,TA,0.0,212000
...,...,...,...,...,...
1095,Missing,IR1,TA,0.0,176432
1130,Missing,Reg,TA,0.0,135000
1294,Missing,Reg,TA,0.0,115000
860,Missing,Reg,TA,0.0,189950


# 1. One- Hot Encoding Using Pandas

In [6]:
#converting into dummy variables
dummy = pd.get_dummies(X_train,drop_first=True)

### A. Before One hot Encoding

In [7]:
#data before the one hot encoding 
X_train.head()

Unnamed: 0,Alley,LotShape,GarageCond,MasVnrArea,SalePrice
135,Missing,Reg,TA,288.0,174000
1452,Missing,Reg,TA,80.0,145000
762,Missing,Reg,TA,0.0,215200
932,Missing,IR1,TA,302.0,320000
435,Missing,IR2,TA,0.0,212000


In [8]:
X_train.columns

Index(['Alley', 'LotShape', 'GarageCond', 'MasVnrArea', 'SalePrice'], dtype='object')

### B. After One-Hot encoding

In [9]:
#data after encoding
dummy.head()

Unnamed: 0,MasVnrArea,SalePrice,Alley_Missing,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,GarageCond_Fa,GarageCond_Gd,GarageCond_Missing,GarageCond_Po,GarageCond_TA
135,288.0,174000,True,False,False,False,True,False,False,False,False,True
1452,80.0,145000,True,False,False,False,True,False,False,False,False,True
762,0.0,215200,True,False,False,False,True,False,False,False,False,True
932,302.0,320000,True,False,False,False,False,False,False,False,False,True
435,0.0,212000,True,False,True,False,False,False,False,False,False,True


In [10]:
def dummy_coding(data):
    data = data.astype(int)
    # Replace the True and False values
    data = data.replace({'True':'1','False':'0'})
    print(data.info())
    return data

dummy_coding(dummy)

<class 'pandas.core.frame.DataFrame'>
Index: 1022 entries, 135 to 1126
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   MasVnrArea          1022 non-null   int32
 1   SalePrice           1022 non-null   int32
 2   Alley_Missing       1022 non-null   int32
 3   Alley_Pave          1022 non-null   int32
 4   LotShape_IR2        1022 non-null   int32
 5   LotShape_IR3        1022 non-null   int32
 6   LotShape_Reg        1022 non-null   int32
 7   GarageCond_Fa       1022 non-null   int32
 8   GarageCond_Gd       1022 non-null   int32
 9   GarageCond_Missing  1022 non-null   int32
 10  GarageCond_Po       1022 non-null   int32
 11  GarageCond_TA       1022 non-null   int32
dtypes: int32(12)
memory usage: 55.9 KB
None


Unnamed: 0,MasVnrArea,SalePrice,Alley_Missing,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,GarageCond_Fa,GarageCond_Gd,GarageCond_Missing,GarageCond_Po,GarageCond_TA
135,288,174000,1,0,0,0,1,0,0,0,0,1
1452,80,145000,1,0,0,0,1,0,0,0,0,1
762,0,215200,1,0,0,0,1,0,0,0,0,1
932,302,320000,1,0,0,0,0,0,0,0,0,1
435,0,212000,1,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0,176432,1,0,0,0,0,0,0,0,0,1
1130,0,135000,1,0,0,0,1,0,0,0,0,1
1294,0,115000,1,0,0,0,1,0,0,0,0,1
860,0,189950,1,0,0,0,1,0,0,0,0,1


# 2. One Hot Encoing Using Sci-kit Learn

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
encoder = OneHotEncoder(categories="auto", #learning the category automatically
                        drop="first", #creating k-1 category
                       sparse=False, # this will return numpy array else it will return sparse matix
                        handle_unknown="error")  #helps to deal with unknown values

#here we have to remove continious variable and then feed it to the encoder
encoder.fit(X_train.drop(["MasVnrArea","SalePrice"],axis=1))



In [13]:
encoder.categories_

[array(['Grvl', 'Missing', 'Pave'], dtype=object),
 array(['IR1', 'IR2', 'IR3', 'Reg'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'Missing', 'Po', 'TA'], dtype=object)]

In [14]:
#transforming the data
variables = encoder.transform(X_train.drop(["MasVnrArea","SalePrice"],axis=1))
variables

array([[1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.]])

In [15]:
#converting into dataframe
variables = pd.DataFrame(variables, columns = encoder.get_feature_names_out(["Alley","LotShape","GarageCond"]))
variables = variables.astype(int)
variables

Unnamed: 0,Alley_Missing,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,GarageCond_Fa,GarageCond_Gd,GarageCond_Missing,GarageCond_Po,GarageCond_TA
0,1,0,0,0,1,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,1
2,1,0,0,0,1,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,1
4,1,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1017,1,0,0,0,0,0,0,0,0,1
1018,1,0,0,0,1,0,0,0,0,1
1019,1,0,0,0,1,0,0,0,0,1
1020,1,0,0,0,1,0,0,0,0,1


In [16]:
#concatinating the two data
df= pd.concat([variables, X_train[["MasVnrArea","SalePrice"]]],axis=1)

In [17]:
df.head()

Unnamed: 0,Alley_Missing,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,GarageCond_Fa,GarageCond_Gd,GarageCond_Missing,GarageCond_Po,GarageCond_TA,MasVnrArea,SalePrice
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,196.0,208500.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,181500.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,162.0,223500.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,140000.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,350.0,250000.0
