In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [11]:
df = pd.read_csv("D:\OneDrive - Northeastern University\Jupyter Notebook\Machine Learning Algorithms\Datasets/Clean_data")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,208500,0,0,0
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,181500,0,0,0
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,223500,0,0,0
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,140000,0,0,0
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,250000,0,0,0


In [12]:
#creating a small subset of the data to make it more understandable
df = df[["Alley","LotShape","GarageCond","MasVnrArea","SalePrice"]]
df.head(5)

Unnamed: 0,Alley,LotShape,GarageCond,MasVnrArea,SalePrice
0,Missing,Reg,TA,196.0,208500
1,Missing,Reg,TA,0.0,181500
2,Missing,IR1,TA,162.0,223500
3,Missing,IR1,TA,0.0,140000
4,Missing,IR1,TA,350.0,250000


# One- Hot Encoding Using Pandas (Get Dummies)

In [9]:
#converting into dummy variables
def dummy_encoding(data):
    dummy = pd.get_dummies(data, drop_first=True)
    print("The columns of the data are:", dummy.columns,'\n')
    
    # Converting True/False to 1/0
    dummy = dummy.astype(int)
    # Replace the True and False values
    dummy = dummy.replace({'True':'1','False':'0'})
    print(dummy.info(),'\n')
    
    return dummy

In [10]:
dummy_encoding(df)

The columns of the data are: Index(['MasVnrArea', 'SalePrice', 'Alley_Missing', 'Alley_Pave',
       'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'GarageCond_Fa',
       'GarageCond_Gd', 'GarageCond_Missing', 'GarageCond_Po',
       'GarageCond_TA'],
      dtype='object') 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   MasVnrArea          1460 non-null   int32
 1   SalePrice           1460 non-null   int32
 2   Alley_Missing       1460 non-null   int32
 3   Alley_Pave          1460 non-null   int32
 4   LotShape_IR2        1460 non-null   int32
 5   LotShape_IR3        1460 non-null   int32
 6   LotShape_Reg        1460 non-null   int32
 7   GarageCond_Fa       1460 non-null   int32
 8   GarageCond_Gd       1460 non-null   int32
 9   GarageCond_Missing  1460 non-null   int32
 10  GarageCond_Po       1460 non-null   int32
 11 

Unnamed: 0,MasVnrArea,SalePrice,Alley_Missing,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,GarageCond_Fa,GarageCond_Gd,GarageCond_Missing,GarageCond_Po,GarageCond_TA
0,196,208500,1,0,0,0,1,0,0,0,0,1
1,0,181500,1,0,0,0,1,0,0,0,0,1
2,162,223500,1,0,0,0,0,0,0,0,0,1
3,0,140000,1,0,0,0,0,0,0,0,0,1
4,350,250000,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0,175000,1,0,0,0,1,0,0,0,0,1
1456,119,210000,1,0,0,0,1,0,0,0,0,1
1457,0,266500,1,0,0,0,1,0,0,0,0,1
1458,0,142125,1,0,0,0,1,0,0,0,0,1


# One Hot Encoing Using Sci-kit Learn

In [39]:
def one_hot_encoding(data):
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder(categories="auto", drop="first", sparse=False, handle_unknown="error")
    # Fit
    encoder.fit(data)
    # Transform
    variables = encoder.transform(data)
    print("The Categorial Columns are:",encoder.categories_)
    final_data = pd.DataFrame(variables, columns = encoder.get_feature_names_out(data.columns))
    
    return final_data

In [29]:
df_cat_values = df.drop(["MasVnrArea","SalePrice"],axis=1)
one_hot_encoding(df_cat_values)

The Categorial Columns are: [array(['Grvl', 'Missing', 'Pave'], dtype=object), array(['IR1', 'IR2', 'IR3', 'Reg'], dtype=object), array(['Ex', 'Fa', 'Gd', 'Missing', 'Po', 'TA'], dtype=object)]


Unnamed: 0,Alley_Missing,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,GarageCond_Fa,GarageCond_Gd,GarageCond_Missing,GarageCond_Po,GarageCond_TA
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1455,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1456,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1457,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1458,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
