# One-Hot Encoding

In [4]:
import pandas as pd

In [5]:
# Read the home-price CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="home_prices3.csv")

# Show the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0,locality,area_sqr_ft,price_lakhs,bedrooms
0,Kollur,656,39.0,2
1,Kollur,1260,83.2,2
2,Kollur,1057,86.6,3
3,Kollur,1259,59.0,2
4,Kollur,1800,140.0,3


# One-hot encoding using pandas

In [9]:
# One-hot encode `locality`. drop_first=True removes the first category level,
# which becomes the implicit baseline (all locality_* columns False) and keeps
# the dummy columns from being perfectly collinear.
df_encoded = pd.get_dummies(df, columns=["locality"], drop_first=True)

# Seed the preview so the sampled rows are identical on every re-run.
df_encoded.sample(5, random_state=42)

Unnamed: 0,area_sqr_ft,price_lakhs,bedrooms,locality_Kollur,locality_Mankhal
3,1259,59.0,2,True,False
18,1008,50.0,2,False,True
14,1100,85.0,2,False,False
13,2400,300.0,3,False,False
4,1800,140.0,3,True,False


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Target: the price column; features: every remaining column.
y = df_encoded['price_lakhs']
X = df_encoded.drop(columns=['price_lakhs'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Ordinary least-squares regression with default settings.
model = LinearRegression()
# Fit on the training split only; the test split stays unseen until scoring.
model.fit(X=X_train, y=y_train)

In [13]:
# Coefficient of determination (R^2) on the held-out split.
model.score(X=X_test, y=y_test)

0.8558905263155381

In [14]:
# Two hypothetical homes with the same area and bedroom count that differ only
# in locality: row 0 has both dummies False (the baseline locality), row 1 is
# flagged as Mankhal.
test = pd.DataFrame({
    'area_sqr_ft': [1600, 1600],
    'bedrooms': [2, 2],
    'locality_Kollur': [False, False],
    'locality_Mankhal': [False, True],
})

In [15]:
# Predicted prices for the two hand-built rows, in the same row order as `test`.
model.predict(X=test)

array([157.03383393, 109.25104283])

# One-hot encoding using scikit-learn

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
# drop='first' removes the first category so it acts as the baseline and the
# dummy columns are not perfectly collinear; sparse_output=False makes the
# encoder return a dense NumPy array instead of a sparse matrix.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Double brackets select `locality` as a single-column DataFrame (2-D input).
encoded_array = encoder.fit_transform(df[['locality']])

# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# 0..n-1 index here would misalign (and introduce NaNs) whenever df carries a
# non-default index.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(['locality']),
    index=df.index,
)

# Swap the raw categorical column for its encoded columns.
df_final = pd.concat([df.drop(columns=['locality']), encoded_df], axis=1)

print(df_final.head())

   area_sqr_ft  price_lakhs  bedrooms  locality_Kollur  locality_Mankhal
0          656         39.0         2              1.0               0.0
1         1260         83.2         2              1.0               0.0
2         1057         86.6         3              1.0               0.0
3         1259         59.0         2              1.0               0.0
4         1800        140.0         3              1.0               0.0
