# 2. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

## 3. Load Dataset

In [2]:
url = "https://raw.githubusercontent.com/akdubey2k/ML/main/ML_4_OneHotEncoding_and_DummyVariable/ML_4_OneHotEncoding_homeprices.csv"
df = pd.read_csv(url)
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


# 4. Explore Dataset (EDA - Exploratory Data Analysis (EDA))

In [29]:
print("\nnumber of elements in the dataset ".ljust(50, '.'), ": ", df.size)
print("\nnumber of array dimensions of dataset ".ljust(50, '.'), ": ", df.ndim)
# prints information about a dataset, including the index, dtype and columns, non-null values and memory usage.
print("\ninformation about a dataset".ljust(50, '.'), ": ")
print(df.info)
print("\ndimensions of dataset in matrix (rows & columns) ".ljust(50, '.'), ": ", df.shape)


number of elements in the dataset ............... :  39

number of array dimensions of dataset ........... :  2

information about a dataset...................... : 
<bound method DataFrame.info of     town  area   price
0      0  2600  550000
1      0  3000  565000
2      0  3200  610000
3      0  3600  680000
4      0  4000  725000
5      2  2600  585000
6      2  2800  615000
7      2  3300  650000
8      2  3600  710000
9      1  2600  575000
10     1  2900  600000
11     1  3100  620000
12     1  3600  695000>

dimensions of dataset in matrix (rows & columns)  :  (13, 3)


In [30]:
dummies = pd.get_dummies(df.town).values
dummies

array([[ True, False, False],
       [ True, False, False],
       [ True, False, False],
       [ True, False, False],
       [ True, False, False],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False,  True, False],
       [False,  True, False],
       [False,  True, False],
       [False,  True, False]])

In [31]:
merged = pd.concat([df, dummies], axis='columns')
merged

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [5]:
merged = merged.drop(['town'], axis='columns')
merged

Unnamed: 0,area,price,monroe township,robinsville,west windsor
0,2600,550000,True,False,False
1,3000,565000,True,False,False
2,3200,610000,True,False,False
3,3600,680000,True,False,False
4,4000,725000,True,False,False
5,2600,585000,False,False,True
6,2800,615000,False,False,True
7,3300,650000,False,False,True
8,3600,710000,False,False,True
9,2600,575000,False,True,False


In [6]:
final = merged.drop(['west windsor'], axis='columns')
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,True,False
1,3000,565000,True,False
2,3200,610000,True,False
3,3600,680000,True,False
4,4000,725000,True,False
5,2600,585000,False,False
6,2800,615000,False,False
7,3300,650000,False,False
8,3600,710000,False,False
9,2600,575000,False,True


In [7]:
X = final.drop(['price'], axis='columns')
X

Unnamed: 0,area,monroe township,robinsville
0,2600,True,False
1,3000,True,False
2,3200,True,False
3,3600,True,False
4,4000,True,False
5,2600,False,False
6,2800,False,False
7,3300,False,False
8,3600,False,False
9,2600,False,True


In [8]:
y = final.price
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [9]:
model = LinearRegression()
model.fit(X, y)

In [10]:
model.predict(X)

array([539709.7398409 , 590468.71640508, 615848.20468716, 666607.18125134,
       717366.15781551, 579723.71533005, 605103.20361213, 668551.92431735,
       706621.15674048, 565396.15136531, 603465.38378844, 628844.87207052,
       692293.59277574])

In [11]:
model.score(X, y)

0.9573929037221872

In [12]:
# 3400 sqr ft home in west windsor [0,0].
# It's mean west windsor is 1 (monroe township, robinsville, west windsor) and here we dropped "west windsor"
model.predict([[3400, 0, 0]])



array([681241.66845839])

In [13]:
# 2800 sqr ft home in robbinsville
model.predict([[2800, 0, 1]])



array([590775.63964739])

Using **sklearn**

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dfle = df
dfle.town = le.fit_transform(dfle.town)
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [15]:
X = dfle.drop(['price'], axis='columns')
X

Unnamed: 0,town,area
0,0,2600
1,0,3000
2,0,3200
3,0,3600
4,0,4000
5,2,2600
6,2,2800
7,2,3300
8,2,3600
9,1,2600


In [16]:
y = dfle.price
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [17]:
dfle[['town', 'area']].values

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [18]:
dfle['price'].values

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000])

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [20]:
ct_ohe = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder='passthrough')
ct_ohe

In [21]:
X = ct_ohe.fit_transform(X)
X
# 0,               1,           2
# monroe township, robinsville, west windsor

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [22]:
X = X[:, 1:]
X
# 0,           1
# robinsville, west windsor

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [23]:
model.fit(X, y)

In [24]:
model.score(X, y)

0.9573929037221873

In [25]:
# 3400 sqr ft home in west windsor [0, 1, 3400].
# It's mean west windsor is 1 (monroe township, robinsville, west windsor) and here we dropped "west windsor"
model.predict([[0, 1, 3400]])

array([681241.6684584])

In [26]:
# 2800 sqr ft home in robbinsville [1, 0, 2800]
model.predict([[1, 0, 2800]])

array([590775.63964739])