In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

In [2]:
df = pd.read_csv("C:/Users/abhui/Machine Learning/Data/homeprices.csv")
df.head()

Unnamed: 0,town,area,bedrooms,age,price
0,monroe,2600,3.0,20,550000
1,monroe,3000,4.0,15,565000
2,west windsor,3200,,18,610000
3,west windsor,3600,3.0,30,595000
4,robinsville,4000,5.0,8,760000


Data Preprocessing: Fill NA values with median value of a column

In [3]:
df.bedrooms = df.bedrooms.fillna(df.bedrooms.median())
df

Unnamed: 0,town,area,bedrooms,age,price
0,monroe,2600,3.0,20,550000
1,monroe,3000,4.0,15,565000
2,west windsor,3200,4.0,18,610000
3,west windsor,3600,3.0,30,595000
4,robinsville,4000,5.0,8,760000
5,robinsville,4100,6.0,8,810000


In [4]:
reg_model = linear_model.LinearRegression()
reg_model.fit(df.drop(['price', 'town'], axis='columns'),df.price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [5]:
reg_model.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [6]:
reg_model.intercept_

221323.00186540443

Find price of home with 3000 sqr ft area, 3 bedrooms, 40 year old

In [7]:
112.06244194*3000 + 23388.88007794*3 + -3231.71790863*40 + 221323.00186540443

498408.25157402444

In [8]:
reg_model.predict([[3000, 3, 40]])

array([498408.25158031])

Using sklearn OneHotEncoder
First step is to use label encoder to convert town names into numbers

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [10]:
df_encoded = df
df_encoded.town = le.fit_transform(df_encoded.town)
df_encoded

Unnamed: 0,town,area,bedrooms,age,price
0,0,2600,3.0,20,550000
1,0,3000,4.0,15,565000
2,2,3200,4.0,18,610000
3,2,3600,3.0,30,595000
4,1,4000,5.0,8,760000
5,1,4100,6.0,8,810000


In [11]:
X = df_encoded[['town','area','bedrooms','age']].values
X

array([[0.0e+00, 2.6e+03, 3.0e+00, 2.0e+01],
       [0.0e+00, 3.0e+03, 4.0e+00, 1.5e+01],
       [2.0e+00, 3.2e+03, 4.0e+00, 1.8e+01],
       [2.0e+00, 3.6e+03, 3.0e+00, 3.0e+01],
       [1.0e+00, 4.0e+03, 5.0e+00, 8.0e+00],
       [1.0e+00, 4.1e+03, 6.0e+00, 8.0e+00]])

In [12]:
y = df_encoded.price.values
y

array([550000, 565000, 610000, 595000, 760000, 810000], dtype=int64)

Now use one hot encoder to create dummy variables for each of the town

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder = 'passthrough')

In [14]:
X = ct.fit_transform(X)
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03, 3.0e+00, 2.0e+01],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03, 4.0e+00, 1.5e+01],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.2e+03, 4.0e+00, 1.8e+01],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03, 3.0e+00, 3.0e+01],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.0e+03, 5.0e+00, 8.0e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.1e+03, 6.0e+00, 8.0e+00]])

In [15]:
X = X[:,1:]

In [16]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:
model.predict([[0,1,3400, 6, 8]]) # 3400 sqr ft home in west windsor

array([664098.36065574])