In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [50]:
# Load the housing data from a CSV given by relative path.
# The preview below shows an 'Unnamed: 0' column; a later cell drops it.
df = pd.read_csv('SaratogaHouses.csv')

In [51]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating,fuel,sewer,waterfront,newConstruction,centralAir
0,1,132500,0.09,42,50000,906,35,2,1,1.0,5,electric,electric,septic,No,No,No
1,2,181115,0.92,0,22300,1953,51,3,0,2.5,6,hot water/steam,gas,septic,No,No,No
2,3,109000,0.19,133,7300,1944,51,4,1,1.0,8,hot water/steam,gas,public/commercial,No,No,No
3,4,155000,0.41,13,18700,1944,51,3,1,1.5,5,hot air,gas,septic,No,No,No
4,5,86060,0.11,0,15000,840,51,2,0,1.0,3,hot air,gas,public/commercial,No,Yes,Yes


In [52]:
pd.unique(df['heating'])

array(['electric', 'hot water/steam', 'hot air'], dtype=object)

In [53]:
pd.unique(df['newConstruction'])

array(['No', 'Yes'], dtype=object)

In [54]:
pd.unique(df['sewer'])

array(['septic', 'public/commercial', 'none'], dtype=object)

In [55]:
pd.unique(df['fuel'])

array(['electric', 'gas', 'oil'], dtype=object)

In [56]:
# Encode the three Yes/No columns as integers ('No' -> 0, 'Yes' -> 1, as the
# preview below shows).
# Each column gets its own fitted encoder, stored in `binary_encoders`, so the
# mapping learned for one column is not overwritten when the next one is fitted.
binary_encoders = {}
for column in ['newConstruction', 'centralAir', 'waterfront']:
    lab = LabelEncoder()
    df[column] = lab.fit_transform(df[column])
    binary_encoders[column] = lab
# `lab` stays bound to the last encoder fitted ('waterfront') because later
# cells call `lab.transform(...)`.

In [57]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating,fuel,sewer,waterfront,newConstruction,centralAir
0,1,132500,0.09,42,50000,906,35,2,1,1.0,5,electric,electric,septic,0,0,0
1,2,181115,0.92,0,22300,1953,51,3,0,2.5,6,hot water/steam,gas,septic,0,0,0
2,3,109000,0.19,133,7300,1944,51,4,1,1.0,8,hot water/steam,gas,public/commercial,0,0,0
3,4,155000,0.41,13,18700,1944,51,3,1,1.5,5,hot air,gas,septic,0,0,0
4,5,86060,0.11,0,15000,840,51,2,0,1.0,3,hot air,gas,public/commercial,0,1,1


In [58]:
# Pull each multi-category column out as a single-column DataFrame; the double
# brackets keep the selection 2-D for the one-hot encoders used next.
sewer, fuel, heating = (df[[name]] for name in ('sewer', 'fuel', 'heating'))

In [59]:
# NOTE(review): this cell contains only commented-out code; candidate for deletion.
# df[['price']]

In [60]:
# One-hot encode the sewer type.
# The encoder is fitted directly from the DataFrame column instead of from the
# `sewer` variable this cell overwrites, so re-running the cell cannot
# accidentally encode the already-encoded matrix.
# The result is a sparse matrix; the fitted encoder is kept so new samples can
# be transformed later.
sewer_onehot = OneHotEncoder()
sewer = sewer_onehot.fit_transform(df[['sewer']])

In [61]:
# One-hot encode the fuel type.
# Fitting from the DataFrame column (not from the `fuel` variable this cell
# overwrites) keeps the cell safe to re-run.
# The fitted encoder is kept so new samples can be transformed later.
fuel_onehot = OneHotEncoder()
fuel = fuel_onehot.fit_transform(df[['fuel']])

In [62]:
# One-hot encode the heating type.
# Fitting from the DataFrame column (not from the `heating` variable this cell
# overwrites) keeps the cell safe to re-run.
# The fitted encoder is kept so new samples can be transformed later.
heat_onehot = OneHotEncoder()
heating = heat_onehot.fit_transform(df[['heating']])

In [63]:
sewer

<1728x3 sparse matrix of type '<class 'numpy.float64'>'
	with 1728 stored elements in Compressed Sparse Row format>

In [64]:
# Convert the sparse one-hot matrix to a dense ndarray so it can be
# column-stacked with the numeric features later.
sewer = sewer.toarray()

In [65]:
# Convert the fuel and heating one-hot matrices from sparse to dense arrays.
fuel, heating = fuel.toarray(), heating.toarray()

In [66]:
sewer

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [19]:
# Remove the 'Unnamed: 0' column and the raw categorical columns (their
# one-hot encodings are held in `sewer`, `fuel` and `heating`).
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps the cell idempotent: re-running it once the columns are gone is a
# no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 0', 'heating', 'sewer', 'fuel'], errors='ignore')

In [20]:
df.head()

Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,waterfront,newConstruction,centralAir
0,132500,0.09,42,50000,906,35,2,1,1.0,5,0,0,0
1,181115,0.92,0,22300,1953,51,3,0,2.5,6,0,0,0
2,109000,0.19,133,7300,1944,51,4,1,1.0,8,0,0,0
3,155000,0.41,13,18700,1944,51,3,1,1.5,5,0,0,0
4,86060,0.11,0,15000,840,51,2,0,1.0,3,0,1,1


In [21]:
# Target is the 'price' column; features are every other remaining column.
# Selecting by name instead of by position means the feature matrix does not
# silently include the target (or lose a feature) if 'price' is ever not the
# first column.
X = df.drop(columns=['price']).values
y = df['price'].values

In [22]:
# Append the one-hot columns for sewer, fuel and heating to the right of the
# numeric feature matrix.
# NOTE(review): this rebinds X from itself, so running the cell twice appends
# the one-hot columns twice.
X = np.hstack([X, sewer, fuel, heating])

In [23]:
X.shape

(1728, 21)

In [24]:
X[0]

array([9.00e-02, 4.20e+01, 5.00e+04, 9.06e+02, 3.50e+01, 2.00e+00,
       1.00e+00, 1.00e+00, 5.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 1.00e+00, 1.00e+00, 0.00e+00, 0.00e+00,
       1.00e+00, 0.00e+00, 0.00e+00])

In [25]:
# Scale features and target to [0, 1] with separate scalers.
# Refitting one scaler on y would discard the feature ranges, making it
# impossible to scale a new sample the same way as the training data.
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)
# `minmax` remains the target scaler: a later cell calls
# minmax.inverse_transform(...) to map predictions back to price units.
minmax = MinMaxScaler()
y = minmax.fit_transform(y.reshape(-1, 1))
# NOTE(review): both scalers are fitted on the full data set before the
# train/test split, so test-set ranges leak into preprocessing; consider
# fitting on the training portion only.

In [26]:
X[0]

array([0.00737705, 0.18666667, 0.12075655, 0.06287944, 0.24193548,
       0.16666667, 0.25      , 0.22222222, 0.3       , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        ])

In [27]:
y[0]

array([0.16558442])

In [28]:
# Hold out 25% of the rows for evaluation.
# A fixed random_state makes the split, and every coefficient and metric
# derived from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [29]:
x_train.shape

(1296, 21)

In [30]:
x_test.shape

(432, 21)

In [31]:
# Fit a linear regression on the scaled training data. fit() returns the
# estimator itself, so it can be bound in the same statement; the bare name on
# the last line displays the fitted estimator as the cell output.
regression = LinearRegression().fit(x_train, y_train)
regression

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [32]:
regression.coef_

array([[ 0.13495482, -0.04205891,  0.49483185,  0.4036916 , -0.01854885,
        -0.06317964,  0.01791926,  0.15813433,  0.03354941,  0.1605478 ,
        -0.06678617,  0.0133743 , -0.00092716, -0.00118372,  0.00211088,
        -0.00548705,  0.0070974 , -0.00161035,  0.00278675,  0.00508262,
        -0.00786937]])

In [33]:
# Predict the (min-max scaled) target for the held-out rows.
y_pred = regression.predict(x_test)

In [34]:
# Mean squared error on the held-out rows. Both arguments are in min-max
# scaled units (y was scaled to [0, 1] earlier), not in original price units.
mean_squared_error(y_test, y_pred)

0.006484893572818613

In [35]:
# Coefficient of determination (R²) on the held-out rows.
r2_score(y_test, y_pred)

0.5894625670040989

In [36]:
df.head()

Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,waterfront,newConstruction,centralAir
0,132500,0.09,42,50000,906,35,2,1,1.0,5,0,0,0
1,181115,0.92,0,22300,1953,51,3,0,2.5,6,0,0,0
2,109000,0.19,133,7300,1944,51,4,1,1.0,8,0,0,0
3,155000,0.41,13,18700,1944,51,3,1,1.5,5,0,0,0
4,86060,0.11,0,15000,840,51,2,0,1.0,3,0,1,1


In [37]:
# Add a derived column (half of 'rooms') at column position 1.
# DataFrame.insert raises ValueError when the label already exists, so the
# guard keeps this cell safe to re-run.
if 'inserted' not in df.columns:
    df.insert(loc=1, column='inserted', value=df['rooms'] / 2)

In [38]:
df.head()

Unnamed: 0,price,inserted,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,waterfront,newConstruction,centralAir
0,132500,2.5,0.09,42,50000,906,35,2,1,1.0,5,0,0,0
1,181115,3.0,0.92,0,22300,1953,51,3,0,2.5,6,0,0,0
2,109000,4.0,0.19,133,7300,1944,51,4,1,1.0,8,0,0,0
3,155000,2.5,0.41,13,18700,1944,51,3,1,1.5,5,0,0,0
4,86060,1.5,0.11,0,15000,840,51,2,0,1.0,3,0,1,1


In [39]:
# NOTE(review): this cell contains only commented-out code; candidate for deletion.
# df = df.reindex(columns=['price','age'])

In [41]:
# Map the first five scaled predictions back to price units. This relies on
# `minmax` having been fitted on the target column most recently, not on X.
minmax.inverse_transform(y_pred)[:5]

array([[137109.89788079],
       [172494.30165819],
       [207012.0170404 ],
       [155391.15924935],
       [294042.279136  ]])

In [67]:
# Example raw categorical values that the following cells pass through the
# fitted label and one-hot encoders.
newCons, waterfront, centralAir = 'Yes', 'No', 'Yes'
sewer_test, fuel_test, heating_test = 'septic', 'gas', 'electric'

In [68]:
# `lab` holds the encoder most recently fitted, which was on 'waterfront'.
# It yields the right code for newConstruction only because both columns use
# the same 'No'/'Yes' labels.
lab.transform([newCons])

array([1], dtype=int64)

In [69]:
# 'waterfront' is the column `lab` was most recently fitted on.
lab.transform([waterfront])

array([0], dtype=int64)

In [70]:
# `lab` was most recently fitted on 'waterfront', not 'centralAir'.
# NOTE(review): this presumably works because centralAir uses the same
# 'No'/'Yes' labels — confirm.
lab.transform([centralAir])

array([1], dtype=int64)

In [72]:
# Encode one sample with the fitted sewer encoder. The nested list makes the
# input 2-D (one row, one column); .toarray() turns the result into a dense array.
sewer_onehot.transform([[sewer_test]]).toarray()

array([[0., 0., 1.]])

In [73]:
# Encode one sample with the fitted fuel encoder (2-D input: one row, one
# column); .toarray() turns the result into a dense array.
fuel_onehot.transform([[fuel_test]]).toarray()

array([[0., 1., 0.]])

In [74]:
# Encode one sample with the fitted heating encoder (2-D input: one row, one
# column); .toarray() turns the result into a dense array.
heat_onehot.transform([[heating_test]]).toarray()

array([[1., 0., 0.]])