In [2]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [3]:
def load_house_attributes(inputpath):
    cols = ['bedrooms', 'bathrooms', 'area', 'zipcode', 'price']
    df = pd.read_csv(inputpath, sep=' ', header=None, names=cols)
    zipcodes = df['zipcode'].value_counts().keys().tolist()
    counts = df['zipcode'].value_counts().tolist()
    zipped_list = zip(zipcodes, counts)
    for (zipcode, count) in zipped_list:
        if count < 25:
            idx = df[df['zipcode'] == zipcode].index
            df.drop(idx, inplace=True)
    return df


In [14]:
df = load_house_attributes('HousesInfo.txt')
df

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
30,5,3.0,2520,93446,789000
32,3,2.0,1802,93446,365000
39,3,3.0,2146,93446,455000
80,4,2.5,2464,91901,599000
81,2,2.0,1845,91901,529800
...,...,...,...,...,...
499,4,4.0,3000,93446,1495000
500,3,2.0,2330,93446,599900
501,3,2.5,1339,93446,344900
502,3,2.0,1472,93446,309995


In [15]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
train

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
351,2,2.0,1512,92276,98900
479,2,2.0,1190,93446,739900
498,3,2.5,1500,93446,319000
185,6,4.0,3492,94501,1495000
194,2,1.0,805,94501,625000
...,...,...,...,...,...
181,3,2.5,2185,94501,490000
228,5,5.0,3701,92880,579000
412,2,2.0,1344,92276,139000
490,3,2.0,2108,93446,439000


In [16]:
test

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
305,4,3.0,2570,92880,510000
110,4,3.0,3277,91901,669000
92,3,2.5,2836,91901,979000
464,3,3.0,2390,93446,769000
145,3,2.5,1655,92677,649900
...,...,...,...,...,...
439,3,2.0,1152,93510,425000
192,3,2.0,1490,94501,949000
204,2,1.0,1110,94501,649000
483,2,2.0,1088,93446,280000


In [17]:
# normalizing continues features
def preprocess_house_continues_attribute(train, test):
    continues = ['bedrooms', 'bathrooms', 'area']
    standarding = StandardScaler()
    trainContinues = standarding.fit_transform(train[continues])
    testContinues = standarding.fit_transform(test[continues])
    return trainContinues, testContinues

trainContinues,testContinues = preprocess_house_continues_attribute(train, test)


In [18]:
encoder = OneHotEncoder(sparse=False)
trainCategorical = encoder.fit_transform(np.array(train['zipcode']).reshape(-1, 1))
testCategorical = encoder.fit_transform(np.array(test['zipcode']).reshape(-1, 1))
print(testCategorical.shape)
print(trainCategorical.shape)
print(trainCategorical)

(73, 7)
(289, 7)
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [19]:
trainX = np.hstack([trainContinues, trainCategorical])
testX = np.hstack([testContinues, testCategorical])

In [20]:
maxprice = train['price'].max()
trainY = train['price']/maxprice
testY= test['price']/maxprice
print(train['price'])
print(test['price']) 

351      98900
479     739900
498     319000
185    1495000
194     625000
        ...   
181     490000
228     579000
412     139000
490     439000
212     599000
Name: price, Length: 289, dtype: int64
305    510000
110    669000
92     979000
464    769000
145    649900
        ...  
439    425000
192    949000
204    649000
483    280000
445    541000
Name: price, Length: 73, dtype: int64


In [42]:
# this is finding prameter by gradiant decent
model = SGDRegressor(tol=0.000001)
model.fit(trainX, trainY)
print(model.coef_)

[-0.00475205  0.01265794  0.04040491  0.01339453 -0.0337682   0.06434771
 -0.04517922  0.01486195  0.0122659   0.06346062]


In [43]:
predicted = model.predict(testX)
diff = predicted - testY
precentdiff = (diff/testY)*100
absprecentdiff = np.abs(precentdiff)
mean = np.mean(absprecentdiff)
std = np.std(absprecentdiff)
print('[INFO] mean: {:.2f}, std: {:.2f}'.format(mean, std))

[INFO] mean: 32.98, std: 40.28


In [44]:
mean_absolute_percentage_error(testY, predicted)

0.3298086164219486

In [45]:
# this is finding prameter by formula
model2 = LinearRegression()
model2.fit(trainX, trainY)
print(model2.coef_)

[-0.00325816  0.0138258   0.03955316 -0.00216541 -0.04708977  0.0634664
 -0.06610923 -0.0002863  -0.00295073  0.05513504]


In [47]:
predicted2 = model2.predict(testX)
diff2 = predicted2 - testY
precentdiff2 = (diff2/testY)*100
absprecentdiff2 = np.abs(precentdiff2)
mean2 = np.mean(absprecentdiff2)
std2 = np.std(absprecentdiff2)
print('[INFO] mean: {:.2f}, std: {:.2f}'.format(mean2, std2))

[INFO] mean: 33.32, std: 41.75


In [48]:
mean_absolute_percentage_error(testY, predicted2)

0.33319318782376506