In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import pickle

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [4]:
data.shape

(2016, 8)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2016 non-null   int64  
 1   baths           2016 non-null   float64
 2   size            2016 non-null   float64
 3   size_units      2016 non-null   object 
 4   lot_size        1669 non-null   float64
 5   lot_size_units  1669 non-null   object 
 6   zip_code        2016 non-null   int64  
 7   price           2016 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 126.1+ KB


In [6]:
for column in data.columns:
    print(data[column].value_counts())
    print("*"*20)

beds
3     645
2     560
4     398
1     256
5     123
6      22
9       5
7       3
8       2
15      1
14      1
Name: count, dtype: int64
********************
baths
2.0    627
1.0    493
2.5    282
3.0    198
3.5    179
1.5    137
4.0     37
4.5     21
5.0     16
5.5     13
6.0      5
7.0      4
8.5      1
0.5      1
9.0      1
6.5      1
Name: count, dtype: int64
********************
size
2080.0    12
1440.0    11
1460.0    11
1370.0    11
1670.0    11
          ..
1548.0     1
1174.0     1
1865.0     1
578.0      1
795.0      1
Name: count, Length: 879, dtype: int64
********************
size_units
sqft    2016
Name: count, dtype: int64
********************
lot_size
5000.0    61
4000.0    45
6000.0    38
1.0       26
4800.0    16
          ..
745.0      1
5043.0     1
2256.0     1
8540.0     1
4267.0     1
Name: count, Length: 959, dtype: int64
********************
lot_size_units
sqft    1449
acre     220
Name: count, dtype: int64
********************
zip_code
98115    170
98103   

In [7]:
data.isna().sum()

beds                0
baths               0
size                0
size_units          0
lot_size          347
lot_size_units    347
zip_code            0
price               0
dtype: int64

In [8]:
data.drop(columns=['lot_size', 'lot_size_units'],inplace=True)

In [9]:
data.describe()

Unnamed: 0,beds,baths,size,zip_code,price
count,2016.0,2016.0,2016.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,98123.638889,963625.2
std,1.255092,1.002023,920.132591,22.650819,944095.4
min,1.0,0.5,250.0,98101.0,159000.0
25%,2.0,1.5,1068.75,98108.0,601750.0
50%,3.0,2.0,1560.0,98117.0,800000.0
75%,4.0,2.5,2222.5,98126.0,1105250.0
max,15.0,9.0,11010.0,98199.0,25000000.0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   beds        2016 non-null   int64  
 1   baths       2016 non-null   float64
 2   size        2016 non-null   float64
 3   size_units  2016 non-null   object 
 4   zip_code    2016 non-null   int64  
 5   price       2016 non-null   float64
dtypes: float64(3), int64(2), object(1)
memory usage: 94.6+ KB


In [11]:
data['beds'].value_counts()

beds
3     645
2     560
4     398
1     256
5     123
6      22
9       5
7       3
8       2
15      1
14      1
Name: count, dtype: int64

In [12]:
data.head()

Unnamed: 0,beds,baths,size,size_units,zip_code,price
0,3,2.5,2590.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,98106,915000.0
2,4,3.0,2040.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,98102,950000.0


In [13]:
data['price_per_sqft'] = data['price'] * 100000 / data['size']

In [14]:
data['price_per_sqft']

0       3.069498e+07
1       4.084821e+07
2       4.656863e+07
3       5.131579e+07
4       9.117083e+07
            ...     
2011    6.642336e+07
2012    6.186727e+07
2013    5.373832e+07
2014    7.421384e+07
2015    3.853801e+07
Name: price_per_sqft, Length: 2016, dtype: float64

In [15]:
data.describe()

Unnamed: 0,beds,baths,size,zip_code,price,price_per_sqft
count,2016.0,2016.0,2016.0,2016.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,98123.638889,963625.2,59158510.0
std,1.255092,1.002023,920.132591,22.650819,944095.4,83279520.0
min,1.0,0.5,250.0,98101.0,159000.0,6796117.0
25%,2.0,1.5,1068.75,98108.0,601750.0,44522210.0
50%,3.0,2.0,1560.0,98117.0,800000.0,55297620.0
75%,4.0,2.5,2222.5,98126.0,1105250.0,65953890.0
max,15.0,9.0,11010.0,98199.0,25000000.0,3424658000.0


In [16]:
data.shape

(2016, 7)

In [17]:
data

Unnamed: 0,beds,baths,size,size_units,zip_code,price,price_per_sqft
0,3,2.5,2590.0,sqft,98144,795000.0,3.069498e+07
1,4,2.0,2240.0,sqft,98106,915000.0,4.084821e+07
2,4,3.0,2040.0,sqft,98107,950000.0,4.656863e+07
3,4,3.0,3800.0,sqft,98199,1950000.0,5.131579e+07
4,2,2.0,1042.0,sqft,98102,950000.0,9.117083e+07
...,...,...,...,...,...,...,...
2011,3,2.0,1370.0,sqft,98112,910000.0,6.642336e+07
2012,1,1.0,889.0,sqft,98121,550000.0,6.186727e+07
2013,4,2.0,2140.0,sqft,98199,1150000.0,5.373832e+07
2014,2,2.0,795.0,sqft,98103,590000.0,7.421384e+07


In [18]:
data.drop(columns=['size_units'], inplace=True)

In [19]:
data.drop(columns=['price_per_sqft'],inplace=True)

In [20]:
data.head()

Unnamed: 0,beds,baths,size,zip_code,price
0,3,2.5,2590.0,98144,795000.0
1,4,2.0,2240.0,98106,915000.0
2,4,3.0,2040.0,98107,950000.0
3,4,3.0,3800.0,98199,1950000.0
4,2,2.0,1042.0,98102,950000.0


In [21]:
data.to_csv('final_dataset.csv')

In [22]:
x = data.drop(columns=['price'])
y = data['price']

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [24]:
print(x_train.shape)
print(y_train.shape)

(1612, 4)
(1612,)


In [25]:
#Applying linear Regression
column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['beds']), remainder='passthrough')

In [26]:
scaler = StandardScaler()
lr = LinearRegression()

In [27]:
x_scaled = scaler.fit_transform(x)
lr.fit(x_scaled, y)

In [28]:
pipe = make_pipeline(column_trans, scaler, lr)

In [29]:
pipe.fit(x_train, y_train)



In [30]:
y_pred_lr = pipe.predict(x_test)

In [31]:
r2_score(y_test, y_pred_lr)

0.5743461632174616

In [32]:
#Using Lasso
lasso = Lasso()

In [33]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [34]:
pipe.fit(x_train, y_train)



In [35]:
y_pred_lasso = pipe.predict(x_test)
r2_score(y_test, y_pred_lasso)

0.5746817917322382

In [36]:
#Using Ridge
ridge = Ridge()

In [37]:
pipe = make_pipeline(column_trans, scaler, ridge)

In [38]:
pipe.fit(x_train, y_train)



In [39]:
y_pred_ridge = pipe.predict(x_test)
r2_score(y_test, y_pred_lasso)

0.5746817917322382

In [40]:
print("No Regularization: ", r2_score(y_test, y_pred_lr))
print("Lasso: ", r2_score(y_test, y_pred_lasso))
print("Ridge: ", r2_score(y_test, y_pred_ridge))

No Regularization:  0.5743461632174616
Lasso:  0.5746817917322382
Ridge:  0.5746884627878555


In [41]:
pickle.dump(pipe, open('RidgeModel.pkl','wb'))