# Load the data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

dirpath = 'HousePredictionKaggle/'
import os

# Walk the dataset directory tree and print the path of every file in it.
for folder, _, files_in_folder in os.walk(dirpath):
    for file_name in files_in_folder:
        file_path = os.path.join(folder, file_name)
        print(file_path)


HousePredictionKaggle/output.csv
HousePredictionKaggle/data.csv
HousePredictionKaggle/data.dat


In [2]:
## import libraries

import pandas as pd
import numpy as np
import pickle5 as pickle
import datetime as dt
from time import time
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the raw listings. `dirpath` already ends with '/', so concatenating
# "/data.csv" produced a doubled separator; os.path.join builds a clean path.
df = pd.read_csv(os.path.join(dirpath, "data.csv"))
# Display the first record as a quick look at the available columns.
df.iloc[0]

date              2014-05-02 00:00:00
price                          313000
bedrooms                            3
bathrooms                         1.5
sqft_living                      1340
sqft_lot                         7912
floors                            1.5
waterfront                          0
view                                0
condition                           3
sqft_above                       1340
sqft_basement                       0
yr_built                         1955
yr_renovated                     2005
street           18810 Densmore Ave N
city                        Shoreline
statezip                     WA 98133
country                           USA
Name: 0, dtype: object

In [4]:
def preprocessing(df, encoders=None):
    """Turn the raw listings frame into an all-numeric feature frame.

    Steps, in order:
      1. Parse ``date`` and replace it with ``day``/``month``/``year`` columns.
      2. Cast ``price``, ``bedrooms``, ``bathrooms`` and ``floors`` to int64.
      3. Derive the binary features ``basement``, ``situation`` and
         ``renewal_status``.
      4. Label-encode every text (object/string dtype) column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns referenced above.
    encoders : dict, optional
        Output argument. When a dict is passed, the fitted ``LabelEncoder``
        of every encoded column is stored in it under the column name, so the
        exact same mapping can be persisted and reused at prediction time.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.
    """
    # Work on a copy: the column assignments below would otherwise be written
    # into the caller's frame before `drop` creates a new object.
    df = df.copy()

    # extract date parts
    df["date"] = pd.to_datetime(df["date"])
    df["day"] = df["date"].dt.day
    df["month"] = df["date"].dt.month
    df["year"] = df["date"].dt.year
    df = df.drop(columns=["date"])

    # set types
    # NOTE(review): the int64 cast truncates fractional values (a bathrooms or
    # floors value of 1.5 becomes 1) -- confirm this is what the models expect.
    df = df.astype({
        "price": "int64",
        "bedrooms": "int64",
        "bathrooms": "int64",
        "floors": "int64",
    })

    # define binary features
    df["basement"] = np.where(df["sqft_basement"] == 0, "No", "Yes")
    df["situation"] = np.where(df["yr_built"] <= 1990, "Former", "New")
    df["renewal_status"] = np.where(df["yr_renovated"] == 0, "Not_renewed", "Renewed")

    # convert categorical to numerical
    # Text columns are detected through the dtype helpers so that both the
    # classic object dtype and pandas' dedicated string dtype are covered.
    cat_variable = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]
    for col in cat_variable:
        # One encoder per column, so each fitted mapping stays available
        # instead of being overwritten by the next column's fit.
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        if encoders is not None:
            encoders[col] = encoder

    return df

In [5]:
# Replace the raw frame with its numeric version and summarise it.
# `df` is rebound here, so re-running this cell needs the CSV reloaded first:
# preprocessing reads the "date" column and then drops it.
df = df.pipe(preprocessing)
df.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,...,street,city,statezip,country,day,month,year,basement,situation,renewal_status
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,...,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,551963.0,3.40087,1.788913,2139.346957,14852.52,1.45913,0.007174,0.240652,3.451739,1827.265435,...,2265.38587,25.674348,38.743696,0.0,14.791304,5.757609,2014.0,0.403261,0.311957,0.405435
std,563834.7,0.908848,0.752185,963.206916,35884.44,0.552194,0.084404,0.778405,0.67723,862.168977,...,1307.591795,11.982721,20.919517,0.0,8.677569,0.683851,0.0,0.490606,0.463343,0.491029
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,...,0.0,0.0,0.0,0.0,1.0,5.0,2014.0,0.0,0.0,0.0
25%,322875.0,3.0,1.0,1460.0,5000.75,1.0,0.0,0.0,3.0,1190.0,...,1132.75,17.0,21.0,0.0,7.0,5.0,2014.0,0.0,0.0,0.0
50%,460943.0,3.0,2.0,1980.0,7683.0,1.0,0.0,0.0,3.0,1590.0,...,2263.5,32.0,41.0,0.0,14.0,6.0,2014.0,0.0,0.0,0.0
75%,654962.5,4.0,2.0,2620.0,11001.25,2.0,0.0,0.0,4.0,2300.0,...,3400.25,35.0,56.0,0.0,23.0,6.0,2014.0,1.0,1.0,1.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.0,1.0,4.0,5.0,9410.0,...,4524.0,43.0,76.0,0.0,31.0,7.0,2014.0,1.0,1.0,1.0


In [6]:
# Feature columns used as model input, and the target to predict.
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above',
    'sqft_basement', 'floors', 'waterfront', 'view', 'yr_built',
    'yr_renovated', 'city', 'country', 'renewal_status',
]
X = df[feature_columns]
y = df["price"]

from sklearn.model_selection import train_test_split
# Fix the seed so the split -- and every score computed from it -- is
# reproducible across kernel restarts.
# NOTE(review): the models scored below are loaded pre-trained; if they were
# fitted on rows that land in this test split the scores are optimistic --
# confirm how the training split was made.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Predictions

In [7]:
# Load every pickled model from disk and print its score on the test split.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files that come from a trusted source.
model_names = ['lr', 'knn', 'svc', 'tree', 'forest', 'gb', 'xgb', 'lgbm']
models = {}
for model_name in model_names:
    # The context manager closes the file handle, which a bare open() leaks.
    with open('models/{}.sav'.format(model_name), 'rb') as model_file:
        models[model_name] = pickle.load(model_file)
    result = models[model_name].score(X_test, y_test)
    print(result)

# Keep the individual variable names that later cells rely on.
lr, knn, svc, tree, forest, gb, xgb, lgbm = (models[name] for name in model_names)

0.1924171641275081
0.23198027270766286
-0.4554729964187618
0.10431170008244095
0.3883895436383411
0.3529472042988113
0.38544590948154434
0.4184279627684464


In [8]:
# Compare the first ten predictions of the `svc` model with the true prices.
preds = svc.predict(X_test)
# Re-number the target from 0 so it can be indexed by position below.
y_test = y_test.reset_index(drop=True)
n_preview = 10
for row in range(n_preview):
    predicted, actual = preds[row], y_test[row]
    print(predicted, actual)


281615.43125330005 346000
401975.14987872634 233166
89576.03849969897 670000
513635.16194008384 640000
546226.050282226 620000
383460.31634889636 378000
311036.69453893695 220600
499593.4107550243 1325000
536619.477519379 370000
449211.6986962417 885000


In [None]:
def preprocess_type(input_json, encoders=None):
    """Build a numeric feature frame from a JSON-style list of records.

    Parameters
    ----------
    input_json : list of dict
        One dict per house, mapping feature name to value.
    encoders : dict, optional
        Mapping of column name to an already fitted encoder (any object with
        a ``transform`` method, e.g. the ``LabelEncoder`` used at training
        time). Text columns found in this mapping are encoded with it, so a
        category receives the same integer code it had during training.

    Returns
    -------
    pandas.DataFrame
        The records as a frame in which every text column has been replaced
        by integer codes; numeric columns are left untouched.
    """
    input_df = pd.DataFrame(input_json)

    # convert categorical to numerical
    # Text columns are detected through the dtype helpers so that both the
    # classic object dtype and pandas' dedicated string dtype are covered.
    cat_variable = [
        col for col, dtype in input_df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]
    for col in cat_variable:
        values = input_df[col].astype(str)
        if encoders is not None and col in encoders:
            input_df[col] = encoders[col].transform(values)
        else:
            # Fallback kept for backward compatibility: an encoder fitted on
            # the request itself numbers categories by their sorted order
            # *within this payload*, so a lone 'Shoreline' is encoded as 0.
            # NOTE(review): this will not match the codes seen in training --
            # pass the training encoders whenever they are available.
            input_df[col] = LabelEncoder().fit_transform(values)
    return input_df


In [None]:
# Example request body: a list of records, here the same house twice.
# The prediction below is made with the linear-regression model (`lr`).
sample_house = {
    'bedrooms': 3,
    'bathrooms': 1.5,
    'sqft_living': 1340,
    'sqft_lot': 7912,
    'sqft_above': 1340,
    'sqft_basement': 0,
    'floors': 1.5,
    'waterfront': 0,
    'view': 0,
    'yr_built': 1955,
    'yr_renovated': 2005,
    'city': 'Shoreline',
    'country': 'USA',
    'renewal_status': 1,
}
input_json = [dict(sample_house), dict(sample_house)]

# Encode the text fields, then ask the model for one price per record.
input_df = preprocess_type(input_json)
lr.predict(input_df)

array([318727.94131982, 318727.94131982])

# API code

In [11]:
import flask
import string
from flask import request, jsonify, Flask
import json
from json_tricks import dumps
import sys

#sys.path.insert(1, 'accept-except')
#from preprocessing_fixed_size import X_to_Xfixed, preprocessing, get_window_size

app = Flask(__name__)


def _load_model(path):
    """Unpickle and return the model stored at ``path``.

    The file is opened in a context manager so its handle is closed again.
    pickle.load can execute arbitrary code from the file, so only trusted
    model files belong here.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


#load the models from disk
lr = _load_model('models/lr.sav')
knn = _load_model('models/knn.sav')
svc = _load_model('models/svc.sav')
tree = _load_model('models/tree.sav')
forest = _load_model('models/forest.sav')
gb = _load_model('models/gb.sav')
xgb = _load_model('models/xgb.sav')
lgbm = _load_model('models/lgbm.sav')

    
#____________________________________________________________________#


@app.route('/', methods=['POST'])
def predict():
    """Predict prices for the houses posted as a JSON list of records.

    The request body is parsed as JSON, converted to a feature frame with
    ``preprocess_type`` and passed to the ``lr`` model.

    Returns
    -------
    The predictions serialised with ``json_tricks.dumps`` on success, or a
    JSON error object with HTTP status 400 when the body is missing, is not
    valid JSON, or cannot be preprocessed.
    """
    try:
        # Named `payload` so the builtin `input` is not shadowed.
        payload = request.get_json()
        print('Input', payload)
        if not payload:
            # A missing or empty body would otherwise reach the model as an
            # empty frame and fail there.
            raise ValueError('request body is empty or not JSON')
        input_df = preprocess_type(payload)
        print('Input_df', input_df)
    except Exception as exc:
        # Log the actual cause instead of discarding it, and report the
        # failure with a client-error status rather than a 200 response.
        print("Input exception: The received input is not in a valid json format!", repr(exc))
        return jsonify({'error': 'Invalid input: expected a JSON list of house records.'}), 400

    preds = lr.predict(input_df)
    print('Preds: ', preds)
    # json_tricks serialises the numpy array returned by the model.
    return dumps(preds)

if __name__ == '__main__':
    # Debug mode is left off; listen on every interface, port 5320, and
    # handle requests in threads.
    server_options = dict(host='0.0.0.0', port=5320, threaded=True)
    app.run(**server_options)




 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on all addresses.
 * Running on http://172.17.0.2:5320/ (Press CTRL+C to quit)
127.0.0.1 - - [07/Mar/2022 17:43:45] "POST / HTTP/1.1" 200 -


Input [{'bedrooms': 3, 'bathrooms': 1.5, 'sqft_living': 1340, 'sqft_lot': 7912, 'sqft_above': 1340, 'sqft_basement': 0, 'floors': 1.5, 'waterfront': 0, 'view': 0, 'yr_built': 1955, 'yr_renovated': 2005, 'city': 'Shoreline', 'country': 'USA', 'renewal_status': 1}, {'bedrooms': 3, 'bathrooms': 1.5, 'sqft_living': 1340, 'sqft_lot': 7912, 'sqft_above': 1340, 'sqft_basement': 0, 'floors': 1.5, 'waterfront': 0, 'view': 0, 'yr_built': 1955, 'yr_renovated': 2005, 'city': 'Shoreline', 'country': 'USA', 'renewal_status': 1}]
Input_df    bedrooms  bathrooms  sqft_living  sqft_lot  sqft_above  sqft_basement  \
0         3        1.5         1340      7912        1340              0   
1         3        1.5         1340      7912        1340              0   

   floors  waterfront  view  yr_built  yr_renovated  city  country  \
0     1.5           0     0      1955          2005     0        0   
1     1.5           0     0      1955          2005     0        0   

   renewal_status  
0         