# Load the data

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

dirpath = 'Dataset/'
import os
for dirname, _, filenames in os.walk(dirpath):
    for filename in filenames:
        print(os.path.join(dirname, filename))


Dataset/offers_cluj-napoca_2022-03-25.csv


In [3]:
## import libraries

import pandas as pd
import numpy as np
import pickle5 as pickle
import datetime as dt
from time import time
from sklearn.preprocessing import LabelEncoder

In [8]:
df = pd.read_csv(dirpath+"/offers_cluj-napoca_2022-03-25.csv")  # Reading dataset
df.iloc[0]

bathrooms          1
floor            0.5
rooms              2
surface           82
state            nou
zone         Marasti
ad_price         750
Name: 0, dtype: object

In [9]:
def preprocessing(df):
 
    '''
    #extract data
    df["date"]= pd.to_datetime(df["date"])
    df["day"] = df["date"].dt.day
    df["month"] = df["date"].dt.month
    df["year"] = df["date"].dt.year
    df = df.drop(columns=["date"])

    #set types
    df["price"]= df["price"].astype("int64")
    df["bedrooms"]= df["bedrooms"].astype("int64")
    df["bathrooms"]= df["bathrooms"].astype("int64")
    df["floors"]= df["floors"].astype("int64")

    #define binary features
    df["basement"] = ["No" if i == 0 else "Yes" for i in df["sqft_basement"]]
    df["situation"] = ["Former" if i <= 1990 else "New" for i in df["yr_built"]]
    df["renewal_status"] = ["Not_renewed" if i == 0 else "Renewed" for i in df["yr_renovated"]]
    '''

    #convert categorical to numerical
    lb = LabelEncoder()
    cat_variable = df.dtypes==object
    cat_variable = df.columns[cat_variable].tolist()
    df[cat_variable] = df[cat_variable].apply(lambda col: lb.fit_transform(col.astype(str)))

    return df   

df = preprocessing(df)
df

Unnamed: 0,bathrooms,floor,rooms,surface,state,zone,ad_price
0,1.0,0.500,2.0,82.0,1,89,750.0
1,1.0,0.250,2.0,65.0,3,100,350.0
2,1.0,0.000,2.0,72.0,1,88,372.0
3,1.0,0.333,2.0,65.0,2,110,470.0
4,1.0,2.000,2.0,45.0,1,58,290.0
...,...,...,...,...,...,...,...
848,1.0,0.250,1.0,45.0,2,127,360.0
849,2.0,0.000,3.0,85.0,2,81,700.0
850,1.0,0.600,2.0,60.0,1,135,480.0
851,1.0,1.000,2.0,57.0,2,85,300.0


In [11]:
df.describe()

Unnamed: 0,bathrooms,floor,rooms,surface,state,zone,ad_price
count,853.0,853.0,853.0,853.0,853.0,853.0,853.0
mean,1.215709,0.627067,2.243845,68.12544,2.154748,73.728019,500.113716
std,0.457475,0.863865,0.818851,31.489129,0.859614,30.460374,219.71611
min,1.0,-1.0,1.0,11.0,0.0,0.0,150.0
25%,1.0,0.0,2.0,54.0,1.0,52.0,399.0
50%,1.0,0.5,2.0,63.0,2.0,73.0,450.0
75%,1.0,0.875,3.0,76.0,3.0,89.0,550.0
max,4.0,9.0,10.0,500.0,3.0,135.0,3000.0


In [12]:
X = df.drop(columns=['ad_price'])
y = df["ad_price"]

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2)

# Predictions

In [13]:
#load the model from disk
lr = pickle.load(open('models/lr.sav', 'rb'))
result = lr.score(X_test, y_test)
print(result)

knn = pickle.load(open('models/knn.sav', 'rb'))
result = knn.score(X_test, y_test)
print(result)

svc = pickle.load(open('models/svc.sav', 'rb'))
result = svc.score(X_test, y_test)
print(result)

tree = pickle.load(open('models/tree.sav', 'rb'))
result = tree.score(X_test, y_test)
print(result)

forest = pickle.load(open('models/forest.sav', 'rb'))
result = forest.score(X_test, y_test)
print(result)

gb = pickle.load(open('models/gb.sav', 'rb'))
result = gb.score(X_test, y_test)
print(result)

xgb = pickle.load(open('models/xgb.sav', 'rb'))
result = xgb.score(X_test, y_test)
print(result)

lgbm = pickle.load(open('models/lgbm.sav', 'rb'))
result = lgbm.score(X_test, y_test)
print(result)

0.5939432953332147
0.6461721173852648
0.574767631825648
0.9277489710995996
0.9032929133919311
0.7212535239318143
0.9094139860580088
0.7614632924579746


In [14]:
preds = svc.predict(X_test)
y_test = y_test.reset_index(drop=True)
for i in range(10):
    print(preds[i], y_test[i])


1423.3608121601942 1350.0
465.5943669174872 350.0
389.7825243808425 370.0
580.9174527129476 450.0
398.97584515819887 350.0
895.2837390922728 1200.0
389.1472014478264 500.0
464.86159910290786 480.0
460.34898071923686 650.0
479.1005594349686 480.0


In [22]:
#'model_name' = 'lr'
input_json = [{ 
    'bathrooms': 2, 
    'floor': 0.5, 
    'rooms': 3, 
    'surface': 78, 
    'state': 'nou', 
    'zone': 'Buna Ziua'
},
{
    'bathrooms': 2, 
    'floor': 0.5, 
    'rooms': 3, 
    'surface': 78, 
    'state': 'nou', 
    'zone': 'Marasti'
}]


#input_df = preprocess_type(input_json)
input_df = preprocessing(pd.DataFrame(input_json))
gb.predict(pd.DataFrame(input_df))

array([550.35596528, 550.35596528])

# API code

In [18]:
import flask
import string
from flask import request, jsonify, Flask
import json
from json_tricks import dumps
import sys

#sys.path.insert(1, 'accept-except')
#from preprocessing_fixed_size import X_to_Xfixed, preprocessing, get_window_size

app = Flask(__name__)

#load the model from disk
lr = pickle.load(open('models/lr.sav', 'rb'))
knn = pickle.load(open('models/knn.sav', 'rb'))
svc = pickle.load(open('models/svc.sav', 'rb'))
tree = pickle.load(open('models/tree.sav', 'rb'))
forest = pickle.load(open('models/forest.sav', 'rb'))
gb = pickle.load(open('models/gb.sav', 'rb'))
xgb = pickle.load(open('models/xgb.sav', 'rb'))
lgbm = pickle.load(open('models/lgbm.sav', 'rb'))

    
#____________________________________________________________________#


@app.route('/', methods=['POST'])
def predict():
    try:
        input = request.get_json()
        print('Input', input)
        input_df =  preprocessing(pd.DataFrame(input_json))
        print('Input_df', input_df)
    except Exception:
        print("Input exception: The received input is not in a valid json format!")
        return jsonify({})
   
  
    preds = gb.predict(input_df)
    print('Preds: ', preds)
    return dumps(preds) #jsonify(pred)

if __name__ == '__main__':
    #app.debug = True
    app.run(host='0.0.0.0', port=5320, threaded=True)




 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on all addresses.
 * Running on http://172.17.0.3:5320/ (Press CTRL+C to quit)
