## Importing the Libraries


First we will import again all the libraries we need for our future work.


In [2]:
import pandas as pd
import numpy as np



from sklearn.model_selection import train_test_split


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
kc_data = pd.read_csv("data/King_County_House_prices_dataset.csv")

In [4]:
kc_data.drop(15856, axis=0, inplace=True)

In [5]:
kc_data.drop(kc_data[kc_data['bedrooms'] == 33].index, inplace = True)

In [6]:
kc_data['sqft_basement'] = kc_data['sqft_basement'].replace('?', np.NaN)
kc_data['sqft_basement'] = kc_data['sqft_basement'].astype(float)

In [7]:
kc_data.eval('sqft_basement = sqft_living - sqft_above', inplace=True)

In [8]:
kc_data.fillna({'view':0}, inplace=True)

In [9]:
kc_data.fillna({'waterfront':0}, inplace=True)

In [10]:
last_known_change = []

for idx, yr_re in kc_data.yr_renovated.items():
    if str(yr_re) == 'nan' or yr_re == 0.0:
        last_known_change.append(kc_data.yr_built[idx])
    else:
        last_known_change.append(int(yr_re))

In [11]:
kc_data['last_known_change'] = last_known_change

In [12]:
kc_data.drop("yr_renovated", axis=1, inplace=True)
kc_data.drop("yr_built", axis=1, inplace=True)

In [13]:
kc_data['sqft_price'] = (kc_data.price/(kc_data.sqft_living + kc_data.sqft_lot)).round(2)

In [14]:
kc_data['delta_lat'] = np.absolute(47.62774- kc_data['lat'])
kc_data['delta_long'] = np.absolute(-122.24194-kc_data['long'])
kc_data['center_distance']= ((kc_data['delta_long']* np.cos(np.radians(47.6219)))**2 
                                   + kc_data['delta_lat']**2)**(1/2)*2*np.pi*6378/360

In [15]:
def dist(long, lat, ref_long, ref_lat):
    '''dist computes the distance in km to a reference location. Input: long and lat of 
    the location of interest and ref_long and ref_lat as the long and lat of the reference location'''
    delta_long = long - ref_long
    delta_lat = lat - ref_lat
    delta_long_corr = delta_long * np.cos(np.radians(ref_lat))
    return ((delta_long_corr)**2 +(delta_lat)**2)**(1/2)*2*np.pi*6378/360

In [16]:
water_list= kc_data.query('waterfront == 1')


In [17]:
water_distance = []
for idx, lat in kc_data.lat.items():
    ref_list = []
    for x,y in zip(list(water_list.long), list(water_list.lat)):
        ref_list.append(dist(kc_data.long[idx], kc_data.lat[idx],x,y).min())
    water_distance.append(min(ref_list))

In [18]:
kc_data['water_distance'] = water_distance

## Prepare columns and datatypes for Pydantic model

In [19]:
kc_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'last_known_change', 'sqft_price',
       'delta_lat', 'delta_long', 'center_distance', 'water_distance'],
      dtype='object')

In [20]:
kc_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21596 entries, 0 to 21596
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 21596 non-null  int64  
 1   date               21596 non-null  object 
 2   price              21596 non-null  float64
 3   bedrooms           21596 non-null  int64  
 4   bathrooms          21596 non-null  float64
 5   sqft_living        21596 non-null  int64  
 6   sqft_lot           21596 non-null  int64  
 7   floors             21596 non-null  float64
 8   waterfront         21596 non-null  float64
 9   view               21596 non-null  float64
 10  condition          21596 non-null  int64  
 11  grade              21596 non-null  int64  
 12  sqft_above         21596 non-null  int64  
 13  sqft_basement      21596 non-null  float64
 14  zipcode            21596 non-null  int64  
 15  lat                21596 non-null  float64
 16  long               21596 no

In [21]:
seed = 18
houses = kc_data.sample(4, random_state=seed).to_dict('index')
for house in houses.values():
    print(house)

{'id': 4027701265, 'date': '5/1/2015', 'price': 480000.0, 'bedrooms': 3, 'bathrooms': 1.75, 'sqft_living': 2920, 'sqft_lot': 21375, 'floors': 1.0, 'waterfront': 0.0, 'view': 0.0, 'condition': 3, 'grade': 8, 'sqft_above': 1850, 'sqft_basement': 1070.0, 'zipcode': 98028, 'lat': 47.7666, 'long': -122.265, 'sqft_living15': 1540, 'sqft_lot15': 8482, 'last_known_change': 1961, 'sqft_price': 19.76, 'delta_lat': 0.138859999999994, 'delta_long': 0.02306000000000097, 'center_distance': 15.55402307333386, 'water_distance': 1.2521118454744993}
{'id': 322059049, 'date': '10/3/2014', 'price': 295000.0, 'bedrooms': 2, 'bathrooms': 1.0, 'sqft_living': 820, 'sqft_lot': 288367, 'floors': 1.0, 'waterfront': 0.0, 'view': 0.0, 'condition': 3, 'grade': 6, 'sqft_above': 820, 'sqft_basement': 0.0, 'zipcode': 98042, 'lat': 47.4196, 'long': -122.165, 'sqft_living15': 1580, 'sqft_lot15': 8154, 'last_known_change': 1930, 'sqft_price': 1.02, 'delta_lat': 0.2081400000000002, 'delta_long': 0.07693999999999335, 'cent

In [22]:
seed = 18
houses = kc_data.sample(4, random_state=seed).to_dict('index')
for house in houses.values():
    print(f'curl -X POST http://localhost:8100/houses -H "Content-Type: application/json" -d "{house}"')

curl -X POST http://localhost:8100/houses -H "Content-Type: application/json" -d "{'id': 4027701265, 'date': '5/1/2015', 'price': 480000.0, 'bedrooms': 3, 'bathrooms': 1.75, 'sqft_living': 2920, 'sqft_lot': 21375, 'floors': 1.0, 'waterfront': 0.0, 'view': 0.0, 'condition': 3, 'grade': 8, 'sqft_above': 1850, 'sqft_basement': 1070.0, 'zipcode': 98028, 'lat': 47.7666, 'long': -122.265, 'sqft_living15': 1540, 'sqft_lot15': 8482, 'last_known_change': 1961, 'sqft_price': 19.76, 'delta_lat': 0.138859999999994, 'delta_long': 0.02306000000000097, 'center_distance': 15.55402307333386, 'water_distance': 1.2521118454744993}"
curl -X POST http://localhost:8100/houses -H "Content-Type: application/json" -d "{'id': 322059049, 'date': '10/3/2014', 'price': 295000.0, 'bedrooms': 2, 'bathrooms': 1.0, 'sqft_living': 820, 'sqft_lot': 288367, 'floors': 1.0, 'waterfront': 0.0, 'view': 0.0, 'condition': 3, 'grade': 6, 'sqft_above': 820, 'sqft_basement': 0.0, 'zipcode': 98042, 'lat': 47.4196, 'long': -122.165

In [23]:
list_5features = ['id', 'bedrooms', 'sqft_living', 'center_distance', 'price']
house_list = []
for house in houses.values():
    houses_5features = {key: house[key] for key in list_5features}
    house_list.append(houses_5features)
    print(houses_5features)

{'id': 4027701265, 'bedrooms': 3, 'sqft_living': 2920, 'center_distance': 15.55402307333386, 'price': 480000.0}
{'id': 322059049, 'bedrooms': 2, 'sqft_living': 820, 'center_distance': 23.877875022947446, 'price': 295000.0}
{'id': 6840701160, 'bedrooms': 5, 'sqft_living': 2140, 'center_distance': 5.007902993942828, 'price': 680000.0}
{'id': 1026069061, 'bedrooms': 4, 'sqft_living': 3600, 'center_distance': 22.31799370802972, 'price': 682000.0}


In [29]:
import json
list_5features = ['id', 'bedrooms', 'sqft_living', 'center_distance', 'price']
house_list = []
for house in houses.values():
    houses_5features = {key: house[key] for key in list_5features}
    house_json = json.dumps(houses_5features)
    print(f"curl -X POST http://localhost:8100/houses -H \"Content-Type: application/json\" -d '{house_json}'")

curl -X POST http://localhost:8100/houses -H "Content-Type: application/json" -d '{"id": 4027701265, "bedrooms": 3, "sqft_living": 2920, "center_distance": 15.55402307333386, "price": 480000.0}'
curl -X POST http://localhost:8100/houses -H "Content-Type: application/json" -d '{"id": 322059049, "bedrooms": 2, "sqft_living": 820, "center_distance": 23.877875022947446, "price": 295000.0}'
curl -X POST http://localhost:8100/houses -H "Content-Type: application/json" -d '{"id": 6840701160, "bedrooms": 5, "sqft_living": 2140, "center_distance": 5.007902993942828, "price": 680000.0}'
curl -X POST http://localhost:8100/houses -H "Content-Type: application/json" -d '{"id": 1026069061, "bedrooms": 4, "sqft_living": 3600, "center_distance": 22.31799370802972, "price": 682000.0}'


In [25]:


house1_json = json.dumps(house_list[3], indent=4)
print(house1_json)


IndexError: list index out of range

final columns are converted to pydantic model [here](https://jsontopydantic.com/)

In [None]:
from __future__ import annotations

from pydantic import BaseModel


class Model(BaseModel):
    id: int
    bedrooms: int
    sqft_living: int
    center_distance: float
    price: float


In [None]:
# use of pydantic model for data validation of sampled houses
class DataValidation(BaseModel):
    id: int
    bedrooms: int
    sqft_living: int
    center_distance: float
    price: float

for house in house_list:
    DataValidation(**house)

columns dropped before modelling

In [None]:
drop_lst = ['price', 'sqft_price', 'date', 'delta_lat', 'delta_long',]

In [None]:
all_features = [x for x in kc_data.columns if x not in drop_lst]

In [None]:
X = kc_data[all_features]

In [None]:
y = kc_data.price

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)