In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

# Set the env variables
load_dotenv()

True

## Exploring data

In [8]:
mock = os.environ['APP_DEMOGRAPHICS_PATH']
mock

'/app/data/zipcode_demographics.csv'

In [2]:
SALES_PATH = os.getenv("SALES_PATH")                      #"../data/kc_house_data.csv"  # path to CSV with home sale data
DEMOGRAPHICS_PATH = os.getenv("DEMOGRAPHICS_PATH")           #"../data/zipcode_demographics.csv"  # path to CSV with demographics
# List of columns (subset) that will be taken from home sale data
SALES_COLUMN_SELECTION = os.getenv("SALES_COLUMN_SELECTION") #['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors','sqft_above', 'sqft_basement', 'zipcode'] 
OUTPUT_DIR = os.getenv("OUTPUT_DIR")                         #"model"  # Directory where output artifacts will be saved

In [7]:
SALES_PATH

'../data/kc_house_data.csv'

In [40]:
data = pd.read_csv(SALES_PATH,
                           #usecols=eval(SALES_COLUMN_SELECTION),
                           dtype={'zipcode': str})
demographics = pd.read_csv(DEMOGRAPHICS_PATH,
                                   dtype={'zipcode': str})

In [5]:
data.head(3)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,zipcode
0,221900.0,3,1.0,1180,5650,1.0,1180,0,98178
1,538000.0,3,2.25,2570,7242,2.0,2170,400,98125
2,180000.0,2,1.0,770,10000,1.0,770,0,98028


In [6]:
data.dtypes

price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
sqft_above         int64
sqft_basement      int64
zipcode           object
dtype: object

In [6]:
demographics.columns

Index(['ppltn_qty', 'urbn_ppltn_qty', 'sbrbn_ppltn_qty', 'farm_ppltn_qty',
       'non_farm_qty', 'medn_hshld_incm_amt', 'medn_incm_per_prsn_amt',
       'hous_val_amt', 'edctn_less_than_9_qty', 'edctn_9_12_qty',
       'edctn_high_schl_qty', 'edctn_some_clg_qty', 'edctn_assoc_dgre_qty',
       'edctn_bchlr_dgre_qty', 'edctn_prfsnl_qty', 'per_urbn', 'per_sbrbn',
       'per_farm', 'per_non_farm', 'per_less_than_9', 'per_9_to_12', 'per_hsd',
       'per_some_clg', 'per_assoc', 'per_bchlr', 'per_prfsnl', 'zipcode'],
      dtype='object')

## Test script

In [42]:
data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [43]:
unseen_data.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [39]:
demographics.columns

Index(['ppltn_qty', 'urbn_ppltn_qty', 'sbrbn_ppltn_qty', 'farm_ppltn_qty',
       'non_farm_qty', 'medn_hshld_incm_amt', 'medn_incm_per_prsn_amt',
       'hous_val_amt', 'edctn_less_than_9_qty', 'edctn_9_12_qty',
       'edctn_high_schl_qty', 'edctn_some_clg_qty', 'edctn_assoc_dgre_qty',
       'edctn_bchlr_dgre_qty', 'edctn_prfsnl_qty', 'per_urbn', 'per_sbrbn',
       'per_farm', 'per_non_farm', 'per_less_than_9', 'per_9_to_12', 'per_hsd',
       'per_some_clg', 'per_assoc', 'per_bchlr', 'per_prfsnl', 'zipcode'],
      dtype='object')

In [None]:
import json
import requests


def test_hard_predict(url: str=os.getenv("URL_HARD_PREDICT")):
 
    # Header
    headers = {"content-type": "application/json", "Accept-Charset": "UTF-8"}

    # Load unseen data
    features = eval(os.getenv("SALES_COLUMN_SELECTION"))
    features.remove('price')
    unseen_data_path = os.getenv("UNSEEN_DATA")
    unseen_data = pd.read_csv(unseen_data_path)
    unseen_data = unseen_data[features]

    # Json
    data_string = unseen_data.to_json(orient='columns')
    data_load = json.loads(data_string)

    # Request
    r = requests.post(url, data=data_load, headers=headers)
    return print(r, r.text)

def test_soft_predict(url: str=os.getenv("URL_SOFT_PREDICT")):
 
    # Header
    headers = {"content-type": "application/json", "Accept-Charset": "UTF-8"}

    # Load unseen data
    unseen_data_path = os.getenv("UNSEEN_DATA")
    unseen_data = pd.read_csv(unseen_data_path)

    # Json
    data_string = unseen_data.to_json(orient='columns')
    data_load = json.loads(data_string)

    # Request
    r = requests.post(url, data=data_load, headers=headers)
    return print(r, r.text)

In [14]:
features = eval(os.getenv("SALES_COLUMN_SELECTION"))
#columns = features.remove('price')

In [17]:
features


['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'sqft_above',
 'sqft_basement',
 'zipcode']

In [3]:
# Load unseen data
features = eval(os.getenv("SALES_COLUMN_SELECTION"))
features = features.remove('price')
unseen_data_path = os.getenv("UNSEEN_DATA")
unseen_data = pd.read_csv(unseen_data_path)
unseen_data = unseen_data[features]

# Json
data_string = unseen_data.to_json(orient='columns')
data_load = json.loads(data_string)
data_string

KeyError: None

In [4]:
features

## Testing inference

In [13]:
import pickle
import logging
from pydantic import BaseModel, Field
from fastapi import HTTPException

model_file = open(os.getenv("MODEL_PATH"), 'rb')
MODEL = pickle.load(model_file)

class PayloadValidation(BaseModel):
    bedrooms: dict = Field(default={})
    bathrooms: dict = Field(default={})
    sqft_living: dict = Field(default={})
    sqft_lot: dict = Field(default={})
    floors: dict = Field(default={})
    sqft_above: dict = Field(default={})
    sqft_basement: dict = Field(default={})


In [14]:
def get_prediction(payload: PayloadValidation):
    
    """Handles the inference pipeline"""

    try:

        logging.info("Request received!!!")

        # Incoming data
        message_data = pd.DataFrame(payload)

        # Demographics data
        demo_data = pd.read_csv(os.getenv("DEMOGRAPHICS_PATH"))

        # Merging
        merged_data = message_data.merge(demo_data, how="left",
                                         on="zipcode").drop(columns="zipcode")

        #Model features
        features = pd.read_json(os.getenv("FEATURES"))
        features = features[0].to_list()
        
        # Make predictions on payload
        pred = MODEL.predict(merged_data[features])

        #prettifying the predictions
        response = dict()
        for i in range(len(pred)):
            response[f"real_state-{i}"] = pred[i]
        
        return response
    
    except Exception as e:
        logging.error(f"Processing failed: {str(e)}")
        raise HTTPException(status_code=500, detail="An error occured during processing") from e

In [15]:
res = get_prediction(payload=data_load)

In [16]:
res

{'real_state-0': 458520.0,
 'real_state-1': 612800.0,
 'real_state-2': 449160.0,
 'real_state-3': 679700.0,
 'real_state-4': 304256.0,
 'real_state-5': 553798.0,
 'real_state-6': 341800.0,
 'real_state-7': 445350.0,
 'real_state-8': 990500.0,
 'real_state-9': 532940.0,
 'real_state-10': 422700.0,
 'real_state-11': 484220.0,
 'real_state-12': 499400.0,
 'real_state-13': 358470.0,
 'real_state-14': 790700.0,
 'real_state-15': 236300.0,
 'real_state-16': 426950.0,
 'real_state-17': 687600.0,
 'real_state-18': 619880.0,
 'real_state-19': 438000.0,
 'real_state-20': 520800.0,
 'real_state-21': 669300.2,
 'real_state-22': 549036.0,
 'real_state-23': 411100.0,
 'real_state-24': 250190.0,
 'real_state-25': 313590.0,
 'real_state-26': 730800.0,
 'real_state-27': 285730.0,
 'real_state-28': 256990.0,
 'real_state-29': 390200.0,
 'real_state-30': 285942.4,
 'real_state-31': 865700.0,
 'real_state-32': 975500.0,
 'real_state-33': 494936.0,
 'real_state-34': 272090.0,
 'real_state-35': 297900.0,
 '

{}