In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

# Set the env variables
load_dotenv()

True

## Exploring data

In [8]:
mock = os.environ['APP_DEMOGRAPHICS_PATH']
mock

'/app/data/zipcode_demographics.csv'

In [2]:
SALES_PATH = os.getenv("SALES_PATH")                      #"../data/kc_house_data.csv"  # path to CSV with home sale data
DEMOGRAPHICS_PATH = os.getenv("DEMOGRAPHICS_PATH")           #"../data/zipcode_demographics.csv"  # path to CSV with demographics
# List of columns (subset) that will be taken from home sale data
SALES_COLUMN_SELECTION = os.getenv("SALES_COLUMN_SELECTION") #['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors','sqft_above', 'sqft_basement', 'zipcode'] 
OUTPUT_DIR = os.getenv("OUTPUT_DIR")                         #"model"  # Directory where output artifacts will be saved

In [7]:
SALES_PATH

'../data/kc_house_data.csv'

In [40]:
data = pd.read_csv(SALES_PATH,
                           #usecols=eval(SALES_COLUMN_SELECTION),
                           dtype={'zipcode': str})
demographics = pd.read_csv(DEMOGRAPHICS_PATH,
                                   dtype={'zipcode': str})

In [5]:
data.head(3)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,zipcode
0,221900.0,3,1.0,1180,5650,1.0,1180,0,98178
1,538000.0,3,2.25,2570,7242,2.0,2170,400,98125
2,180000.0,2,1.0,770,10000,1.0,770,0,98028


In [6]:
data.dtypes

price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
sqft_above         int64
sqft_basement      int64
zipcode           object
dtype: object

In [6]:
demographics.columns

Index(['ppltn_qty', 'urbn_ppltn_qty', 'sbrbn_ppltn_qty', 'farm_ppltn_qty',
       'non_farm_qty', 'medn_hshld_incm_amt', 'medn_incm_per_prsn_amt',
       'hous_val_amt', 'edctn_less_than_9_qty', 'edctn_9_12_qty',
       'edctn_high_schl_qty', 'edctn_some_clg_qty', 'edctn_assoc_dgre_qty',
       'edctn_bchlr_dgre_qty', 'edctn_prfsnl_qty', 'per_urbn', 'per_sbrbn',
       'per_farm', 'per_non_farm', 'per_less_than_9', 'per_9_to_12', 'per_hsd',
       'per_some_clg', 'per_assoc', 'per_bchlr', 'per_prfsnl', 'zipcode'],
      dtype='object')

## Test script

In [42]:
data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [43]:
unseen_data.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [39]:
demographics.columns

Index(['ppltn_qty', 'urbn_ppltn_qty', 'sbrbn_ppltn_qty', 'farm_ppltn_qty',
       'non_farm_qty', 'medn_hshld_incm_amt', 'medn_incm_per_prsn_amt',
       'hous_val_amt', 'edctn_less_than_9_qty', 'edctn_9_12_qty',
       'edctn_high_schl_qty', 'edctn_some_clg_qty', 'edctn_assoc_dgre_qty',
       'edctn_bchlr_dgre_qty', 'edctn_prfsnl_qty', 'per_urbn', 'per_sbrbn',
       'per_farm', 'per_non_farm', 'per_less_than_9', 'per_9_to_12', 'per_hsd',
       'per_some_clg', 'per_assoc', 'per_bchlr', 'per_prfsnl', 'zipcode'],
      dtype='object')

In [19]:
import json
import requests


def test_hard_predict(url: str=os.getenv("URL_HARD_PREDICT")):
 
    # Header
    headers = {"content-type": "application/json", "Accept-Charset": "UTF-8"}

    # Load unseen data
    features = eval(os.getenv("SALES_COLUMN_SELECTION"))
    features.remove('price')
    unseen_data_path = os.getenv("UNSEEN_DATA")
    unseen_data = pd.read_csv(unseen_data_path)
    unseen_data = unseen_data[features]

    # Json
    data_string = unseen_data.to_json(orient='columns')
    data_load = json.loads(data_string)

    # Request
    r = requests.post(url, data=data_load, headers=headers)
    return print(r, r.text)

def test_soft_predict(url: str=os.getenv("URL_SOFT_PREDICT")):
 
    # Header
    headers = {"content-type": "application/json", "Accept-Charset": "UTF-8"}

    # Load unseen data
    unseen_data_path = os.getenv("UNSEEN_DATA")
    unseen_data = pd.read_csv(unseen_data_path)

    # Json
    data_string = unseen_data.to_json(orient='columns')
    data_load = json.loads(data_string)

    # Request
    r = requests.post(url, data=data_load, headers=headers)
    return print(r, r.text)

In [14]:
features = eval(os.getenv("SALES_COLUMN_SELECTION"))
#columns = features.remove('price')

In [129]:
# Load unseen data
features = eval(os.getenv("SALES_COLUMN_SELECTION"))
features.remove('price')
unseen_data_path = os.getenv("UNSEEN_DATA")
unseen_data = pd.read_csv(unseen_data_path)
unseen_data = unseen_data[features]

n=3

data_string = unseen_data[0:n].to_json(orient='records',force_ascii=True)
data_load = json.loads(data_string)
print(data_load[0])



# Json
#data_string = unseen_data.to_json(orient='columns')
data_string = unseen_data[:3].to_json(orient='records',force_ascii=True)
#data_string = data_string.replace("[","{")
#data_string = data_string.replace("]","}")

data_load = json.loads(data_string)


{'bedrooms': 4, 'bathrooms': 1.0, 'sqft_living': 1680, 'sqft_lot': 5043, 'floors': 1.5, 'sqft_above': 1680, 'sqft_basement': 0, 'zipcode': 98118}


In [130]:
data_load

[{'bedrooms': 4,
  'bathrooms': 1.0,
  'sqft_living': 1680,
  'sqft_lot': 5043,
  'floors': 1.5,
  'sqft_above': 1680,
  'sqft_basement': 0,
  'zipcode': 98118},
 {'bedrooms': 3,
  'bathrooms': 2.5,
  'sqft_living': 2220,
  'sqft_lot': 6380,
  'floors': 1.5,
  'sqft_above': 1660,
  'sqft_basement': 560,
  'zipcode': 98115},
 {'bedrooms': 3,
  'bathrooms': 2.25,
  'sqft_living': 1630,
  'sqft_lot': 10962,
  'floors': 1.0,
  'sqft_above': 1100,
  'sqft_basement': 530,
  'zipcode': 98030}]

In [123]:
pd.DataFrame(data_load)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,zipcode
0,4,1.0,1680,5043,1.5,1680,0,98118
1,3,2.5,2220,6380,1.5,1660,560,98115
2,3,2.25,1630,10962,1.0,1100,530,98030


## Testing inference

In [125]:
import pickle
import logging
from pydantic import BaseModel, Field
from fastapi import HTTPException
from typing import Dict, List, Union

model_file = open(os.getenv("MODEL_PATH"), 'rb')
MODEL = pickle.load(model_file)


data_string = unseen_data[:3].to_json(orient='split',force_ascii=True)
data_load = json.loads(data_string)


class PayloadValidation(BaseModel):
    bedrooms: dict = Field(default={})
    bathrooms: dict = Field(default={})
    sqft_living: dict = Field(default={})
    sqft_lot: dict = Field(default={})
    floors: dict = Field(default={})
    sqft_above: dict = Field(default={})
    sqft_basement: dict = Field(default={})

class InputPayload(BaseModel):
    columns: List[str]
    index: List[int]
    data: List[List[Union[int, float]]]

class HouseFeatures(BaseModel):
    bedrooms: int
    bathrooms: float
    sqft_living: int
    sqft_lot: int
    floors: float
    sqft_above: int
    sqft_basement: int
    zipcode: int

class HouseBatchInput(BaseModel):
    houses: List[HouseFeatures]


In [159]:
batch = {"houses":data_load}
data = HouseBatchInput(**batch)
data
#pd.DataFrame(data)

HouseBatchInput(houses=[HouseFeatures(bedrooms=4, bathrooms=1.0, sqft_living=1680, sqft_lot=5043, floors=1.5, sqft_above=1680, sqft_basement=0, zipcode=98118), HouseFeatures(bedrooms=3, bathrooms=2.5, sqft_living=2220, sqft_lot=6380, floors=1.5, sqft_above=1660, sqft_basement=560, zipcode=98115), HouseFeatures(bedrooms=3, bathrooms=2.25, sqft_living=1630, sqft_lot=10962, floors=1.0, sqft_above=1100, sqft_basement=530, zipcode=98030)])

In [189]:
enumerate(data.houses)

<enumerate at 0x7c3cec33a880>

In [187]:
message_data=pd.DataFrame()
for i,_ in enumerate(data.houses):
    temp_data = pd.DataFrame([data.houses[i].model_dump()])
    message_data = pd.concat([message_data,temp_data], ignore_index=True)
#list(data.houses[0].model_dump().values())


In [198]:
message_data.iloc[0]['zipcode']

98118.0

In [202]:
unseen_data_path = os.getenv("UNSEEN_DATA")
unseen_data = pd.read_csv(unseen_data_path)
data_string = unseen_data[:1].to_json(orient='records', force_ascii=True)
data_string

'[{"bedrooms":4,"bathrooms":1.0,"sqft_living":1680,"sqft_lot":5043,"floors":1.5,"waterfront":0,"view":0,"condition":4,"grade":6,"sqft_above":1680,"sqft_basement":0,"yr_built":1911,"yr_renovated":0,"zipcode":98118,"lat":47.5354,"long":-122.273,"sqft_living15":1560,"sqft_lot15":5765}]'

## Create Model

In [None]:
import os
import numpy as np
import json
import pathlib
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List
from typing import Tuple
import pandas
import pandas as pd
from sklearn import model_selection
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
from sklearn.metrics import (mean_absolute_error, 
                             mean_absolute_percentage_error, 
                             mean_squared_error, 
                             r2_score)

SALES_PATH = "../data/kc_house_data.csv"  # path to CSV with home sale data
DEMOGRAPHICS_PATH = "../data/zipcode_demographics.csv" # path to CSV with demographics
# List of columns (subset) that will be taken from home sale data
SALES_COLUMN_SELECTION = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'sqft_above', 'sqft_basement', 'zipcode'
]
OUTPUT_DIR = "model"  # Directory where output artifacts will be saved


def load_data(
    sales_path: str, 
    demographics_path: str, 
    sales_column_selection: List[str], 
) -> Tuple[pandas.DataFrame, pandas.Series]:
    """Load the target and feature data by merging sales and demographics.

    Args:
        sales_path: path to CSV file with home sale data
        demographics_path: path to CSV file with home sale data
        sales_column_selection: list of columns from sales data to be used as
            features

    Returns:
        Tuple containg with two elements: a DataFrame and a Series of the same
        length.  The DataFrame contains features for machine learning, the
        series contains the target variable (home sale price).

    """
    data = pandas.read_csv(sales_path,
                           usecols=sales_column_selection,
                           dtype={'zipcode': str})
    demographics = pandas.read_csv(demographics_path,
                                   dtype={'zipcode': str})

    merged_data = data.merge(demographics, how="left",
                             on="zipcode").drop(columns="zipcode")
    # Remove the target variable from the dataframe, features will remain
    y = merged_data.pop('price')
    x = merged_data

    return x, y

def model_evaluation(model, 
                     X_test: pd.DataFrame, 
                     y_test: pd.Series) -> Tuple[dict, pd.Series]:

    """
    Generate the regression metrics for a given model.

    Args:


    Return:
    
    """

    # Predictions
    pred = model.predict(X_test)
    print(np.isnan(pred))

    # Mean Absolute error
    mae = mean_absolute_error(y_true=y_test, y_pred=pred)

    # Mean Absolute Percentage Error
    mape = mean_absolute_percentage_error(y_true=y_test, y_pred=pred)

    # Mean Squared Error
    mse = mean_squared_error(y_true=y_test, y_pred=pred)

    # Root Mean Squared Error
    rmse=np.sqrt(mse)

    # R-squared
    r_squared = r2_score(y_true=y_test, y_pred=pred)

    # metrics dict
    metrics = {
        "MAE": mae,
        "MAPE": mape,
        "MSE": mse,
        "RMSE": rmse,
        "R-Squared": r_squared
    }

    return metrics, pred

def plot_regression_results(
                            X:pd.DataFrame, 
                            y:pd.Series, 
                            pred:pd.Series,
                            output_dir=OUTPUT_DIR, 
                            prefix="regression"
                            ):
    """
    Generate a scatter plot with a regression line and residuals plot.

    Args:
    
    - X: array-like or input Dataframe with one or more features.
    - y: array-like or pandas series with the ground truth numbers.
    - pred: array-like or pandas series with the predicted values of the model.
    - output_dir: output directory.
    - prefix: prefix to the name of the files.

    Return:
    Image files on the specified output_dir.
    """

    # Residuals    
    y_pred = pred
    residuals = y - y_pred
    
    # Regression Line
    #df = pd.concat([y,pd.Series(y_pred)], ignore_index=True, axis=1)
    #df.columns = ["price","predicted"]

    
    g = sns.jointplot(x=y_pred, y=y.to_numpy(), kind="reg", height=6)



    #plt.figure(figsize=(8, 6))
    #plt.scatter(y, y_pred, label="Regression", color='blue')
    #plt.xlabel("Ground Truth")
    #plt.ylabel("Predicted")
    #plt.title("Regression")
    #plt.legend()
    #plt.tight_layout()
    reg_path = os.path.join(output_dir, f"{prefix}_scatterplot.png")
    g.figure.savefig(reg_path)
    #plt.close()

    # Residuals plot
    #plt.figure(figsize=(8, 6))
    #plt.scatter(y_pred, residuals, color='purple')
    #plt.axhline(y=0, color='black', linestyle='--')
    #plt.xlabel("Predictions")
    #plt.ylabel("Residuals")
    #plt.title("Residuals plot")
    #plt.tight_layout()
    #resid_path = os.path.join(output_dir, f"{prefix}_residuals.png")
    #plt.savefig(resid_path)
    #plt.close()

    #print(f"Plots delivered in:\n - {reg_path}\n - {resid_path}")


def main():
    """Load data, train model, and export artifacts."""
    x, y = load_data(SALES_PATH, DEMOGRAPHICS_PATH, SALES_COLUMN_SELECTION)
    x_train, _x_test, y_train, _y_test = model_selection.train_test_split(
        x, y, random_state=42)

    model = pipeline.make_pipeline(preprocessing.RobustScaler(),
                                   neighbors.KNeighborsRegressor()).fit(
                                       x_train, y_train)
    
    # Model Evaluation
    metrics, pred = model_evaluation(model=model, 
                               X_test=_x_test, 
                               y_test=_y_test)
    
    plot_regression_results(X=_x_test, y=_y_test, pred=pred)

    output_dir = pathlib.Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)

    # Saving the metrics
    json.dump(metrics,
              open(output_dir / "metrics.json", 'w'))

    # Output model artifacts: pickled model and JSON list of features
    pickle.dump(model, open(output_dir / "model.pkl", 'wb'))
    json.dump(list(x_train.columns),
              open(output_dir / "model_features.json", 'w'))


if __name__ == "__main__":
    main()


[False False False ... False False False]


AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'