# Report

In this notebook we report progress on the project of house price prediction.

In [1]:
import os
from functools import partial

In [2]:
import joblib
import pandas as pd

In [3]:
import data
import metrics

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
def evaluate_model(*, model, metric, X_train, y_train, X_test, y_test):
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    train_error = metric(y_train, train_predictions)
    test_error = metric(y_test, test_predictions)
    return {
        "train_predictions": train_predictions,
        "test_predictions": test_predictions,
        "train_error": train_error,
        "test_error": test_error
    }

def print_report(*, model, evaluation):
    print(f"Model used:\n\t{reg}")
    print(f"Error:\n\ttrain set {evaluation['train_error']}\n\ttest error: {evaluation['test_error']}")

In [6]:
models_dir = "models"

In [7]:
dataset_path = "dataset.csv"

In [8]:
dataset = data.get_dataset(
    partial(pd.read_csv, filepath_or_buffer=dataset_path),
    splits=("train", "test")
)

KeyError: "None of [Index(['oldpeak', 'thalachh', 'chol', 'trtbps'], dtype='object')] are in the [columns]"

In [17]:
dataset["train"][0]

NameError: name 'dataset' is not defined

In [13]:
df = pd.read_csv("train_classification.csv")
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,y
0,54,1,2,150,232,0,0,165,0,1.6,2,0,3,1
1,58,0,2,120,340,0,1,172,0,0.0,2,0,2,1
2,47,1,0,112,204,0,1,143,0,0.1,2,0,2,1
3,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
4,58,1,0,100,234,0,1,156,0,0.1,2,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,50,1,2,140,233,0,1,163,0,0.6,1,1,3,0
199,51,1,2,94,227,0,1,154,1,0.0,2,1,3,1
200,69,1,3,160,234,1,0,131,0,0.1,1,1,2,1
201,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0


In [14]:
for col in df:
    print(col)
    print(df[col].unique().dtype)

age
int64
sex
int64
cp
int64
trtbps
int64
chol
int64
fbs
int64
restecg
int64
thalachh
int64
exng
int64
oldpeak
float64
slp
int64
caa
int64
thall
int64
y
int64


In [19]:
df["y"]

NameError: name 'df' is not defined

In [15]:
dataset_clasificacion = data.get_dataset(
    partial(pd.read_csv, filepath_or_buffer="train_classification.csv"),
    splits=("train", "test")
)

In [16]:
dataset_clasificacion["train"][0]

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
160,52,1,0,128.0,255.0,0,1,161.0,1,0.0,2,1,3
108,59,1,0,140.0,177.0,0,1,162.0,1,0.0,2,1,3
202,63,0,1,140.0,195.0,0,1,179.0,0,0.0,2,2,2
167,54,0,2,110.0,214.0,0,1,158.0,0,1.6,1,0,2
103,51,1,2,110.0,175.0,0,1,123.0,0,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,60,0,0,150.0,258.0,0,0,157.0,0,2.6,1,2,3
137,44,1,0,120.0,169.0,0,1,144.0,1,2.8,0,0,1
72,62,1,1,120.0,281.0,0,0,103.0,0,1.4,1,1,3
140,58,0,3,150.0,283.0,1,0,162.0,0,1.0,2,0,2


**If you need to visualize anything from your training data, do it here**

## Baseline

Before doing any complex Machine Learning model, let's try to solve the problem by having an initial educated guess. 

In [23]:
model_path = os.path.join("models", "2021-06-10 22-24", "model.joblib")
reg = joblib.load(model_path)
evaluation = evaluate_model(
    model=reg,
    metric=metrics.custom_error,
    X_train=dataset_clasificacion["train"][0],
    y_train=dataset_clasificacion["train"][1],
    X_test=dataset_clasificacion["test"][0],
    y_test=dataset_clasificacion["test"][1]
)
print_report(model=reg, evaluation=evaluation)

ZeroDivisionError: Weights sum to zero, can't be normalized

In [33]:
dataset_clasificacion["train"][1]

160    0
108    0
202    1
167    1
103    1
      ..
133    0
137    0
72     0
140    1
37     1
Name: y, Length: 142, dtype: int64

## Linear Regression Model 

We want to try easy things first, so know lets see how a linear regression model does.

In [31]:
model_path = os.path.join("models", "2021-06-10", "model.joblib")
reg = joblib.load(model_path)
evaluation = evaluate_model(
    model=reg,
    metric=metrics.custom_error,
    X_train=dataset["train"][0],
    y_train=dataset["train"][1],
    X_test=dataset["test"][0],
    y_test=dataset["test"][1]
)
print_report(model=reg, evaluation=evaluation)

ValueError: could not convert string to float: 'RL'

**Error Analysis**

What can you learn about the errors your model is making? Try this:

* Discretize the errors your model is making by some categorical variables.
* Sort and discretize the errors your model is making and see what the features have in common in those cases. 

## Linear regression with Feature Engineering

Probably the previous model is not good enough, let's see how is the performance of a model using some produced features.

Techniques:
1. Feature Cross
2. Discretizer
3. Add average per neighborhood.


In [35]:
model_path = os.path.join("models", "2021-06-05 01-04", "model.joblib")
reg = joblib.load(model_path)
report = evaluate_model(
    model=reg,
    metric=metrics.custom_error,
    X_train=dataset["train"][0],
    y_train=dataset["train"][1],
    X_test=dataset["test"][0],
    y_test=dataset["test"][1]
)
print_report(model=reg, evaluation=evaluation)

Model used:
	Pipeline(steps=[('age-extractor', AgeExtractor()),
                ('averager', AveragePricePerNeighborhoodExtractor()),
                ('categorical-encoder',
                 CategoricalEncoder(force_dense_array=True, one_hot=True)),
                ('standard-scaler', StandardScaler()),
                ('linear-regressor', LinearRegression())])
Error:
	train set 11261.923628487202
	test error: 25474507852808.133


**Error Analysis**

What can you learn about the errors your model is making? Try this:

* Discretize the errors your model is making by some categorical variables.
* Sort or discretize the errors your model is making and see what the features have in common in those cases. 

## Regularized Linear Regression

Let's assume you are overfitting. Load the results of a linear regression model with regularized loss

In [None]:
model_path = os.path.join("models", "", "model.joblib")
reg = joblib.load(model_path)
report = evaluate_model(
    model=reg,
    metric=metrics.custom_error,
    X_train=dataset["train"][0],
    y_train=dataset["train"][1],
    X_test=dataset["test"][0],
    y_test=dataset["test"][1]
)
print_report(model=reg, evaluation=evaluation)

**Error Analysis**

What can you learn about the errors your model is making? Try this:

* Discretize the errors your model is making by some categorical variables.
* Sort or discretize the errors your model is making and see what the features have in common in those cases. 

## Decision Tree

Decision trees ofer great complexity, they can fit even a noisy dataset almost perfectly. Let's see how it behaves on the task at hand. 

**Overfiting case**
Let's see the results for a model that has greatly overfit the data, this wouldn't be an ideal model, but at least it could tell that our model is powerful enough for the task at hand

In [15]:
model_path = os.path.join("models", "2021-06-05 13-59", "model.joblib")
reg = joblib.load(model_path)
evaluation = evaluate_model(
    model=reg,
    metric=metrics.custom_error,
    X_train=dataset["train"][0],
    y_train=dataset["train"][1],
    X_test=dataset["test"][0],
    y_test=dataset["test"][1]
)
print_report(model=reg, evaluation=evaluation)

Model used:
	Pipeline(steps=[('age-extractor', AgeExtractor()),
                ('categorical-encoder',
                 CategoricalEncoder(force_dense_array=True,
                                    pass_through_columns=['HouseAge',
                                                          'RemodAddAge',
                                                          'GarageAge'])),
                ('decision-treee-regresor',
                 DecisionTreeRegressor(max_depth=3000))])
Error:
	train set 0.0
	test error: 28588.116587677723


**Error Analysis**

What can you learn about the errors your model is making? Try this:

* Discretize the errors your model is making by some categorical variables.
* Sort or discretize the errors your model is making and see what the features have in common in those cases. 

**Using best hyper params** Now let's see thow much a simple decision tree can give us

In [18]:
model_path = os.path.join("models", "2021-06-05 14-15", "model.joblib")
reg = joblib.load(model_path)
report = evaluate_model(
    model=reg,
    metric=metrics.custom_error,
    X_train=dataset["train"][0],
    y_train=dataset["train"][1],
    X_test=dataset["test"][0],
    y_test=dataset["test"][1]
)
print_report(model=reg, evaluation=evaluation)

Model used:
	Pipeline(steps=[('age-extractor', AgeExtractor()),
                ('categorical-encoder',
                 CategoricalEncoder(force_dense_array=True,
                                    pass_through_columns=['HouseAge',
                                                          'RemodAddAge',
                                                          'GarageAge'])),
                ('decision-treee-regresor',
                 DecisionTreeRegressor(max_depth=1000, max_features=50,
                                       min_samples_leaf=2,
                                       min_samples_split=20))])
Error:
	train set 18123.789642031083
	test error: 30951.389810550965


## Random Forest

Now it is time to use a model that can properly help us to regularize the previous one.

In [16]:
model_path = os.path.join("models", "2021-06-05 14-19", "model.joblib")
reg = joblib.load(model_path)
evaluation = evaluate_model(
    model=reg,
    metric=metrics.custom_error,
    X_train=dataset["train"][0],
    y_train=dataset["train"][1],
    X_test=dataset["test"][0],
    y_test=dataset["test"][1]
)
print_report(model=reg, evaluation=evaluation)

Model used:
	Pipeline(steps=[('age-extractor', AgeExtractor()),
                ('categorical-encoder',
                 CategoricalEncoder(force_dense_array=True,
                                    pass_through_columns=['HouseAge',
                                                          'RemodAddAge',
                                                          'GarageAge'])),
                ('decision-treee-regresor',
                 DecisionTreeRegressor(max_depth=100, max_features=15,
                                       min_samples_leaf=5,
                                       min_samples_split=20))])
Error:
	train set 18123.789642031083
	test error: 30951.389810550965


**Error Analysis**

What can you learn about the errors your model is making? Try this:

* Discretize the errors your model is making by some categorical variables.
* Sort or discretize the errors your model is making and see what the features have in common in those cases. 