## Imports

In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Constants
# Seed passed as `random_state` to train_test_split so the train/test
# partition is identical on every Restart & Run All.
SEED = 123

<br>

## Loading data

Tendo em vista que o foco principal do projeto é o desenvolvimento do deploy do modelo, será adotada somente a variável **HouseAge** para predição do valor mediano das casas.

<br>

###  California Housing dataset

**MedInc** - median income in block group

**HouseAge** - median house age in block group

**AveRooms** - average number of rooms per household

**AveBedrms** - average number of bedrooms per household

**Population** - block group population

**AveOccup** - average number of household members

**Latitude** - block group latitude

**Longitude** - block group longitude

**Target** - median house value (in units of $100,000)

In [9]:
# Load the California Housing dataset as pandas objects
# (X: feature DataFrame, y: target Series).
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

# Keep only the single predictor used in this project plus the target,
# and rename both columns to snake_case.
df = pd.concat([X, y], axis=1)[['HouseAge', 'MedHouseVal']]
df.columns = ['house_age', 'target']

# Display infos
display(df)
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
df.info()
display(df.describe().transpose())

Unnamed: 0,house_age,target
0,41.0,4.526
1,21.0,3.585
2,52.0,3.521
3,52.0,3.413
4,52.0,3.422
...,...,...
20635,25.0,0.781
20636,18.0,0.771
20637,17.0,0.923
20638,18.0,0.847


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   house_age  20640 non-null  float64
 1   target     20640 non-null  float64
dtypes: float64(2)
memory usage: 322.6 KB


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
house_age,20640.0,28.639486,12.585558,1.0,18.0,29.0,37.0,52.0
target,20640.0,2.068558,1.153956,0.14999,1.196,1.797,2.64725,5.00001


<br>

## Treinamento do Modelo

O modelo de regressão linear é encapsulado em um pipeline, junto a uma etapa de pré-processamento (`StandardScaler`).

<br>

In [10]:
# Hold out 33% of the rows for evaluation; the split is seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['house_age'],
    df['target'],
    test_size=0.33,
    random_state=SEED,
)

# Pipeline: standardise the single feature, then fit a linear regression on it.
scaler = StandardScaler()
ln = LinearRegression()
model = Pipeline(steps=[('scaler', scaler), ('estimator', ln)])

# The estimators expect a 2-D feature matrix, so the 1-D series is reshaped
# into a single column before fitting.
train_features = X_train.to_numpy().reshape(-1, 1)
train_target = y_train.to_numpy()
model.fit(train_features, train_target)

In [11]:
# Mean squared error on the held-out split.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
# ground truth goes first. MSE is symmetric, so the value is unchanged, but
# the correct order keeps the call safe if the metric is ever swapped.
y_pred = model.predict(X_test.to_numpy().reshape(-1, 1))
mean_squared_error(y_test, y_pred)

1.3175551827532885