In [42]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [43]:
from clean_rents2 import LabelCleaner, ColumnsSelector, LabelTransformer, NumericalTransformer, CategoricalTransformer
from sklearn.pipeline import Pipeline

In [44]:
df = pd.read_parquet("dataframes/italy_housing_price_rent_raw.parquet.gzip")
df = df[['prezzo', 'stanze', 'bagni', 'superficie', 'Posti Auto']]
target_cleaner = LabelCleaner(df['prezzo'])
df['prezzo'] = target_cleaner.clean_target()
df = df.dropna(subset=['prezzo'])

X = df.drop(['prezzo'], axis=1)
y = df['prezzo']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
numerical_columns = ['superficie']
label = ['prezzo']

label_transformer = Pipeline(steps=[
    ('label_selector', ColumnsSelector(label)),
    ('label_transformer', LabelTransformer())
])


In [46]:
numeric_transformer = Pipeline(steps=[
    ('num_selector', ColumnsSelector(numerical_columns)),
    ('num_transformer', NumericalTransformer()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


In [47]:
categorical_columns = ['stanze', 'bagni', 'Posti Auto']

categorical_transformer = Pipeline(steps=[
    ('cat_selector', ColumnsSelector(categorical_columns)),
    ('cat_transformer', CategoricalTransformer()),
    ('imputer', SimpleImputer(strategy='most_frequent'))
])


In [48]:
feature_pipeline = FeatureUnion( transformer_list = [
    ('numerical_pipeline', numeric_transformer ),
    ('categorical_pipeline', categorical_transformer ) ] )


In [49]:
lr = LinearRegression()
rf = RandomForestRegressor()
xgb = GradientBoostingRegressor()

models = {
    'Linear regression': lr,
    'Random forest': rf,
    'XGBoost': xgb
          }

full_pipeline = Pipeline(steps=[
    ('feature_pipeline', feature_pipeline),
    ('model', LinearRegression() )
])


In [50]:
full_pipeline.fit(X_train, y_train)
train_pred = full_pipeline.predict(X_train)

In [51]:
for name, model in models.items():

    full_pipeline = Pipeline(steps=[
        ('feature_pipeline', feature_pipeline),
        ('model', LinearRegression() )
    ])

    # model fitting
    full_pipeline.fit(X_train, y_train)

    # model scoring
    train_pred = full_pipeline.predict(X_train)

    # Evaluate model performance
    print('*' * 50)
    print(f'{name} Training score:')
    print(f'MAE: {round(mean_absolute_error(y_train, train_pred),4)} | RMSE: {round(mean_squared_error(y_train, train_pred, squared=False),4)} | R2: {round(r2_score(y_train, train_pred),4)}')

**************************************************
Linear regression Training score:
MAE: 1994.0082 | RMSE: 26382.808 | R2: 0.001
**************************************************
Random forest Training score:
MAE: 1994.0082 | RMSE: 26382.808 | R2: 0.001
**************************************************
XGBoost Training score:
MAE: 1994.0082 | RMSE: 26382.808 | R2: 0.001
