# cml1 - Immobilienrechner
# 2.2 Bestmögliches Regressionsmodell - kaggle-Contest

Entwickle mit beliebigen Algorithmen das bestmögliche Modell im Sinne des Mean absolute percentage error (MAPE). Vergleiche dabei mindestens drei algorithmische Ansätze, wobei ein multiples lineares Modell Teil davon sein soll als Benchmark. Untersuche die ‘Variable Importance’ für dein bestes Modell.

Abgabe

Notebook und daraus erstellter Bericht (ohne Code) als pdf, welche die Entwicklung deines besten Modells, sowie der zwei weiteren Modelle dokumentiert, inklusive verwendeter Features, Preprocessing, Model Selection Prozess und Untersuchung der ‘Variable Importance’.

Eingabe der Vorhersage des Preises für den Testdatensatz mit deinem bestmöglichen Modell auf kaggle.

## Module importieren

In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# linear models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# tree models
from sklearn.tree import DecisionTreeRegressor

# ensemble models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor

# xgb model
import xgboost as xgb
from xgboost import XGBRegressor

# imputing
from sklearn.impute import SimpleImputer, KNNImputer

# preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder

# pipeline
from sklearn.pipeline import Pipeline

# model selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, RandomizedSearchCV

# metrics
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# save model
import pickle

## Daten laden

Damit die Predictions die korrekten IDs haben, müssen die rohe Kaggle-Daten geladen werden und die IDs extrahiert werden. Diese werden wir später wieder benötigen.

In [190]:
df_raw_kaggle = pd.read_csv('../../../data/test_data-Kaggle-v0.11.csv', low_memory=False)

# get indexes for predictions
indexes = df_raw_kaggle["Unnamed: 0"]

In [191]:
df = pd.read_csv('../../../data/immo_data_clean.csv', low_memory=False)
df.head()

Unnamed: 0,price_cleaned,kanton,Zip,Year built:,type_unified,Availability_Categorized,Living_area_unified,Floor_unified,Floor_space_merged,Plot_area_unified,...,PopulationDensityS,RiversAndLakesL,RiversAndLakesM,RiversAndLakesS,WorkplaceDensityL,WorkplaceDensityM,WorkplaceDensityS,ForestDensityL,ForestDensityM,ForestDensityS
0,1150000.0,AG,50,,penthouse,On request,100.0,4.0,,,...,0.366674,0.08217,0.001811,0.011871,0.030169,0.05212,0.098951,0.511176,0.286451,0.090908
1,1420000.0,AG,50,,terrace-house,On request,156.0,,242.0,222.0,...,0.366674,0.08217,0.001811,0.011871,0.030169,0.05212,0.098951,0.511176,0.286451,0.090908
2,720000.0,AG,50,,penthouse,Immediately,93.0,2.0,,,...,0.635194,0.154274,0.188229,0.0,0.172646,0.16385,0.16583,0.163362,0.095877,0.001911
3,1430000.0,AG,50,,detached-house,On request,154.0,,257.0,370.0,...,0.366674,0.08217,0.001811,0.011871,0.030169,0.05212,0.098951,0.511176,0.286451,0.090908
4,995000.0,AG,50,,flat,On request,142.0,0.0,,,...,0.204549,0.109586,0.141473,0.091805,0.04695,0.038008,0.055509,0.333865,0.279276,0.145835


In [192]:
df_kaggle = pd.read_csv('../../../data/test_data_kaggle_clean.csv', low_memory=False)
df_kaggle.head()

Unnamed: 0,kanton,Zip,Year built:,type_unified,Availability_Categorized,Living_area_unified,Floor_unified,Floor_space_merged,Plot_area_unified,Rooms_new,...,PopulationDensityS,RiversAndLakesL,RiversAndLakesM,RiversAndLakesS,WorkplaceDensityL,WorkplaceDensityM,WorkplaceDensityS,ForestDensityL,ForestDensityM,ForestDensityS
0,AG,50,,villa,On request,220.0,,,733.0,6.5,...,0.205251,0.020765,0.034714,0.051031,0.09787,0.103867,0.038822,0.164382,0.10003,0.063548
1,AG,50,,detached-house,On request,230.0,,,702.0,7.5,...,0.194898,0.017035,0.033235,0.053474,0.064024,0.021157,0.03431,0.260855,0.170434,0.083253
2,AG,50,,stepped-house,On request,131.0,,,,4.5,...,0.287587,0.127272,0.06703,0.0,0.032232,0.076203,0.153552,0.434114,0.357984,0.125505
3,AG,50,,terrace-house,Immediately,140.0,,140.0,206.0,6.5,...,0.422582,0.024733,0.030412,0.0,0.111079,0.207175,0.167494,0.14819,0.07661,0.0
4,AG,50,,terrace-house,On request,156.0,,242.0,222.0,4.5,...,0.366674,0.08217,0.001811,0.011871,0.030169,0.05212,0.098951,0.511176,0.286451,0.090908


In [193]:
# remove cols Year built:, Availability_Categorized
#df = df.drop(columns=["Year built:", "Availability_Categorized"])
#df_kaggle = df_kaggle.drop(columns=["Year built:", "Availability_Categorized"])

## Daten vorbereiten

In [194]:
def preprocess_dataframe(df):
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    df_processed = pd.get_dummies(df, columns=categorical_columns, drop_first=True, dtype=int)

    return df_processed, numerical_columns


def remove_outliers(df, numerical_columns):
    df_processed = df.copy()

    quantiles_1 = df_processed[numerical_columns].quantile(0.002)
    quantiles_99 = df_processed[numerical_columns].quantile(0.998)

    for column in numerical_columns:
        condition = (df_processed[column] < quantiles_1[column]) | (df_processed[column] > quantiles_99[column])
        df_processed.loc[condition, column] = None

    return df_processed

df, num_cols_df = preprocess_dataframe(df)
df_kaggle, num_cols_df_kaggle = preprocess_dataframe(df_kaggle)

df = remove_outliers(df, num_cols_df)

df = df[df['price_cleaned'] > 10000]

df = df.dropna(subset=['price_cleaned'], axis=0)

print("Dataset after removing rows with missing target values:")
print(df.shape)

print("Kaggle Test Dataset")
print(df_kaggle.shape)

Dataset after removing rows with missing target values:
(21381, 99)
Kaggle Test Dataset
(24556, 99)


In [195]:
columns_df1 = set(df.columns)
columns_df2 = set(df_kaggle.columns)

# Compare the sets of columns
if columns_df1 == columns_df2:
    print("The dataframes have the same column names.")
else:
    print("The dataframes do not have the same column names.")

    # Find out which columns are different
    diff_df1 = columns_df1 - columns_df2
    diff_df2 = columns_df2 - columns_df1

    if diff_df1:
        print("Columns in df1 not in df2:", diff_df1)
    if diff_df2:
        print("Columns in df2 not in df1 (dropped):", diff_df2)
        df_kaggle.drop(diff_df2, axis=1, inplace=True)

The dataframes do not have the same column names.
Columns in df1 not in df2: {'price_cleaned'}
Columns in df2 not in df1 (dropped): {'type_unified_furnished-residential-property'}


## Train Test Split

In [196]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

X_train = train.drop("price_cleaned", axis=1)
y_train = train["price_cleaned"]

X_test = test.drop("price_cleaned", axis=1)
y_test = test["price_cleaned"]

X_test_kaggle = df_kaggle

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("X_test_kaggle shape: ", X_test_kaggle.shape)

X_train shape:  (17104, 98)
y_train shape:  (17104,)
X_test shape:  (4277, 98)
y_test shape:  (4277,)
X_test_kaggle shape:  (24556, 98)


## Impuation

In [197]:
cols = X_train.columns

X_train_with_na = X_train.copy()
X_test_with_na = X_test.copy()

In [198]:
imputer = KNNImputer(n_neighbors=5)
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=cols)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=cols)
X_test_kaggle_imputed = pd.DataFrame(imputer.fit_transform(df_kaggle), columns=cols)

# export imputed train and test data to csv so we dont always have to run the imputation
X_train_imputed.to_csv('../../../data/X_train_imputed.csv')
X_test_imputed.to_csv('../../../data/X_test_imputed.csv')
X_test_kaggle_imputed.to_csv('../../../data/X_test_kaggle_imputed.csv')

y_train.to_csv('../../../data/y_train.csv')
y_test.to_csv('../../../data/y_test.csv')

In [201]:
X_train_imputed = pd.read_csv('../../../data/X_train_imputed.csv', low_memory=False)
X_test_imputed = pd.read_csv('../../../data/X_test_imputed.csv', low_memory=False)
X_test_kaggle_imputed = pd.read_csv('../../../data/X_test_kaggle_imputed.csv', low_memory=False)
y_train = pd.read_csv('../../../data/y_train.csv', low_memory=False)
y_test = pd.read_csv('../../../data/y_test.csv', low_memory=False)

# ignore first column of X_train
X_train_imputed = X_train_imputed.iloc[:, 1:]

# ignore first column of X_test
X_test_imputed = X_test_imputed.iloc[:, 1:]

# ignore first column of X_test_kaggle
X_test_kaggle_imputed = X_test_kaggle_imputed.iloc[:, 1:]

# ignore first column of y_train
y_train = y_train.iloc[:, 1:].values.ravel()

# ignore first column of y_test
y_test = y_test.iloc[:, 1:].values.ravel()

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("X_test_kaggle shape: ", X_test_kaggle.shape)

X_train shape:  (17104, 98)
y_train shape:  (17104,)
X_test shape:  (4277, 98)
y_test shape:  (4277,)
X_test_kaggle shape:  (24556, 98)


## Standardisierung

In [202]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols)
X_test_kaggle = pd.DataFrame(scaler.fit_transform(X_test_kaggle), columns=cols)
X_train_with_na = pd.DataFrame(scaler.fit_transform(X_train_with_na), columns=cols)
X_test_with_na = pd.DataFrame(scaler.transform(X_test_with_na), columns=cols)

## Modelle

### Modell 4: XGBoost

[Dokumentation](https://xgboost.readthedocs.io/en/latest/)

In [188]:
def xgb_model(
        X_train, 
        y_train, 
        X_test, 
        y_test, 
        cv,
        max_depth,
        learning_rate,
        booster
    ):
    
    xgb = XGBRegressor()

    param_grid = {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'booster': booster,
        'random_state': [42]
    }

    grid_xgb = GridSearchCV(xgb, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_percentage_error', verbose=1)

    grid_xgb.fit(X_train, y_train)

    best_params = grid_xgb.best_params_
    print('Best parameters: {}'.format(best_params))

    y_pred_xgb = grid_xgb.predict(X_test)

    mape_xgb = round(mean_absolute_percentage_error(y_test, y_pred_xgb), 4)
    print('MAPE: {}'.format(mape_xgb))

    r2_xgb = round(r2_score(y_test, y_pred_xgb), 4)
    print('R2: {}'.format(r2_xgb))

    return grid_xgb, mape_xgb

xgb_regression, mape_xgb = xgb_model(
    X_train,
    y_train,
    X_test,
    y_test,
    cv=5,
    max_depth = [None, 6, 9, 12, 15],
    learning_rate = [0.01, 0.1, 0.2],
    booster = ['gbtree']
)

Fitting 5 folds for each of 15 candidates, totalling 75 fits




Best parameters: {'booster': 'gbtree', 'learning_rate': 0.2, 'max_depth': 12, 'random_state': 42}
MAPE: 0.2432
R2: 0.8182


### Modell 5: XGBoost (Log-Transformiert)

In [None]:
def xgb_model(
        X_train,
        y_train,
        X_test,
        y_test,
        cv,
        max_depth,
        learning_rate,
        booster
    ):

    xgb = XGBRegressor()

    param_grid = {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'booster': booster,
        'random_state': [42]
    }

    grid_xgb = GridSearchCV(xgb, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_percentage_error', verbose=1)

    grid_xgb.fit(X_train, np.log(y_train))

    best_params = grid_xgb.best_params_
    print('Best parameters: {}'.format(best_params))

    y_pred_xgb = grid_xgb.predict(X_test)
    y_pred_xgb = np.exp(y_pred_xgb)

    mape_xgb = round(mean_absolute_percentage_error(y_test, y_pred_xgb), 4)
    print('MAPE: {}'.format(mape_xgb))

    r2_xgb = round(r2_score(y_test, y_pred_xgb), 4)
    print('R2: {}'.format(r2_xgb))

    return grid_xgb, mape_xgb

xgb_regression_log, mape_xgb_log = xgb_model(
    X_train,
    y_train,
    X_test,
    y_test,
    cv=5,
    max_depth = [None, 6, 9, 12],
    learning_rate = [0.01, 0.1, 0.3],
    booster = ['gbtree', 'dart']
)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best parameters: {'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 12, 'random_state': 42}
MAPE: 0.2415
R2: 0.7411


### Modell 6: Random Forest

[Dokumentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

In [None]:
def rf_model(
        X_train, 
        y_train, 
        X_test, 
        y_test, 
        cv
    ):

    rf = RandomForestRegressor()

    param_grid = {
        'random_state': [42]
    }

    grid_rf = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_percentage_error', verbose=1)

    grid_rf.fit(X_train, y_train)

    best_params = grid_rf.best_params_
    print('Best parameters: {}'.format(best_params))

    y_pred_rf = grid_rf.predict(X_test)

    mape_rf = round(mean_absolute_percentage_error(y_test, y_pred_rf), 4)
    print('MAPE: {}'.format(mape_rf))

    r2_rf = round(r2_score(y_test, y_pred_rf), 4)
    print('R2: {}'.format(r2_rf))

    return grid_rf, mape_rf

rf_regression, mape_rf = rf_model(
    X_train,
    y_train,
    X_test,
    y_test,
    cv=5,
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters: {'random_state': 42}
MAPE: 0.3354
R2: 0.7454


### Modell 7: Random Forest (Log-Transformiert)

In [None]:
def rf_model(
        X_train,
        y_train,
        X_test,
        y_test,
        cv,
    ):

    rf = RandomForestRegressor()

    param_grid = {
        'random_state': [42],
    }

    grid_rf = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_percentage_error', verbose=1)

    grid_rf.fit(X_train, np.log(y_train))

    best_params = grid_rf.best_params_
    print('Best parameters: {}'.format(best_params))

    y_pred_rf = grid_rf.predict(X_test)
    y_pred_rf = np.exp(y_pred_rf)

    mape_rf = round(mean_absolute_percentage_error(y_test, y_pred_rf), 4)
    print('MAPE: {}'.format(mape_rf))

    r2_rf = round(r2_score(y_test, y_pred_rf), 4)
    print('R2: {}'.format(r2_rf))

    return grid_rf, mape_rf

rf_regression_log, mape_rf_log = rf_model(
    X_train,
    y_train,
    X_test,
    y_test,
    cv=2
)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] END ....................................random_state=42; total time=  21.1s
[CV] END ....................................random_state=42; total time=  21.3s
Best parameters: {'random_state': 42}
MAPE: 0.2483
R2: 0.7404


### Modell 8: Hist Gradient Boosting Regression

[Dokumentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html)

In [None]:
def hgb_model(
        X_train, 
        y_train, 
        X_test, 
        y_test, 
        cv, 
        loss,
        learning_rate, 
        max_iter,
        max_leaf_nodes,
        max_depth, 
        l2_regularization, 
        random_state,
    ):
    
    hgb = HistGradientBoostingRegressor()

    param_grid = {
        'loss': loss,
        'learning_rate': learning_rate,
        'max_iter': max_iter,
        'max_leaf_nodes': max_leaf_nodes,
        'max_depth': max_depth,
        'l2_regularization': l2_regularization,
        'random_state': random_state
    }

    grid_hgb = GridSearchCV(hgb, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_percentage_error', verbose=1)

    grid_hgb.fit(X_train, y_train)

    best_params = grid_hgb.best_params_
    print('Best parameters: {}'.format(best_params))

    y_pred_hgb = grid_hgb.predict(X_test)

    mape_hgb = round(mean_absolute_percentage_error(y_test, y_pred_hgb), 4)
    print('MAPE: {}'.format(mape_hgb))

    r2_hgb = round(r2_score(y_test, y_pred_hgb), 4)
    print('R2: {}'.format(r2_hgb))

    return grid_hgb, mape_hgb

hgb_regression, mape_hgb = hgb_model(
    X_train,
    y_train,
    X_test,
    y_test,
    cv=5,
    loss = ['gamma'],
    learning_rate = [0.1],
    max_iter = [300],
    max_leaf_nodes = [70],
    max_depth = [None],
    l2_regularization = [0.1],
    random_state = [27]
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END l2_regularization=0.1, learning_rate=0.1, loss=gamma, max_depth=None, max_iter=300, max_leaf_nodes=70, random_state=27; total time=   1.9s
[CV] END l2_regularization=0.1, learning_rate=0.1, loss=gamma, max_depth=None, max_iter=300, max_leaf_nodes=70, random_state=27; total time=   1.5s
[CV] END l2_regularization=0.1, learning_rate=0.1, loss=gamma, max_depth=None, max_iter=300, max_leaf_nodes=70, random_state=27; total time=   2.1s
[CV] END l2_regularization=0.1, learning_rate=0.1, loss=gamma, max_depth=None, max_iter=300, max_leaf_nodes=70, random_state=27; total time=   1.9s
[CV] END l2_regularization=0.1, learning_rate=0.1, loss=gamma, max_depth=None, max_iter=300, max_leaf_nodes=70, random_state=27; total time=   2.8s
Best parameters: {'l2_regularization': 0.1, 'learning_rate': 0.1, 'loss': 'gamma', 'max_depth': None, 'max_iter': 300, 'max_leaf_nodes': 70, 'random_state': 27}
MAPE: 0.2793
R2: 0.7515


### Modell 9: Hist Gradient Boosting Regression (Log-Transformiert)

In [None]:
def hgb_model(
        X_train, 
        y_train, 
        X_test, 
        y_test, 
        cv, 
        loss,
        learning_rate, 
        max_iter,
        max_leaf_nodes,
        max_depth, 
        l2_regularization, 
        random_state,
    ):
    
    hgb = HistGradientBoostingRegressor()

    param_grid = {
        'loss': loss,
        'learning_rate': learning_rate,
        'max_iter': max_iter,
        'max_leaf_nodes': max_leaf_nodes,
        'max_depth': max_depth,
        'l2_regularization': l2_regularization,
        'random_state': random_state
    }

    grid_hgb = GridSearchCV(hgb, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_percentage_error', verbose=1)

    grid_hgb.fit(X_train, np.log(y_train))

    best_params = grid_hgb.best_params_
    print('Best parameters: {}'.format(best_params))

    y_pred_hgb = grid_hgb.predict(X_test)
    y_pred_hgb = np.exp(y_pred_hgb)

    mape_hgb = round(mean_absolute_percentage_error(y_test, y_pred_hgb), 4)
    print('MAPE: {}'.format(mape_hgb))

    r2_hgb = round(r2_score(y_test, y_pred_hgb), 4)
    print('R2: {}'.format(r2_hgb))

    return grid_hgb, mape_hgb

hgb_regression_log, mape_hgb_log = hgb_model(
    X_train,
    y_train,
    X_test,
    y_test,
    cv=5,
    loss = ['gamma'],
    learning_rate = [0.1],
    max_iter = [300],
    max_leaf_nodes = [70],
    max_depth = [None],
    l2_regularization = [0.1],
    random_state = [27]
)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters: {'l2_regularization': 0.1, 'learning_rate': 0.1, 'loss': 'gamma', 'max_depth': None, 'max_iter': 300, 'max_leaf_nodes': 70, 'random_state': 27}
MAPE: 0.2225
R2: 0.79


In [None]:
hgb_regression_log, mape_hgb_log = hgb_model(
    X_train_with_na,
    y_train,
    X_test_with_na,
    y_test,
    cv=5,
    loss = ['gamma'],
    learning_rate = [0.1],
    max_iter = [300],
    max_leaf_nodes = [70],
    max_depth = [None],
    l2_regularization = [0.1],
    random_state = [20]
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters: {'l2_regularization': 0.1, 'learning_rate': 0.1, 'loss': 'gamma', 'max_depth': None, 'max_iter': 300, 'max_leaf_nodes': 70, 'random_state': 20}
MAPE: 0.2145
R2: 0.8335


### Modell 10: Gradient Boosting Regression

[Dokumentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)

In [None]:
def gb_model(
        X_train,
        y_train,
        X_test,
        y_test,
        cv,
        max_depth,
        learning_rate,
    ):

    gb = GradientBoostingRegressor()

    param_grid = {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'random_state': [42]
    }

    grid_gb = GridSearchCV(gb, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_percentage_error', verbose=1)

    grid_gb.fit(X_train, y_train)

    best_params = grid_gb.best_params_
    print('Best parameters: {}'.format(best_params))

    y_pred_gb = grid_gb.predict(X_test)

    mape_gb = round(mean_absolute_percentage_error(y_test, y_pred_gb), 4)
    print('MAPE: {}'.format(mape_gb))

    r2_gb = round(r2_score(y_test, y_pred_gb), 4)
    print('R2: {}'.format(r2_gb))

    return grid_gb, mape_gb

gb_regression, mape_gb = gb_model(
    X_train,
    y_train,
    X_test,
    y_test,
    cv=5,
    max_depth = [None, 3, 6, 9],
    learning_rate = [0.01, 0.1],
)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 9, 'random_state': 42}
MAPE: 0.2863
R2: 0.7561


### Modell 11: Gradient Boosting Regression (Log-Transformiert)

In [None]:
def gb_model(
        X_train,
        y_train,
        X_test,
        y_test,
        cv,
        max_depth,
        learning_rate
    ):

    gb = GradientBoostingRegressor()

    param_grid = {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'random_state': [42]
    }

    grid_gb = GridSearchCV(gb, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_percentage_error', verbose=1)

    grid_gb.fit(X_train, np.log(y_train))

    best_params = grid_gb.best_params_
    print('Best parameters: {}'.format(best_params))

    y_pred_gb = grid_gb.predict(X_test)
    y_pred_gb = np.exp(y_pred_gb)

    mape_gb = round(mean_absolute_percentage_error(y_test, y_pred_gb), 4)
    print('MAPE: {}'.format(mape_gb))

    r2_gb = round(r2_score(y_test, y_pred_gb), 4)
    print('R2: {}'.format(r2_gb))

    return grid_gb, mape_gb

gb_regression_log, mape_gb_log = gb_model(
    X_train,
    y_train,
    X_test,
    y_test,
    cv=5,
    max_depth = [None, 3, 6, 9],
    learning_rate = [0.01, 0.1]
)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




Best parameters: {'learning_rate': 0.1, 'max_depth': 9, 'random_state': 42}
MAPE: 0.2438
R2: 0.7498


## Kaggle Datensatz Predictions
### XGB

In [None]:
y_pred = xgb_regression.predict(X_test_kaggle)

predictions = pd.DataFrame({"ID": indexes, "Expected": y_pred})

predictions.to_csv('../../99_gespeicherte_modelle/predictions_xgb.csv', index=False)

In [None]:
y_pred = xgb_regression_log.predict(X_test_kaggle)

y_pred = np.exp(y_pred)

predictions = pd.DataFrame({"ID": indexes, "Expected": y_pred})

predictions.to_csv('../../99_gespeicherte_modelle/predictions_xgb_log.csv', index=False)

### Random Forest

In [None]:
y_pred = re_regression.predict(X_test_kaggle)

predictions = pd.DataFrame({"ID": indexes, "Expected": y_pred})

predictions.to_csv('../../99_gespeicherte_modelle/predictions_rf.csv', index=False)

In [None]:
y_pred = rf_regression_log.predict(X_test_kaggle)

y_pred = np.exp(y_pred)

predictions = pd.DataFrame({"ID": indexes, "Expected": y_pred})

predictions.to_csv('../../99_gespeicherte_modelle/predictions_rf_log.csv', index=False)

### Hist Gradient Boosting

In [None]:
y_pred = hgb_regression.predict(X_test_kaggle)

predictions = pd.DataFrame({"ID": indexes, "Expected": y_pred})

predictions.to_csv('../../99_gespeicherte_modelle/predictions_hgb.csv', index=False)

In [None]:
y_pred = hgb_regression_log.predict(X_test_kaggle)

y_pred = np.exp(y_pred)

predictions = pd.DataFrame({"ID": indexes, "Expected": y_pred})

predictions.to_csv('../../99_gespeicherte_modelle/predictions_hgb_log.csv', index=False)

### Gradient Boosting

In [None]:
y_pred = gb_regression.predict(X_test_kaggle)

predictions = pd.DataFrame({"ID": indexes, "Expected": y_pred})

predictions.to_csv('../../99_gespeicherte_modelle/predictions_gb.csv', index=False)

In [None]:
y_pred = gb_regression_log.predict(X_test_kaggle)

y_pred = np.exp(y_pred)

predictions = pd.DataFrame({"ID": indexes, "Expected": y_pred})

predictions.to_csv('../../99_gespeicherte_modelle/predictions_gb_log.csv', index=False)

  y_pred = np.exp(y_pred)
