In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from bdint.models.nn import NN

import torch

def k_fold_validation(train_df, model, k=5):
    target = train_df[["SalePrice"]]
    features = train_df.drop(columns=["SalePrice"])
    target.columns = ["SalePrice"]

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    rmse_values = []

    for train_index, test_index in kf.split(features):
        x_train, x_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        
        # reset model
        model.reset()

        # Learn the given model
        model.learn(x_train, y_train, x_test, y_test, epochs=500, learning_rate=0.5, weight_decay=0.1)

        # Calculate RMSE
        predictions = model.predict(x_test)
        if isinstance(predictions, torch.Tensor):
            predictions = predictions.detach().numpy()
        rmse_value = mean_squared_error(y_test["SalePrice"], predictions, squared=False)
        rmse_values.append(rmse_value)

    # Calculate the mean RMSE
    mean_rmse = sum(rmse_values) / len(rmse_values)

    return mean_rmse

import pandas as pd

from bdint.data import (
    get_test_df,
    get_train_df,
    make_kaggle_submission_file,
)
from models import CatBoost, RandomForest

train_df = get_train_df("../data/train.csv")
test_df = get_test_df("../data/test.csv")

print("Train Set Size:", len(train_df))
print("Test Set Size:", len(test_df))

# create Model
# model = RandomForest(n_estimators=100, random_state=42)
# set jobtype to cpu
# model = CatBoost(early_stopping_rounds=2000, iterations=10000)
# model = CatBoost(early_stopping_rounds=2000, iterations=15000)
model = NN(hidden_size1=1028, hidden_size2=512, hidden_size3=256)

# numerical
# train_df = preprocess_for_numerical_model(train_df)
# test_df = preprocess_for_numerical_model(test_df)

# Check Performance of model using k validation
rmses = k_fold_validation(train_df=train_df, model=model, k=5)

Train Set Size: 1460
Test Set Size: 1459
Epoch: 0, Loss: 197194.265625
Epoch: 1, Loss: 197158.609375
Epoch: 2, Loss: 197080.71875
Epoch: 3, Loss: 196936.3125
Epoch: 4, Loss: 196692.765625
Epoch: 5, Loss: 196324.015625
Epoch: 6, Loss: 195833.09375
Epoch: 7, Loss: 195280.453125
Epoch: 8, Loss: 194820.03125
Epoch: 9, Loss: 193524.125
Epoch: 10, Loss: 192365.515625
Epoch: 11, Loss: 191081.796875
Epoch: 12, Loss: 189609.203125
Epoch: 13, Loss: 187980.265625
Epoch: 14, Loss: 186273.15625
Epoch: 15, Loss: 184452.703125
Epoch: 16, Loss: 181915.203125
Epoch: 17, Loss: 179306.296875
Epoch: 18, Loss: 176525.484375
Epoch: 19, Loss: 173760.328125
Epoch: 20, Loss: 171096.40625
Epoch: 21, Loss: 166950.796875
Epoch: 22, Loss: 163309.796875
Epoch: 23, Loss: 159102.265625
Epoch: 24, Loss: 155092.296875
Epoch: 25, Loss: 150826.59375
Epoch: 26, Loss: 145669.25
Epoch: 27, Loss: 140490.96875
Epoch: 28, Loss: 135612.046875
Epoch: 29, Loss: 130374.828125
Epoch: 30, Loss: 123725.046875
Epoch: 31, Loss: 118007.

KeyboardInterrupt: 

In [17]:
print("Mean RMSE:", rmses)

Mean RMSE: 47087.4132771151


In [3]:
model.reset()
model.learn(
    x_train_df=train_df.drop(columns=["SalePrice"], inplace=False),
    y_train_df=pd.DataFrame(train_df["SalePrice"]),
    epochs=500, learning_rate=0.5, weight_decay=0.1
)
predictions = model.predict(test_df).detach().numpy()
make_kaggle_submission_file(predictions, test_df)

Epoch: 0, Loss: 197583.671875
Epoch: 1, Loss: 197549.953125
Epoch: 2, Loss: 197478.875
Epoch: 3, Loss: 197345.875
Epoch: 4, Loss: 197121.359375
Epoch: 5, Loss: 196783.03125
Epoch: 6, Loss: 196299.4375
Epoch: 7, Loss: 195776.171875
Epoch: 8, Loss: 195187.671875
Epoch: 9, Loss: 194044.6875
Epoch: 10, Loss: 192963.15625
Epoch: 11, Loss: 191703.984375
Epoch: 12, Loss: 190265.484375
Epoch: 13, Loss: 188683.140625
Epoch: 14, Loss: 187345.359375
Epoch: 15, Loss: 184995.6875
Epoch: 16, Loss: 182503.578125
Epoch: 17, Loss: 180126.390625
Epoch: 18, Loss: 177617.953125
Epoch: 19, Loss: 174410.828125
Epoch: 20, Loss: 171262.03125
Epoch: 21, Loss: 167680.9375
Epoch: 22, Loss: 164376.359375
Epoch: 23, Loss: 160079.640625
Epoch: 24, Loss: 155643.28125
Epoch: 25, Loss: 151103.046875
Epoch: 26, Loss: 146280.125
Epoch: 27, Loss: 142783.25
Epoch: 28, Loss: 138014.046875
Epoch: 29, Loss: 131710.265625
Epoch: 30, Loss: 125203.4453125
Epoch: 31, Loss: 119981.71875
Epoch: 32, Loss: 113421.7734375
Epoch: 33, 

In [6]:
# ensemble of catboost and nn
catboost = CatBoost(early_stopping_rounds=2000, iterations=15000, rsm=0.1)
catboost.learn(
    x_train_df=train_df.drop(columns=["SalePrice"], inplace=False),
    y_train_df=pd.DataFrame(train_df["SalePrice"])
)
predictions_catboost = catboost.predict(test_df)
nn = NN(hidden_size1=1028, hidden_size2=512, hidden_size3=256)
nn.learn(
    x_train_df=train_df.drop(columns=["SalePrice"], inplace=False),
    y_train_df=pd.DataFrame(train_df["SalePrice"]),
    epochs=500, learning_rate=0.5, weight_decay=0.1
)
predictions_nn = nn.predict(test_df).detach().numpy()
predictions = (predictions_catboost + predictions_nn) / 2
make_kaggle_submission_file(predictions, test_df)

Learning rate set to 0.004808
0:	learn: 79169.9295586	total: 23.3ms	remaining: 5m 48s
250:	learn: 43667.4253451	total: 1.72s	remaining: 1m 40s
500:	learn: 30874.6197873	total: 3.24s	remaining: 1m 33s
750:	learn: 25404.2889562	total: 4.63s	remaining: 1m 27s
1000:	learn: 22629.8463694	total: 6.08s	remaining: 1m 25s
1250:	learn: 20915.7963649	total: 7.47s	remaining: 1m 22s
1500:	learn: 19663.4328627	total: 8.73s	remaining: 1m 18s
1750:	learn: 18631.7390518	total: 10.3s	remaining: 1m 17s
2000:	learn: 17709.1985461	total: 11.7s	remaining: 1m 15s
2250:	learn: 16941.6500068	total: 13.1s	remaining: 1m 14s
2500:	learn: 16294.4989642	total: 14.5s	remaining: 1m 12s
2750:	learn: 15714.6912783	total: 15.9s	remaining: 1m 10s
3000:	learn: 15222.0871982	total: 17.3s	remaining: 1m 9s
3250:	learn: 14794.4746812	total: 18.7s	remaining: 1m 7s
3500:	learn: 14398.6278099	total: 20.1s	remaining: 1m 5s
3750:	learn: 14042.9421487	total: 21.4s	remaining: 1m 4s
4000:	learn: 13682.3142411	total: 22.7s	remaining: 

AttributeError: 'numpy.ndarray' object has no attribute 'detach'

In [17]:
# sum to arrays of length i into an array of length i
predictions = [float(predictions_catboost[i] + predictions_nn[i])/2 for i in range(len(predictions_catboost))]
make_kaggle_submission_file(predictions_nn, test_df)

  predictions = [float(predictions_catboost[i] + predictions_nn[i])/2 for i in range(len(predictions_catboost))]


In [7]:
predictions_catboost = catboost.predict(test_df)
nn = NN(hidden_size1=1028, hidden_size2=512, hidden_size3=256)
nn.learn(
    x_train_df=train_df.drop(columns=["SalePrice"], inplace=False),
    y_train_df=pd.DataFrame(train_df["SalePrice"]),
    epochs=500, learning_rate=0.5, weight_decay=0.1
)
predictions_nn = nn.predict(test_df).detach().numpy()
predictions = (predictions_catboost + predictions_nn) / 2
make_kaggle_submission_file(predictions, test_df)

Epoch: 0, Loss: 197583.890625
Epoch: 1, Loss: 197553.875
Epoch: 2, Loss: 197490.203125
Epoch: 3, Loss: 197366.1875
Epoch: 4, Loss: 197149.265625
Epoch: 5, Loss: 196810.78125
Epoch: 6, Loss: 196338.859375
Epoch: 7, Loss: 195850.859375
Epoch: 8, Loss: 195306.625
Epoch: 9, Loss: 194110.6875
Epoch: 10, Loss: 193028.25
Epoch: 11, Loss: 191781.40625
Epoch: 12, Loss: 190373.140625
Epoch: 13, Loss: 188809.625
Epoch: 14, Loss: 187000.734375
Epoch: 15, Loss: 184946.015625
Epoch: 16, Loss: 182643.671875
Epoch: 17, Loss: 180202.328125
Epoch: 18, Loss: 177622.90625
Epoch: 19, Loss: 174428.421875
Epoch: 20, Loss: 171282.1875
Epoch: 21, Loss: 168285.453125
Epoch: 22, Loss: 164239.921875
Epoch: 23, Loss: 160203.859375
Epoch: 24, Loss: 156082.6875
Epoch: 25, Loss: 151425.0
Epoch: 26, Loss: 146608.609375
Epoch: 27, Loss: 142081.109375
Epoch: 28, Loss: 136591.046875
Epoch: 29, Loss: 131014.1484375
Epoch: 30, Loss: 125506.7265625
Epoch: 31, Loss: 120684.796875
Epoch: 32, Loss: 113829.2421875
Epoch: 33, Lo

ValueError: Expected a 1D array, got an array with shape (1459, 1459)