In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [65]:
# Name of the column the models are asked to predict.
target_column = 'price actual'
# Load the feature table; at this point it still contains the target column.
selected_features = pd.read_csv('data/selected_features.csv')
# Pull the target out as its own Series before it is removed from the features.
y = selected_features.loc[:, target_column]

In [66]:
# Remove the target column and its ' ewm' companion from the feature table so
# that only predictors remain (presumably the ' ewm' column is derived from the
# target and would leak it -- confirm against the feature-selection step).
#
# Only columns that are actually present are dropped, which makes the cell safe
# to re-run and avoids a bare `except:` that would hide unrelated errors.
columns_to_drop = [target_column, target_column + ' ewm']
columns_present = [col for col in columns_to_drop if col in selected_features.columns]

if columns_present:
    # Non-mutating drop: rebinding keeps the cell idempotent on re-execution.
    selected_features = selected_features.drop(columns=columns_present)
else:
    print('Already dropped')

In [67]:
# Make predictions on n folds of the train dataset and on the test dataset. This function returns the predictions for train and test for each model.
# def Stacking1(model, train, y, test, n_fold):
#     folds=StratifiedKFold(n_splits=n_fold)
#     test_pred=np.empty((test.shape[0],1),float)
#     train_pred=np.empty((0,1),float)
#     for train_indices, val_indices in folds.split(train, y.values):
#         x_train, x_val = train.iloc[train_indices],train.iloc[val_indices]
#         y_train, y_val = y.iloc[train_indices],y.iloc[val_indices]
        
#         model.fit(X=x_train,y=y_train)
#         train_pred=np.append(train_pred,model.predict(x_val))
#         test_pred=np.append(test_pred,model.predict(test))
#     return test_pred.reshape(-1,1),train_pred

def Stacking(model, train, y, test, n_fold):
    """Produce stacking features for one base model via K-fold cross-validation.

    For every fold the model is fitted on the remaining folds, then used to
    predict (a) the held-out rows of ``train`` and (b) the full ``test`` set.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X=..., y=...)`` and ``predict(X)``. It is refitted
        in place on every fold, so after the call it holds the last fold's fit.
    train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Training target, positionally aligned with ``train``.
    test : pandas.DataFrame
        Features of the hold-out set; predicted once per fold.
    n_fold : int
        Number of folds passed to ``KFold(n_splits=...)``.

    Returns
    -------
    test_pred : numpy.ndarray, shape (n_test, 1)
        Test-set predictions averaged over all folds.
    train_pred : numpy.ndarray, 1-D float
        Held-out predictions concatenated in fold order.

    Raises
    ------
    ValueError
        If a fold's test prediction does not contain exactly one value per
        test row (e.g. a multi-output model).
    """
    # KFold is used with its defaults (no shuffling); presumably the validation
    # folds are therefore consecutive slices, so `train_pred` lines up
    # row-for-row with `train` -- confirm against the KFold documentation.
    folds = KFold(n_splits=n_fold)
    n_test = test.shape[0]

    # Collect per-fold results in lists and combine once at the end instead of
    # reallocating the result arrays on every iteration.
    held_out_chunks = []
    test_rows = []

    for train_indices, val_indices in folds.split(train):
        x_train, x_val = train.iloc[train_indices], train.iloc[val_indices]
        y_train = y.iloc[train_indices]

        model.fit(X=x_train, y=y_train)

        # Flatten so that models returning (n, 1) column vectors are handled
        # the same way as models returning 1-D predictions.
        held_out_chunks.append(np.ravel(model.predict(x_val)))

        fold_test_pred = np.ravel(model.predict(test))
        if fold_test_pred.shape[0] != n_test:
            raise ValueError(
                "model.predict(test) returned %d values for %d test rows"
                % (fold_test_pred.shape[0], n_test)
            )
        test_rows.append(fold_test_pred)

    train_pred = np.concatenate(held_out_chunks).astype(float, copy=False)

    # Averaging the test set predictions across folds
    test_pred = np.asarray(test_rows, dtype=float).mean(axis=0).reshape(-1, 1)

    return test_pred, train_pred


In [68]:
# Feature matrix: every column that is still present in `selected_features`.
x = selected_features[selected_features.columns]
# Hold out 20% of the rows for the final evaluation; the seed is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2
)

# Base model 1: linear regression on polynomial features of the given degree.
degree = 4
model1 = make_pipeline(PolynomialFeatures(degree), LinearRegression())
test_pred1, train_pred1 = Stacking(model1, x_train, y_train, x_test, 10)
train_pred1, test_pred1 = pd.DataFrame(train_pred1), pd.DataFrame(test_pred1)

# Base model 2: plain linear regression on the raw features.
model2 = LinearRegression()
test_pred2, train_pred2 = Stacking(model2, x_train, y_train, x_test, 10)
train_pred2, test_pred2 = pd.DataFrame(train_pred2), pd.DataFrame(test_pred2)

# Meta-features: one column of predictions per base model, side by side.
df = pd.concat([train_pred1, train_pred2], axis='columns')
df_test = pd.concat([test_pred1, test_pred2], axis='columns')

# Meta-model: fitted on the base models' training predictions, then applied to
# their test-set predictions.
meta_model = make_pipeline(PolynomialFeatures(4), LinearRegression())
y_pred = meta_model.fit(df, y_train).predict(df_test)

# Score the stacked predictions against the held-out target.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 87.55458118670104
