In [3]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
np.set_printoptions(legacy="1.25")
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy

from sklearn import set_config
set_config(transform_output='pandas')

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score, 
                             mean_absolute_percentage_error, root_mean_squared_error)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PolynomialFeatures, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor, plot_tree

import category_encoders as ce
from category_encoders.hashing import HashingEncoder
from category_encoders.ordinal import OrdinalEncoder

from xgboost import XGBRegressor

import joblib

In [None]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_proc, y_train)
depths = range(1, dt.get_depth()+1)
depth_df = pd.DataFrame(index=depths, columns=["Training log RMSE", "Test log RMSE", "Distance from training"])

for depth in depths:
    # try max depth in a decision tree
    dt_temp = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dt_temp.fit(X_train_proc, y_train)
    train_pred = dt_temp.predict(X_train_proc)
    trainRMSE = root_mean_squared_error(np.log1p(np.clip(y_train, a_min=0, a_max=np.max(y_train))), np.log1p(np.clip(train_pred, a_min=0, a_max=np.max(train_pred))))
    test_pred = dt_temp.predict(X_test_proc)
    testRMSE = root_mean_squared_error(np.log1p(np.clip(y_test, a_min=0, a_max=np.max(y_test))), np.log1p(np.clip(test_pred, a_min=0, a_max=np.max(test_pred))))
    depth_df.loc[depth, "Training log RMSE"] = trainRMSE
    depth_df.loc[depth, "Test log RMSE"] = testRMSE
    depth_df.loc[depth, "Distance from training"] = np.abs(trainRMSE - testRMSE)

depth_df.sort_values(by=["Test log RMSE"])

In [3]:
Lasso().get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [4]:
Ridge().get_params()


{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.0001}

In [5]:
ElasticNet().get_params()


{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'l1_ratio': 0.5,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [5]:
GridSearchCV().get_params()

TypeError: GridSearchCV.__init__() missing 2 required positional arguments: 'estimator' and 'param_grid'

In [None]:
# computing the stack features
s_train, s_test = stacking(list_of_best_estimators, X_train_proc, X_test_proc, y_train, regression=True, n_folds=4)
 
# initializing the second-level model
final_model = LinearRegression()
 
# fitting the second level model with stack features
final_model = final_model.fit(s_train, y_train)
 
# predicting the final output using stacking
pred_final = final_model.predict(s_test)
pred_final

In [None]:
# Initialize 1st level estimators
estimators = [('lr', LinearRegression()),
              ('ridge', Ridge(random_state=True))]

# Initialize StackingTransformer
stack = StackingTransformer(estimators, regression=True, verbose=2)

# Fit
stack = stack.fit(X_train_proc, y_train)

# Get your stacked features
S_train = stack.transform(X_train_proc)
S_test = stack.transform(X_test_proc)

# Use 2nd level estimator with stacked features
# initializing the second-level model
final_model = LinearRegression()
 
# fitting the second level model with stack features
final_model = final_model.fit(S_train, y_train)

# predicting the final output using stacking
pred_final = final_model.predict(S_test)
pred_final

task:         [regression]
metric:       [mean_absolute_error]
variant:      [A]
n_estimators: [2]





ValueError: Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.

In [None]:
def model_stacker(models_list, X_train, X_test, y_train):
    # df_train = X_train.copy()
    df_test = X_test.copy()
    for model in models_list:
        model.fit(X_train, y_train)
        # train_pred = pd.DataFrame(model.predict(X_train))
        test_pred = pd.DataFrame(model.predict(X_test))
        # df_train = pd.concat([df_train, train_pred], axis=1)
        df_test = pd.concat([df_test, test_pred], axis=1)
    return df_test

In [None]:
df_test = model_stacker(list_of_best_estimators, X_train_proc, X_test_proc, y_train)

In [None]:
stacked_dfs = list(model_stacker(list_of_best_estimators, X_train_proc, X_test_proc, y_train))
df_train = stacked_dfs[0]
df_test = stacked_dfs[1]

df_test

Unnamed: 0,MiscVal,YrSold,PoolArea,GarageCars,GarageArea,GarageYrBlt,Fireplaces,TotRmsAbvGrd,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,MasVnrArea,LotFrontage,LotArea,YearBuilt,YearRemodAdd,OverallQual,OverallCond,MoSold_1,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,GarageFinish_Fin,GarageFinish_Missing,GarageFinish_RFn,GarageFinish_Unf,CentralAir_Y,Street_Pave,Alley_Grvl,Alley_Missing,Alley_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,Porch_sum,col_0,col_1,col_2,col_3,col_4,col_5,Fence,PavedDrive,Functional,Electrical,BsmtExposure,HeatingQC,KitchenQual,GarageCond,GarageQual,FireplaceQu,BsmtQual,BsmtCond,ExterCond,ExterQual,PoolQC,0,0.1,0.2,0.3,0.4
892,0.0,2006.0,0.0,1.0,264.0,1963.0,0.0,6.0,1068.0,0.0,0.0,1068.0,0.0,1.0,1.0,0.0,3.0,1.0,663.0,0.0,396.0,1059.0,0.0,70.0,8414.0,1963.0,2003.0,6.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,192.0,5.0,2.0,3.0,2.0,4.0,3.0,3.0,3.0,0.0,2.0,1.0,3.0,3.0,3.0,3.0,0.0,3.0,3.0,3.0,3.0,0.0,,,,,
1105,0.0,2010.0,0.0,2.0,712.0,1994.0,2.0,9.0,1500.0,1122.0,0.0,2622.0,1.0,0.0,2.0,1.0,3.0,1.0,1032.0,0.0,431.0,1463.0,362.0,98.0,12256.0,1994.0,1995.0,8.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,218.0,3.0,3.0,2.0,3.0,3.0,5.0,0.0,3.0,0.0,2.0,3.0,5.0,4.0,3.0,3.0,3.0,5.0,3.0,3.0,4.0,0.0,,,,,
413,0.0,2010.0,0.0,2.0,360.0,1927.0,1.0,5.0,1028.0,0.0,0.0,1028.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,1008.0,1008.0,0.0,56.0,8960.0,1927.0,1950.0,5.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,130.0,6.0,0.0,3.0,4.0,1.0,5.0,0.0,3.0,0.0,1.0,1.0,4.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,0.0,,,,,
522,0.0,2006.0,0.0,2.0,420.0,1950.0,2.0,7.0,1004.0,660.0,0.0,1664.0,0.0,0.0,2.0,0.0,3.0,1.0,399.0,0.0,605.0,1004.0,0.0,50.0,5000.0,1947.0,1950.0,6.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,60.0,5.0,1.0,4.0,2.0,2.0,5.0,0.0,3.0,0.0,2.0,1.0,5.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,0.0,,,,,
1036,0.0,2009.0,0.0,3.0,912.0,2008.0,1.0,6.0,1620.0,0.0,0.0,1620.0,1.0,0.0,2.0,0.0,2.0,1.0,1022.0,0.0,598.0,1620.0,70.0,89.0,12898.0,2007.0,2008.0,9.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,228.0,4.0,2.0,4.0,2.0,2.0,5.0,0.0,3.0,0.0,2.0,4.0,5.0,5.0,3.0,3.0,5.0,5.0,3.0,3.0,4.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,233829.900000,216225.135369,215904.931530,214989.672734,215105.328218
287,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,115760.000000,101306.017543,103595.420204,115586.920572,113310.769146
288,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,253400.000000,263268.343000,260794.198490,256124.249413,257993.672516
290,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,149308.333333,99309.503071,101810.376649,106585.652553,104840.533323


In [None]:
final_model = LinearRegression()
final_model.fit(stacked_dfs[0], y_train)

In [None]:
final_model = LinearRegression()
final_model.fit(stacked_dfs[0], y_train)
final_pred = final_model.predict(stacked_dfs[1])

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.