# Create Custom Function for ensembling with stacking and blending

#### We will use already-processed data to predict the interest rate by stacking and blending.
#### In this example we divide the data into 20 sets; the number of sets ( N ) can be any value.
#### 1st Layer Models : Linear Regression, Decision Tree, KNN, CatBoost ( these can be any regressors; you only need to import them )
#### 2nd Layer Model : Linear Regression ( again, this can be any regressor )
#### To set hyperparameters, pass them directly when constructing each model in the model list
#### Finally we will wrap the entire process in a single function

In [40]:
import pandas as pd
import numpy as np
# Read the four pre-processed CSV files: feature frames (X_*) and target
# frames (y_*) for the two splits used in the rest of the notebook.
x_train1, x_train2, y_train1, y_train2 = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/regression-datasets/X_train1_reg.csv',
        '../input/regression-datasets/X_train2_reg.csv',
        '../input/regression-datasets/y_train1_reg.csv',
        '../input/regression-datasets/y_train2_reg.csv',
    )
]

# Show the (rows, columns) shape of every frame as a single tuple.
tuple(frame.shape for frame in (x_train1, x_train2, y_train1, y_train2))

((1759, 29), (440, 29), (1759, 1), (440, 1))

In [41]:
# Preview the first rows of the feature frame that the models are fitted on.
x_train1.head()

Unnamed: 0,ID,Amount.Requested,Debt.To.Income.Ratio,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,ll_36,lp_10,lp_11,...,fico,Employment.Length_7years,Employment.Length_6years,Employment.Length_1year,Employment.Length_4years,Employment.Length_5years,Employment.Length_3years,Employment.Length_2years,Employment.Length_LT_1year,Employment.Length_10years
0,34331,3300.0,4.16,3730.0,7.0,3675.0,1.0,1,0.0,1,...,702,0,0,1,0,0,0,0,0,0
1,92046,32000.0,15.61,9250.0,15.0,32566.0,1.0,0,0.0,0,...,702,0,0,0,0,0,0,1,0,0
2,8382,6250.0,18.04,2400.0,5.0,14863.0,0.0,1,0.0,0,...,722,0,0,0,0,0,0,0,0,1
3,47097,7200.0,12.13,2333.33,7.0,6262.0,1.0,1,0.0,0,...,727,0,0,0,0,0,0,1,0,0
4,62900,6000.0,5.65,7083.33,15.0,5747.0,0.0,1,0.0,0,...,752,0,0,0,1,0,0,0,0,0


In [42]:
# Preview the first rows of the target frame that the models are fitted on.
y_train1.head()

Unnamed: 0,Interest.Rate
0,11.71
1,17.27
2,10.25
3,7.9
4,6.03


# First Check the individual performance

## Linear Regression

In [4]:
# Baseline: fit a plain linear regression on the first split and score it on
# the second split, so the stacked model built later has a reference RMSE.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lm=LinearRegression()
lm.fit(x_train1,y_train1)
test_pred=lm.predict(x_train2)
# RMSE is the square root of the mean squared error.
E = mean_squared_error(test_pred,y_train2)**0.5
print("The RMSE of Linear Regression is :",E)

The RMSE of Linear Regression is : 2.0711465717862696


## Decision Tree

In [5]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Baseline: decision tree with default settings, fitted on the first split
# and scored (RMSE) on the second split.
reg = DecisionTreeRegressor()
reg.fit(x_train1, y_train1)
test_pred = reg.predict(x_train2)
E = pow(mean_squared_error(test_pred, y_train2), 0.5)
print("The RMSE of DecisionTreeRegressor is :", E)

The RMSE of DecisionTreeRegressor is : 2.2680158028149147


## KNN

In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

# Baseline: k-nearest-neighbours regressor with default settings, fitted on
# the first split and scored (RMSE) on the second split.
reg = KNeighborsRegressor()
reg.fit(x_train1, y_train1)
test_pred = reg.predict(x_train2)
E = pow(mean_squared_error(test_pred, y_train2), 0.5)
print("The RMSE of KNeighborsRegressor is :", E)

The RMSE of KNeighborsRegressor is : 4.032259878439475


## CatBoost

In [7]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Baseline: CatBoost limited to 200 iterations with logging silenced,
# fitted on the first split and scored (RMSE) on the second split.
reg = CatBoostRegressor(
    logging_level='Silent',
    iterations=200,
)
reg.fit(x_train1, y_train1)
test_pred = reg.predict(x_train2)
E = pow(mean_squared_error(test_pred, y_train2), 0.5)
print("The RMSE of CatBoostRegressor is :", E)

The RMSE of CatBoostRegressor is : 1.3616745217649515


## Step 1 : Divide the dataset into N parts ( here we use 20 parts )
* Leaving one chunk out at a time gives N training datasets
* The chunk that was left out each time is the corresponding test dataset

In [45]:
def get_dataset(x_train,y_train,N=5) :
    """Shuffle the data and split it into N leave-one-chunk-out folds.

    The features and target are paired row by row (by position), shuffled
    with a fixed seed, and cut into N consecutive chunks. The first N-1
    chunks have ``len(x_train) // N`` rows; the last chunk also takes the
    remainder.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature rows.
    y_train : pandas.DataFrame or pandas.Series
        Target values, one row per row of ``x_train``. Only a single target
        column is supported: the last column of the combined frame is
        treated as the target.
    N : int, default 5
        Number of chunks / folds.

    Returns
    -------
    tuple
        ``(train_data, y_data, test_data, y_shuffled)`` where the first three
        are lists of length N. For fold ``i``, ``train_data[i]`` and
        ``y_data[i]`` hold every shuffled row except chunk ``i``, and
        ``test_data[i]`` holds the features of chunk ``i``. ``y_shuffled`` is
        the full shuffled target frame; its row order matches the
        concatenation of ``test_data[0] ... test_data[N-1]``.

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` differ in length, or if ``N`` is not
        between 1 and the number of rows.
    """
    n_rows = len(x_train)
    if n_rows != len(y_train):
        raise ValueError("x_train and y_train must have the same number of rows")
    if N < 1 or N > n_rows:
        raise ValueError("N must be between 1 and the number of rows in x_train")

    # Pair features and target by position. Without resetting the indexes,
    # concat(axis=1) aligns on index labels and silently produces NaN rows
    # whenever the two inputs are indexed differently.
    merge = pd.concat(
        [x_train.reset_index(drop=True), y_train.reset_index(drop=True)],
        axis=1,
    )
    # Fixed seed so the folds are reproducible between runs.
    merge = merge.sample(frac=1, random_state=1).reset_index(drop=True)
    y_shuffled = merge.iloc[:, -1:]
    x_shuffled = merge.iloc[:, :-1]

    # Chunk boundaries: N-1 chunks of equal size, the last one takes the rest.
    chunk_size = n_rows // N
    bounds = [chunk_size * i for i in range(N)] + [n_rows]

    train_data = []
    y_data = []
    test_data = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        # Everything outside [lo, hi) is training data for this fold. Slicing
        # keeps this linear instead of testing every row against a list.
        train_data.append(pd.concat([x_shuffled.iloc[:lo], x_shuffled.iloc[hi:]]))
        y_data.append(pd.concat([y_shuffled.iloc[:lo], y_shuffled.iloc[hi:]]))
        test_data.append(x_shuffled.iloc[lo:hi, :])

    return(train_data,y_data,test_data,y_shuffled)

# Build the 20 leave-one-chunk-out folds and name the four pieces returned
# by get_dataset.
datasets = get_dataset(x_train1, y_train1, N=20)
train_data, y_data, test_data, final_y = datasets

In [35]:
# Show only the shuffled target frame. Displaying `datasets` itself would
# dump every fold (three lists of DataFrames) into the cell output.
final_y

Unnamed: 0,Interest.Rate
0,15.80
1,8.90
2,7.29
3,21.49
4,7.90
...,...
1754,14.09
1755,16.32
1756,6.91
1757,17.49


## Step 2 : Define the first layer models and assign a code for each model
### We can assign hyperparameters too inside the model

In [51]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor, Pool

# One (code, estimator) pair per first-layer model. The code becomes the
# column name of that model's predictions; hyperparameters go directly into
# the estimator constructor.
code, models = (list(column) for column in zip(
    ('lin_reg', LinearRegression()),
    ('dtree_reg', DecisionTreeRegressor()),
    ('Knn_reg', KNeighborsRegressor()),
    ('cat_reg', CatBoostRegressor(logging_level ='Silent')),
))

## Step 3 : Prediction function for all the models together

In [52]:
def stack(x_train, y_train , x_test , models,code):
    """Fit every first-layer model and collect its predictions as columns.

    Parameters
    ----------
    x_train, y_train
        Data each model is fitted on (passed unchanged to ``model.fit``).
    x_test
        Rows to predict (passed unchanged to ``model.predict``).
    models : sequence
        Estimator objects exposing ``fit`` and ``predict``. They are fitted
        in place, so after the call each one holds the fit on ``x_train``.
    code : sequence of str
        Column name for each model's predictions, in the same order as
        ``models``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``x_test`` and one column per entry of ``code``.

    Raises
    ------
    ValueError
        If ``models`` and ``code`` differ in length.
    """
    if len(models) != len(code):
        raise ValueError("models and code must have the same length")

    predictions = {}
    for name, reg in zip(code, models):
        reg.fit(x_train, y_train)
        # Some estimators return an (n, 1) array when fitted on a one-column
        # DataFrame target; ravel gives a flat vector in either case.
        predictions[name] = np.ravel(reg.predict(x_test))

    return pd.DataFrame(predictions)

## Step 4 : Predict for each of the chunk to get the final Data Frame

In [55]:
# Out-of-fold predictions: for every fold, fit the first-layer models on the
# training part and predict the held-out chunk. The per-fold frames are
# concatenated once, which avoids re-copying a growing frame on every
# iteration and avoids seeding the result with an empty object-dtype frame.
# Each fold keeps its own 0-based row index, as before.
final_df = pd.concat(
    [stack(train_data[i], y_data[i], test_data[i], models, code)
     for i in range(len(train_data))]
)

# First-layer predictions for the hold-out split, from models refitted on
# the whole first split.
final_test = stack(x_train1,y_train1,x_train2,models,code)

final_df

Unnamed: 0,lin_reg,dtree_reg,Knn_reg,cat_reg
0,14.840151,15.31,14.526,15.305565
1,11.170331,9.76,15.394,9.971487
2,9.163785,7.49,10.102,8.118306
3,18.402849,17.14,18.988,20.255904
4,4.412023,8.90,12.920,8.669037
...,...,...,...,...
101,12.478576,11.71,13.146,13.334835
102,18.319170,16.45,12.348,16.798192
103,10.443720,7.90,14.166,8.849920
104,15.536780,13.06,15.010,15.817681


In [56]:
# First-layer predictions for x_train2, one column per model code.
final_test

Unnamed: 0,lin_reg,dtree_reg,Knn_reg,cat_reg
0,16.839227,14.33,16.752,16.952163
1,15.753904,15.65,13.078,13.588350
2,19.738247,21.98,13.992,22.503461
3,10.276658,11.14,10.054,10.715493
4,12.645331,9.63,12.898,12.011820
...,...,...,...,...
435,12.095842,8.88,17.798,11.026539
436,14.905575,15.96,10.714,15.547650
437,14.827977,12.92,12.312,15.549109
438,6.944630,7.14,10.512,7.081406


## Step 5 : Build the second Layer Model

In [57]:
# Second layer: learn how to combine the first-layer predictions, then score
# the combined prediction on the hold-out split (RMSE).
reg2 = LinearRegression()
reg2.fit(X=final_df, y=final_y)
test_pred = reg2.predict(final_test)
pow(mean_squared_error(test_pred, y_train2), 0.5)

1.3357571609087031

# RMSE

# Linear Regression : 2.07
# Decision Tree : 2.27
# KNN : 4.03
# CatBoost : 1.36

# Stacked Model: 1.34

# Now Create the function

In [58]:
def stackblend_reg(x_train,y_train,x_test,models,code,N=20,final_layer=None):
    """Predict ``x_test`` with a two-layer stacked ensemble.

    Steps:

    1. Pair ``x_train`` and ``y_train`` row by row, shuffle them with a fixed
       seed and cut them into ``N`` consecutive chunks (the last chunk takes
       the remainder).
    2. For each chunk, fit every first-layer model on the other chunks and
       predict the held-out chunk. Stacking these gives one out-of-fold
       prediction per training row and model.
    3. Refit every first-layer model on all of ``x_train`` and predict
       ``x_test``.
    4. Fit ``final_layer`` on the out-of-fold predictions against the
       shuffled target, and use it to combine the predictions from step 3.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.DataFrame or pandas.Series
        Training target, one row per row of ``x_train`` (single column).
    x_test : pandas.DataFrame
        Rows to predict.
    models : sequence
        First-layer estimators exposing ``fit`` and ``predict``. They are
        fitted in place; on return they hold the fit on the full training
        data.
    code : sequence of str
        Column name for each model's predictions, same order as ``models``.
    N : int, default 20
        Number of folds.
    final_layer : estimator, optional
        Second-layer estimator exposing ``fit`` and ``predict``; it is fitted
        in place. When omitted, a new ``LinearRegression`` is created for
        this call, so separate calls never share a fitted estimator.

    Returns
    -------
    Whatever ``final_layer.predict`` returns for the ``x_test`` rows.

    Raises
    ------
    ValueError
        If ``models`` and ``code`` differ in length, if ``x_train`` and
        ``y_train`` differ in length, or if ``N`` is not between 2 and the
        number of training rows.
    """
    if len(models) != len(code):
        raise ValueError("models and code must have the same length")
    n_rows = len(x_train)
    if n_rows != len(y_train):
        raise ValueError("x_train and y_train must have the same number of rows")
    if N < 2 or N > n_rows:
        raise ValueError("N must be between 2 and the number of rows in x_train")
    if final_layer is None:
        # Created per call: an estimator built in the signature would be a
        # single shared object that every call refits.
        from sklearn.linear_model import LinearRegression
        final_layer = LinearRegression()

    def first_layer_predictions(x_fit, y_fit, x_pred):
        """Fit each first-layer model and return its predictions as columns."""
        columns = {}
        for name, reg in zip(code, models):
            reg.fit(x_fit, y_fit)
            # Flatten (n, 1) outputs so every column is a plain vector.
            columns[name] = np.ravel(reg.predict(x_pred))
        return pd.DataFrame(columns)

    # Pair rows by position; aligning on index labels would inject NaN rows
    # when x_train and y_train are indexed differently.
    merged = pd.concat(
        [x_train.reset_index(drop=True), y_train.reset_index(drop=True)],
        axis=1,
    )
    merged = merged.sample(frac=1, random_state=1).reset_index(drop=True)
    x_shuffled = merged.iloc[:, :-1]
    final_y = merged.iloc[:, -1:]

    # Chunk boundaries: N-1 equal chunks, the last one takes the remainder.
    chunk_size = n_rows // N
    bounds = [chunk_size * i for i in range(N)] + [n_rows]

    fold_frames = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        x_fit = pd.concat([x_shuffled.iloc[:lo], x_shuffled.iloc[hi:]])
        y_fit = pd.concat([final_y.iloc[:lo], final_y.iloc[hi:]])
        fold_frames.append(
            first_layer_predictions(x_fit, y_fit, x_shuffled.iloc[lo:hi, :])
        )
    # Chunks are processed in order, so these rows line up with final_y.
    final_df = pd.concat(fold_frames, ignore_index=True)

    final_test = first_layer_predictions(x_train, y_train, x_test)

    final_layer.fit(final_df, final_y)
    test_pred = final_layer.predict(final_test)

    return test_pred

In [59]:
# Run the wrapped pipeline with 5 folds and a linear second layer, then
# report the RMSE on the hold-out split.
stack_pred = stackblend_reg(
    x_train1,
    y_train1,
    x_train2,
    models=[
        LinearRegression(),
        DecisionTreeRegressor(),
        KNeighborsRegressor(),
        CatBoostRegressor(logging_level ='Silent'),
    ],
    code=['lin_reg', 'dtree_reg', 'Knn_reg', 'cat_reg'],
    N=5,
    final_layer=LinearRegression(),
)
pow(mean_squared_error(stack_pred, y_train2), 0.5)

1.3322892197689933