In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

In [2]:
train = pd.read_csv('../data/train.csv')
train_stack = pd.DataFrame(columns=['ridge', 'rf', 'grad', 'knn'], index=train.index)

In [3]:
kfold = KFold(n_splits=5)

In [4]:
def rmse(pred, actual):
    return np.sqrt(mean_squared_error(pred, actual))

### Training base models and the stacking model

In [5]:
poly = PolynomialFeatures(degree=3)

for train_idx, test_idx in kfold.split(train):
    
    train_x = train.iloc[train_idx, :-1]
    train_y = train.iloc[train_idx, -1]
    test_x = train.iloc[test_idx, :-1]
    test_y = train.iloc[test_idx, -1]
    
    train_x_poly = pd.DataFrame(poly.fit_transform(train_x.copy()), index=train_idx)
    test_x_poly = pd.DataFrame(poly.fit_transform(test_x.copy()), index=test_idx)
    
    # Train ridge
    ridge = Ridge(alpha=100)
    ridge.fit(train_x_poly, train_y)
    ridge_pred = ridge.predict(test_x_poly).reshape(-1,1)
    
    # Train RandomForestRegressor
    rf = RandomForestRegressor(max_depth=15, max_features=0.7, min_samples_split=2, n_estimators=250)
    rf.fit(train_x, train_y)
    rf_pred = rf.predict(test_x).reshape(-1,1)
    
    # Train GradientBoostingRegressor
    gb = GradientBoostingRegressor(n_estimators=1000, max_depth=12, max_features=0.5, min_samples_split=10, subsample=1.0)
    gb.fit(train_x, train_y)
    gb_pred = gb.predict(test_x).reshape(-1,1)
    
    # Train KNeighborsRegressor
    knn =  KNeighborsRegressor(n_neighbors=5, p=1, weights='distance')
    knn.fit(train_x, train_y)
    knn_pred = knn.predict(test_x).reshape(-1,1)
    
    train_stack.iloc[test_idx] = np.concatenate((ridge_pred, rf_pred, gb_pred, knn_pred), axis=1)

In [6]:
from sklearn.linear_model import LinearRegression
meta = LinearRegression()
meta.fit(train_stack, train.iloc[:,-1])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
train_x = train.iloc[:, :-1]
train_y = train.iloc[:, -1]

train_x_poly = pd.DataFrame(poly.fit_transform(train_x.copy()), index=train.index)

# Train ridge
ridge = Ridge(alpha=100)
ridge.fit(train_x_poly, train_y)

# Train RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(train_x, train_y)

# Train GradientBoostingRegressor
gb = GradientBoostingRegressor(n_estimators=1000)
gb.fit(train_x, train_y)

# Train KNeighborsRegressor
knn =  KNeighborsRegressor(n_neighbors=5, p=1, weights='distance')
knn.fit(train_x, train_y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=1,
                    weights='distance')

In [8]:
# Saving the base and meta models

pickle.dump(ridge, open('model/ridge.pickle', 'wb'))
pickle.dump(rf, open('model/rf.pickle', 'wb'))
pickle.dump(gb, open('model/gb.pickle', 'wb'))
pickle.dump(knn, open('model/knn.pickle', 'wb'))
pickle.dump(meta, open('model/meta.pickle', 'wb'))

In [9]:
def predict(features):
    ridge = pickle.load(open('model/ridge.pickle', 'rb'))
    rf = pickle.load(open('model/rf.pickle', 'rb'))
    gb = pickle.load(open('model/gb.pickle', 'rb'))
    knn = pickle.load(open('model/knn.pickle', 'rb'))
    meta = pickle.load(open('model/meta.pickle', 'rb'))
    
    meta_test = pd.DataFrame(columns=['ridge', 'rf', 'gb', 'knn'], index=features.index)
    
    poly = PolynomialFeatures(degree=3)
    ridge_pred = ridge.predict(poly.fit_transform(features)).reshape(-1,1)
    rf_pred = rf.predict(features).reshape(-1,1)
    gb_pred = gb.predict(features).reshape(-1,1)
    knn_pred = knn.predict(features).reshape(-1,1)
    
    meta_test = np.concatenate((ridge_pred, rf_pred, gb_pred, knn_pred), axis=1)
    
    y_pred = meta.predict(meta_test)
    
    return y_pred

In [10]:
test = pd.read_csv('../data/test.csv')
y_pred = predict(test.iloc[:,:-1])

In [11]:
rmse(y_pred, test.iloc[:,-1])

114.27170794925478

In [12]:
ridge_a = pickle.load(open('model/ridge.pickle', 'rb'))
rf_a = pickle.load(open('model/rf.pickle', 'rb'))
gb_a = pickle.load(open('model/gb.pickle', 'rb'))
knn_a = pickle.load(open('model/knn.pickle', 'rb'))

print("RMSE for Ridge: ", rmse(ridge_a.predict(PolynomialFeatures(degree=3).fit_transform(test.iloc[:,:-1])), test.iloc[:,-1]))
print("RMSE for RF: ", rmse(rf_a.predict(test.iloc[:,:-1]), test.iloc[:,-1]))
print("RMSE for GB: ", rmse(gb_a.predict(test.iloc[:,:-1]), test.iloc[:,-1]))
print("RMSE for KNN: ", rmse(knn_a.predict(test.iloc[:,:-1]), test.iloc[:,-1]))

RMSE for Ridge:  164.97732253734705
RMSE for RF:  119.84165372046535
RMSE for GB:  117.59937339843971
RMSE for KNN:  163.27561410314317
