In [24]:
import pandas as pd
from sklearn.model_selection import  KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
data=pd.read_csv('../data/imputed.csv')
target = ['EURWH_MBOE','OilEURWH_MBBL','GasEURWH_BCF']

X = data[[column for column in data.columns if column not in target]]
y = data['EURWH_MBOE']

In [38]:
def cross_validate (model, X, y):
    cv = KFold(n_splits = 10)
    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)])

    rmse = []
    nrmse = []
    predictions = []
    
    for i, (train_index, test_index) in enumerate(cv.split(X)):
        
        print("Fold:", i + 1)
        X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        pipeline.fit(X_train, y_train)
        pred = pipeline.predict(X_test)

        rmse.append(np.sqrt(((y_test - pred) ** 2).mean()))
        nrmse.append(np.sqrt(((y_test - pred) ** 2).mean()) / y_test.mean(axis = 0))
        predictions.append(pred)

    print("Mean RMSE:", np.mean(np.array(rmse), axis = 0))
    print("Mean NRMSE:", np.mean(np.array(nrmse), axis = 0))

    return rmse, nrmse, np.hstack(predictions)

In [40]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
rmse, nrmse, predictions = cross_validate(model, X, y)

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10
Mean RMSE: 519.4715650973463
Mean NRMSE: 0.4908922307604242
(16505,)


In [71]:
from sklearn.linear_model import Ridge

model = Ridge()
rmse, nrmse = cross_validate(model, X, y)

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10
Mean RMSE: 519.467483742307
Mean NRMSE: 0.490888407599593


In [72]:
from sklearn.linear_model import Lasso

model = Lasso()
rmse, nrmse = cross_validate(model, X, y)

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10
Mean RMSE: 519.424883169093
Mean NRMSE: 0.4908477957086627


In [73]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
rmse, nrmse = cross_validate(model, X, y)

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10
Mean RMSE: 453.213332900757
Mean NRMSE: 0.4282619568776149


In [76]:
from lightgbm import LGBMRegressor

model = LGBMRegressor()
rmse, nrmse = cross_validate(model, X, y)

Fold: 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004462 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5079
[LightGBM] [Info] Number of data points in the train set: 14854, number of used features: 49
[LightGBM] [Info] Start training from score 1057.559243
Fold: 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5092
[LightGBM] [Info] Number of data points in the train set: 14854, number of used features: 49
[LightGBM] [Info] Start training from score 1057.451461
Fold: 3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5088
[LightGBM] [Info] Number of data points in the train set: 14854, number of used features: 

In [34]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors=10)
rmse, nrmse, predictions = cross_validate(model, X, y)

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10
Mean RMSE: 516.8658229850446
Mean NRMSE: 0.48841275860084876


In [36]:
from sklearn.svm import SVR

model = SVR()
rmse, nrmse, predictions = cross_validate(model, X, y)

Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10
Mean RMSE: 613.1975335514837
Mean NRMSE: 0.5794708068806337
