In [62]:
# Import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as mp
import sklearn as sk
import glob
import pickle
data_dict = {"DCOILBRENTEU_CH1":"Change from year ago dollars per barrel",  
             "DCOILBRENTEU_CHG":"Change dollars per barrel",
             "DCOILBRENTEU_PCA":"Compound annual rate of change",
             "DCOILBRENTEU_CCA":"Continuously Compounded annual rate of change",
             "DCOILBRENTEU_CCH":"Continuously Coumpounded rate of change",
             "DCOILBRENTEU"    :"Dollars per barrel",
             "DCOILBRENTEU_NBD19870520":"Index",
             "DCOILBRENTEU_PC1":"percent Change from a year ago",
             "DCOILBRENTEU_PCH":"percent change"}

In [2]:
# get data file names
def merge_data_datetime(path, reg_exp):
    
    filenames = glob.glob(path + "/"+ reg_exp+ ".csv")

    dfs = []
    for filename in filenames:
        dfs.append(pd.read_csv(filename))

    big_frame = dfs[0]
    big_frame.set_index(pd.to_datetime(big_frame['DATE']), inplace=True)

    for  i in dfs:
        i.set_index(pd.to_datetime(i['DATE']), inplace=True)
        i.drop('DATE', axis=1, inplace=True)

    del(dfs[0])
    # Concatenate all data into one DataFrame
    for i in dfs:
        big_frame = pd.concat([big_frame,i], axis=1)
    big_frame.to_csv(path + "/"  + "temp.csv")
    big_frame = pd.read_csv(path + "/"  + "temp.csv", low_memory=False, parse_dates=['DATE'])
    big_frame.rename(columns=data_dict, inplace=True)
    return big_frame

In [3]:
# Preprocessing data
def parse_datetime_oil(crude_oil):
    crude_oil['yy']=crude_oil.DATE.dt.year
    crude_oil['mm']=crude_oil.DATE.dt.month
    crude_oil['dd']=crude_oil.DATE.dt.day
    crude_oil['Dayofweek']=crude_oil.DATE.dt.dayofweek
    crude_oil['Dayofyear']=crude_oil.DATE.dt.dayofyear
    crude_oil.drop(columns='DATE', inplace=True)
    
    # change objects to categorical data
    df = crude_oil
    for val, cont in df.items():
        if pd.api.types.is_object_dtype(cont):
            df[val] = cont.astype("category").cat.as_ordered()
    
    crude_oil = df
    # Turn categorical values into numbers
    for lb, cont in crude_oil.items():
        if pd.api.types.is_categorical_dtype(cont):
            crude_oil[lb+"_is_missing"] = pd.isnull(cont)
            crude_oil[lb] = pd.Categorical(cont).codes+1
    return crude_oil

In [4]:
# Getting data ready
train_data = merge_data_datetime(path=r'data', reg_exp="*")
test_data = merge_data_datetime(path=r'test', reg_exp="*")

train_data = parse_datetime_oil(train_data)
test_data = parse_datetime_oil(test_data)

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8421 entries, 0 to 8420
Data columns (total 23 columns):
 #   Column                                                    Non-Null Count  Dtype
---  ------                                                    --------------  -----
 0   percent Change from a year ago                            8421 non-null   int16
 1   Change dollars per barrel                                 8421 non-null   int16
 2   Change from year ago dollars per barrel                   8421 non-null   int16
 3   Compound annual rate of change                            8421 non-null   int16
 4   percent change                                            8421 non-null   int16
 5   Dollars per barrel                                        8421 non-null   int16
 6   Continuously Coumpounded rate of change                   8421 non-null   int16
 7   Index                                                     8421 non-null   int16
 8   Continuously Compounded annual rate of

In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561 entries, 0 to 560
Data columns (total 23 columns):
 #   Column                                                    Non-Null Count  Dtype
---  ------                                                    --------------  -----
 0   Change from year ago dollars per barrel                   561 non-null    int16
 1   Change dollars per barrel                                 561 non-null    int16
 2   Compound annual rate of change                            561 non-null    int16
 3   percent change                                            561 non-null    int16
 4   Index                                                     561 non-null    int16
 5   Continuously Compounded annual rate of change             561 non-null    int16
 6   Continuously Coumpounded rate of change                   561 non-null    int16
 7   percent Change from a year ago                            561 non-null    int16
 8   Dollars per barrel                      

In [7]:
# Splitting data into x and y
X_train, y_train = train_data.drop(columns='Dollars per barrel'), train_data['Dollars per barrel']
X_valid, y_valid = test_data.drop(columns='Dollars per barrel'), test_data['Dollars per barrel']

In [8]:
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((8421, 22), (8421,), (561, 22), (561,))

In [9]:
# Create evaluation function (the competition uses Root Mean Square Log Error)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

def rmsle(y_test, y_preds):
    return np.sqrt(mean_squared_log_error(y_test, y_preds))

# Create function to evaluate our model
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Valid MAE": mean_absolute_error(y_valid, val_preds),
              "Training RMSLE": rmsle(y_train, train_preds),
              "Valid RMSLE": rmsle(y_valid, val_preds),
              "Training R^2": model.score(X_train, y_train),
              "Valid R^2": model.score(X_valid, y_valid)}
    return scores

In [40]:
# Training model on Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor
%time
model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train, y_train)


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs


RandomForestRegressor(n_jobs=-1)

In [41]:
model.score(X_valid, y_valid)

-71.31231413470813

In [42]:
show_scores(model)

{'Training MAE': 0.9641040256501567,
 'Valid MAE': 1289.1587165775402,
 'Training RMSLE': 0.08080279763940797,
 'Valid RMSLE': 2.4550195379599318,
 'Training R^2': 0.9995562252273448,
 'Valid R^2': -71.31231413470813}

In [34]:
%time
from sklearn.model_selection import RandomizedSearchCV

# Different RandomForestClassifier hyperparameters
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1, "sqrt", "auto"],
           "max_samples": [100, 500, 1000, 2000, 3000, 4000, 5000, 6000]}

rs_model = RandomizedSearchCV(RandomForestRegressor(),
                              param_distributions=rf_grid,
                              n_iter=2000,
                              cv=5,
                              verbose=True)

rs_model.fit(X_train, y_train)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=2000,
                   param_distributions={'max_depth': [None, 3, 5, 10],
                                        'max_features': [0.5, 1, 'sqrt',
                                                         'auto'],
                                        'max_samples': [100, 500, 1000, 2000,
                                                        3000, 4000, 5000,
                                                        6000],
                                        'min_samples_leaf': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                                        'n_estimators': array([10, 20, 30, 40, 50, 60, 70, 80, 90])},
                   verbose=True)

In [35]:
rs_model.best_params_

{'n_estimators': 20,
 'min_samples_split': 12,
 'min_samples_leaf': 1,
 'max_samples': 4000,
 'max_features': 'auto',
 'max_depth': 10}

In [36]:
show_scores(rs_model)

{'Training MAE': 3.214629320017231,
 'Valid MAE': 1289.0907048601646,
 'Training RMSLE': 0.10609049095130219,
 'Valid RMSLE': 2.4549821264987357,
 'Training R^2': 0.9979967181269015,
 'Valid R^2': -71.30560161764555}

In [61]:
save_regressor_model = RandomForestRegressor(n_estimators=20,
                                             min_samples_split= 12,
                                             min_samples_leaf= 1,
                                             max_samples= 4000,
                                             max_features= 'auto',
                                             max_depth= 10)
save_regressor_model.fit(all_train_x, all_train_y)

RandomForestRegressor(max_depth=10, max_samples=4000, min_samples_split=12,
                      n_estimators=20)

In [47]:
len(X_train.append(X_valid))

8982

In [57]:
all_train_x = X_train.append(X_valid, ignore_index=True)

In [58]:
all_train_y = y_train.append(y_valid, ignore_index=True)

In [59]:
len(all_train_x), len(all_train_y)

(8982, 8982)

In [64]:
filename = "random_forest_regressor_model.pkl"
pickle.dump(save_regressor_model, open(filename, 'wb'))

In [66]:
show_scores(save_regressor_model)

{'Training MAE': 2.7323624076169026,
 'Valid MAE': 152.17002442520482,
 'Training RMSLE': 0.09878548721816731,
 'Valid RMSLE': 1.3776955385788041,
 'Training R^2': 0.9980888529867834,
 'Valid R^2': -0.4329336217851254}