# Modelling

The objective fo this notebook is to utilize some methods to find and determine the optimal model to solve the challenge problem. 

## Libraries

In [1]:
import numpy as np 
import cupy as cp
import pandas as pd
from cnr_methods import get_selected_features, transform_data, revert_data,metric_cnr, get_simplified_data
import tsfresh


from sklearn.model_selection import TimeSeriesSplit, train_test_split
from collections import deque
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import xgboost as xgb

## Read Data

Here, the data used correspond to the results of the Feature Engineering and Selection Step. (Add Later)

In [2]:
def get_manual_features(feature_data):

    index = feature_data.index
    features = ['T', 'CLCT', 'U_100m','V_100m','U_10m','V_10m']

    # Wind Speed Vector
    feature_data['Wind Speed 100m'] = np.sqrt(feature_data['U_100m']**2 + feature_data['V_100m']**2)
    feature_data['Wind Direction 100m'] = np.arctan(feature_data['V_100m']/feature_data['U_100m'])
    feature_data['Wind Speed 10m'] = np.sqrt(feature_data['U_10m']**2 + feature_data['V_10m']**2)
    feature_data['Wind Direction 10m'] = np.arctan(feature_data['V_10m']/feature_data['U_10m'])

    feature_data['Wind Direction 100m'] = feature_data['Wind Direction 100m'].apply(lambda x: 360 + x if x < 0 else x)
    feature_data['Wind Direction 10m'] = feature_data['Wind Direction 10m'].apply(lambda x: 360 + x if x < 0 else x)

    # Time Relative Variables 

    for column in features:
        feature_data[column + '_last_week'] = feature_data[column].shift(7) # Values for Last Week
        feature_data[column + '_last_month'] = feature_data[column].shift(30) # Values for Last Month

    feature_data['Month_Number'] = feature_data.index.month # Month Number
    feature_data['Quarter_Number'] = feature_data.index.quarter # Quarter Number

    mean_month = feature_data.groupby('Month_Number').mean()[features] # Month Mean
    median_month = feature_data.groupby('Month_Number').median()[features] # Month Median
    variance_month = feature_data.groupby('Month_Number').var()[features] # Month Variance

    mean_quarter = feature_data.groupby('Quarter_Number').mean()[features] # Quarter Mean
    median_quarter = feature_data.groupby('Quarter_Number').median()[features] # Quarter Median
    variance_quarter = feature_data.groupby('Quarter_Number').var()[features] # Quarter Variance

    mean_month.columns = mean_month.columns + '_Month_Mean'
    median_month.columns = median_month.columns + '_Month_Median'
    variance_month.columns = variance_month.columns + '_Month_Variance'
    mean_quarter.columns = mean_quarter.columns + '_Quarter_Mean'
    median_quarter.columns = median_quarter.columns + '_Quarterh_Median'
    variance_quarter.columns = variance_quarter.columns + '_Quarter_Variance'

    feature_data = feature_data.merge(mean_month,on='Month_Number',how='left')
    feature_data = feature_data.merge(median_month,on='Month_Number',how='left')
    feature_data = feature_data.merge(variance_month,on='Month_Number',how='left')
    feature_data = feature_data.merge(mean_quarter,on='Quarter_Number',how='left')
    feature_data = feature_data.merge(median_quarter,on='Quarter_Number',how='left')
    feature_data = feature_data.merge(variance_quarter,on='Quarter_Number',how='left')
    feature_data.index = index

    # Periodical Features

    day = feature_data.index.day
    hour = feature_data.index.hour
    minute = feature_data.index.minute
    dayofweek = feature_data.index.dayofweek
    dayofyear = feature_data.index.dayofyear
    days_in_month = feature_data.index.days_in_month

    feature_data["cos_day"], feature_data["sin_day"] = (
    np.cos(2 * np.pi * (day - 1) / days_in_month),
    np.sin(2 * np.pi * (day - 1) / days_in_month),
    )

    feature_data["cos_hour"], feature_data["sin_hour"] = (
        np.cos(2 * np.pi * hour / 24),
        np.sin(2 * np.pi * hour / 24),
        )

    feature_data["cos_minute"], feature_data["sin_minute"] = (
        np.cos(2 * np.pi * minute / 60),
        np.sin(2 * np.pi * minute / 60),
    )

    feature_data["cos_dayofyear"], feature_data["sin_dayofyear"] = (
        np.cos(2 * np.pi * (dayofyear - 1) / 365),
        np.sin(2 * np.pi * (dayofyear - 1) / 365),
    )

    feature_data["cos_dayofweek"], feature_data["sin_dayofweek"] = (
        np.cos(2 * np.pi * dayofweek / 7),
        np.sin(2 * np.pi * dayofweek / 7),
    )

    # Distance from Max and Min

    for column in features:
        feature_data[column + '_Distance_Max'] = feature_data.index - feature_data[column].idxmax()
        feature_data[column + '_Distance_Min'] = feature_data.index - feature_data[column].idxmin()
        feature_data[column + '_Distance_Max'] = feature_data[column + '_Distance_Max'].apply(lambda x : x.days)
        feature_data[column + '_Distance_Min'] = feature_data[column + '_Distance_Min'].apply(lambda x : x.days)

    # Dropping Base Features 
    #features.append(['Month_Number','Quarter Number'])
    feature_data = feature_data.drop(features,axis=1)

    return feature_data

In [3]:
X,y_train = get_simplified_data()

X_train = X[X['Set']=='Train']
X_test = X[X['Set']=='Test']

In [4]:
X_train = get_manual_features(X_train)
X_test = get_manual_features(X_test)

In [5]:
X_train['Set'] = 'Train'
X_test['Set'] = 'Test'

full_data = pd.concat([X_train,X_test],axis=0)

full_data = full_data.loc[:,~full_data.columns.duplicated()]

In [6]:
'''' 
full_data = get_selected_features(100)

full_data = full_data.rename({'Unnamed: 0' : 'Time'},axis=1)
full_data = full_data.set_index('Time')
'''

full_label = pd.read_csv('Data/Y_train.csv')
X = full_data[full_data['Set']=='Train']

WF = 'WF1'
X = X[X['WF']==WF]
y = full_label[full_label['ID'].isin(X['ID'])]

In [7]:
X = transform_data(X.drop(['ID','WF','Set'],axis=1))
y = transform_data(y)['Production']

## Validation Scheme

Before proceeding to the Hyperparameter search, it is necessary first to have some way to reliably measure the performance of the model. For this purpose, it will be used a Time Split Cross Validation Method, were the "Test" Fold for each Iteration is going to be used as the Validation Data, and so, to make Early Stopping on the data.

In [8]:
k_fold_splits = 8
num_boost_round = 500
early_stopping_rounds = 10

In [9]:
def gpu_df(df,y):
    gpu_matrix = cp.asarray(df)
    gpu_matrix = xgb.DMatrix(gpu_matrix,label=y)
    return gpu_matrix

In [10]:
def objective(param,k_fold_splits=k_fold_splits,num_boost_round=num_boost_round,early_stopping_rounds=early_stopping_rounds):
    # Define Time Split Cross Validation
    tscv = TimeSeriesSplit(n_splits=k_fold_splits)

    # Separating Data from Hold Out Set

    X_cv, _, y_cv, _ = train_test_split(X, y, test_size=0.125, shuffle=False)

    # Set XGBoost for GPU
    param['tree_method'] = 'gpu_hist'

    train_scores = np.empty(0)
    val_scores = np.empty(0)
    test_scores = np.empty(0)
    for train_index, test_index in tscv.split(X_cv):
        # Get the Data of the Split
        X_train, X_test = X_cv.iloc[train_index], X_cv.iloc[test_index]
        y_train, y_test = y_cv.iloc[train_index], y_cv.iloc[test_index]

        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.143, shuffle=False)

        dtrain = gpu_df(X_train,y_train)
        dval = gpu_df(X_val,y_val)
        dtest = gpu_df(X_test,y_test)

        # Train the Model
        progress = dict()
        watchlist = [(dtrain,'train'),(dval,'eval')]
        bst = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist, feval=metric_cnr,early_stopping_rounds=early_stopping_rounds,verbose_eval=False,evals_result=progress)

        train_score = np.array(progress['train']['CAPE']).mean()
        val_score = np.array(progress['eval']['CAPE']).mean()
        preds = bst.predict(dtest,ntree_limit=bst.best_ntree_limit)
        test_score = metric_cnr(preds,dtest)

        train_scores = np.append(train_scores,train_score)
        val_scores = np.append(val_scores,val_score)
        test_scores = np.append(test_scores,test_score[1])

    return {'loss' : test_scores.mean(), 'params' : param, 'status' : STATUS_OK, 'train_loss' : train_scores.mean(), 'val_loss' : val_scores.mean()}

## Hyperparameter Tuning

For the Hyperparameter Tuning, the HyperOpt Library will be used, which implements some techniques for a more efficient search for parameters.

### Domain Space

In [11]:
space = {
    'max_depth' : 1 + hp.randint('max_depth', 15),
    'subsample' : hp.uniform('subsample', 0, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0, 1),
    'colsample_bylevel' : hp.uniform('colsample_bylevel', 0, 1),
    'min_child_weight' : hp.uniform('min_child_weight', 0, 10),
    'lambda' : hp.uniform('lambda', 0, 1),
    'alpha' : hp.uniform('alpha', 0, 1),
    'eta' : hp.uniform('eta', 0, 1)
}

### Optimization Algorithm

In [12]:
tpe_algorithm = tpe.suggest
bayes_trials = Trials()

### Bayesian Optimization

In [13]:
MAX_EVALS = 300

In [14]:
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = bayes_trials, rstate = np.random.RandomState(50))

0%|          | 0/300 [00:00<?, ?trial/s, best loss=?]job exception: object of type 'numpy.float64' has no len()

  0%|          | 0/300 [00:00<?, ?trial/s, best loss=?]


TypeError: object of type 'numpy.float64' has no len()

### Hold Out Score

In [15]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.125, shuffle=False)
dhold = gpu_df(X_holdout,y_holdout)

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.143, shuffle=False)
dtrain = gpu_df(X_train,y_train)
dval = gpu_df(X_val,y_val)

In [17]:
progress = dict()
watchlist = [(dtrain,'train'),(dval,'eval')]
bst = xgb.train(best, dtrain, num_boost_round=num_boost_round, evals=watchlist, feval=metric_cnr,early_stopping_rounds=early_stopping_rounds,verbose_eval=False,evals_result=progress)

NameError: name 'best' is not defined

In [18]:
preds = bst.predict(dhold,ntree_limit=bst.best_ntree_limit)
score = metric_cnr(preds,dhold)

NameError: name 'bst' is not defined

In [19]:
print(score)

NameError: name 'score' is not defined

## Generating Predictions

In [42]:
best

{'alpha': 0.15584429956186577,
 'colsample_bylevel': 0.001892811744339036,
 'colsample_bytree': 0.6482004005331071,
 'eta': 0.9100506133647966,
 'lambda': 0.8893423815751322,
 'max_depth': 14,
 'min_child_weight': 2.036724529281875,
 'subsample': 0.23264657550379841,
 'validate_parameters': 1}

In [43]:
preds = []
for WF in full_data['WF'].unique():
    X_WF = full_data[full_data['WF']==WF]
    X_train = X_WF[X_WF['Set']=='Train']
    y_train = full_label[full_label['ID'].isin(X_train['ID'])]
    X_test = X_WF[X_WF['Set']=='Test']

    #Transform Data
    X_train = transform_data(X_train.drop(['ID','WF','Set'],axis=1))
    X_test = transform_data(X_test.drop(['ID','WF','Set'],axis=1))
    y_train = transform_data(y_train)['Production']
    dtrain = gpu_df(X_train,y_train)
    dtest = gpu_df(X_test,None)

    bst = xgb.train(best,dtrain)
    pred = bst.predict(dtest)

    preds = np.append(preds,pred)

In [44]:
preds = revert_data(preds)

### Generate Submissions

In [45]:
preds_id = pd.read_csv(r'C:\Users\andre_\OneDrive\Documentos\GitHub\cnr\Data\random_submission_example.csv')['ID']

In [46]:
submission = pd.DataFrame()
submission['ID'] = preds_id
submission['Production'] = preds
submission = submission.set_index('ID')

In [47]:
submission.to_csv(r'Data\Submission.csv')