In [1]:
from copy import deepcopy
import numpy as np
import pandas as pd
import itertools
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
from collections import Counter
from time import localtime, strftime
import math
from helper import ( prepare_data, train_test_shuffle_split, train_test_seq_split, print_folds_stats )

from xgboost import XGBClassifier

from matplotlib import pyplot as plt
plt.style.use('ggplot')     # 'fivethirtyeight'
%matplotlib inline
%load_ext autoreload
%autoreload 2

  from pandas import MultiIndex, Int64Index


In [2]:
def plot_confusion_matrix( cm, classes, title='Confusion matrix', figsize=(5,5),
                           cmap=plt.cm.PuBu ):   # originally plt.cm.Blues; also good: BuPu,RdPu,PuRd,OrRd,Oranges
    """
    Plot a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the y axis ('True labels') and
        columns along the x axis ('Predicted labels'). Integer matrices are
        annotated as integers, any other dtype with two decimals.
    classes : sequence
        Tick labels, used for both axes.
    title : str
        Figure title.
    figsize : tuple
        Figure size passed to `plt.figure`.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None. The figure is shown with `plt.show()`.
    """
    cm = np.asarray(cm)

    # 'd' only works for integer values; fall back to two decimals otherwise
    # (e.g. for a normalised matrix).
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2. if cm.size else 0.

    # Move the x ticks and their labels to the top edge for this figure only.
    # rc_context restores the previous settings on exit, so later plots in the
    # notebook keep their normal tick layout.
    tick_layout = {
        'xtick.bottom': False, 'xtick.labelbottom': False,
        'xtick.top': True,     'xtick.labeltop': True,
    }
    with plt.rc_context(tick_layout):
        plt.figure(figsize=figsize)
        im = plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar(im, fraction=0.046, pad=0.05)
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=90)
        plt.yticks(tick_marks, classes)

        # Write every cell's value at its centre.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True labels')
        plt.xlabel('Predicted labels')
        plt.tight_layout()
        plt.show()

# Part 1. Linear Implementation

In [3]:
# Registry of available events: each entry names its data file and the
# columns that are used as features.
events = {
    'texas2step': {
        'file': 'data/2step_20220807.csv',
        'features': ['Num1', 'Num2', 'Num3', 'Num4'],
    },
}

# Key of the event used by the cells that follow.
event = 'texas2step'

In [4]:
# Look up the data file and the feature columns registered for the selected event.
file, features = events[ event ]['file'], events[ event ]['features']

# Read the CSV and print its dimensions as a quick sanity check.
df = pd.read_csv( file, encoding='utf-8' )
print('Shape of data:', df.shape, '\n')

Shape of data: (2215, 9) 



In [5]:
# Settings handed to helper.prepare_data (their exact meaning is defined there).
n_steps, random_state = 4, 34
overlap = flatten = True
y_pos = 0

X, y = prepare_data( df, features, n_steps,
                     overlap=overlap, flatten=flatten, y_pos=y_pos )
print('X and y shape:', X.shape, y.shape)

# Relabel the largest target value as 0 so the labels fit the range used with
# XGBoost (see the printed message).
# NOTE(review): this edits y in place; if prepare_data returns a view into df,
# re-running the cell would relabel a second value - confirm against the helper.
max_value = y.max()
print(f'Replacing y={max_value} with 0 for use with XGBoost')
y[y == max_value] = 0
max_value = y.max()
print(f'Max value after replacement: {max_value}')

X and y shape: (2211, 16) (2211,)
Replacing y=35 with 0 for use with XGBoost
Max value after replacement: 34


In [7]:
# Count how often each target label occurs in y; the cell output lists
# (label, count) pairs from the most to the least frequent label.
c = Counter(y)
c.most_common()

[(25, 85),
 (30, 79),
 (22, 73),
 (4, 72),
 (21, 72),
 (8, 72),
 (12, 70),
 (6, 70),
 (9, 69),
 (7, 68),
 (34, 68),
 (15, 67),
 (17, 67),
 (10, 66),
 (26, 65),
 (1, 64),
 (31, 64),
 (0, 62),
 (32, 61),
 (11, 61),
 (19, 61),
 (20, 61),
 (3, 60),
 (13, 60),
 (33, 59),
 (2, 59),
 (18, 58),
 (5, 58),
 (23, 57),
 (29, 57),
 (16, 56),
 (27, 56),
 (24, 47),
 (28, 46),
 (14, 44)]

In [6]:
test_size=0.2

# Shuffled split, stratified by label (stratification is only available with
# shuffle=True). It works on copies so X and y stay available for the
# sequential split below.
X_sh, y_sh = deepcopy(X), deepcopy(y)
X_train_sh, X_test_sh, y_train_sh, y_test_sh = train_test_split( X_sh, y_sh,
                                                                 test_size=test_size,
                                                                 random_state=random_state,
                                                                 shuffle=True,
                                                                 # stratify on the array that is actually being split
                                                                 stratify=y_sh )

# not stratified - time series
X_train, X_test, y_train, y_test = train_test_seq_split(X, y, test_size=test_size)

In [9]:
# Keyword arguments for XGBClassifier, shared by the Part 1 experiments.
# NOTE(review): both 'seed' and 'random_state' are set, to different values;
# which one XGBoost honours depends on the installed version - confirm.
clf_params_xgb = {
    'n_estimators':      500,
    'max_depth':         None,
    'learning_rate':     0.01,             # eta
    'objective':         'multi:softmax',  # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric':       'mlogloss',       # multiclass: merror, mlogloss
    'base_score':        0.5,
    'booster':           'gbtree',         # gbtree, dart
    'tree_method':       'auto',           # auto, exact, approx, hist, gpu_hist
    'importance_type':   'gain',           # gain (default), weight, cover, total_gain, total_cover
    'gamma':             0,                # [0, inf], default 0; larger = more conservative
    'reg_alpha':         0,                # L1 regularisation, default 0; larger = more conservative
    'reg_lambda':        1,                # L2 regularisation, default 1; larger = more conservative
    'sampling_method':   'uniform',        # uniform, gradient_based
    'max_delta_step':    1,                # 1-10
    'min_child_weight':  1,
    'subsample':         1,                # 0-1 (lower values prevent overfitting)
    'colsample_bylevel': 1,                # 0-1
    'colsample_bynode':  1,                # optimized for higher recall
    'colsample_bytree':  1,                # 0-1
    'seed':              5,
    'num_class':         35,
    'use_label_encoder': False,
    'random_state':      random_state,
    'n_jobs':            -1,
}

## Time Series Split (No Shuffling)

In [29]:
# FIT & TEST
# Train on the sequential (non-shuffled) split and predict the held-out part.
clf = XGBClassifier( **clf_params_xgb )
clf.fit( X_train, y_train )
y_pred = clf.predict( X_test )

# PRINT RESULTS
# Timestamp and experiment settings first, so the output is self-describing.
print( strftime("%Y-%m-%d %H:%M:%S", localtime()) )
print(f'Event: {event}\nTrain-test split: time series\nSteps: {n_steps}  |  Y_pos: {y_pos}  |  Random state: {random_state}')
print(f'Features: {features}\nOverlap: {overlap}  |  Flatten: {flatten}\n')

# zero_division=0 yields the same scores as the default (0.0 for classes that
# are never predicted) but without the UndefinedMetricWarning.
clf_report = classification_report( y_test, y_pred, output_dict=True, zero_division=0 )
print('Classification report')
print('\tAccuracy:         ', clf_report['accuracy'])
print('\tMacro F1 score:   ', clf_report['macro avg']['f1-score'])
print('\tWeighted F1 score:', clf_report['weighted avg']['f1-score'], '\n')

# NOTE(review): these regression metrics treat class ids as numbers. Because an
# earlier cell relabels the largest target value as 0, errors involving that
# class do not reflect the original numeric distance.
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f'RMSE = {round(rmse, 4)}')
print(f'MAE  = {round(mae, 4)}')
print(f'R2   = {round(r2, 4)}')

print_cm = False
if print_cm:
    labels = list(range(0, 35))
    # Pass the labels explicitly: the matrix is then always 35x35 and lines up
    # with the tick labels even when a class is missing from y_test and y_pred.
    cm = confusion_matrix( y_test, y_pred, labels=labels )
    plot_confusion_matrix( cm, labels, figsize=(10,10), )

2022-08-14 19:05:21
Event: texas2step
Train-test split: time series
Steps: 1  |  Y_pos: 0  |  Random state: 34
Features: ['Num1', 'Num2', 'Num3', 'Num4']
Overlap: True  |  Flatten: True

Classification report
	Accuracy:          0.020361990950226245
	Macro F1 score:    0.021863045801362356
	Weighted F1 score: 0.02018190000022613 

RMSE = 14.3708
MAE  = 11.7783
R2   = -1.0


## Stratified Split (With Shuffling)

In [30]:
# FIT & TEST
# Train on the shuffled, stratified split and predict its held-out part.
clf_sh = XGBClassifier( **clf_params_xgb )
clf_sh.fit( X_train_sh, y_train_sh )
y_pred_sh = clf_sh.predict( X_test_sh )

# PRINT RESULTS
# Timestamp and experiment settings first, so the output is self-describing.
print( strftime("%Y-%m-%d %H:%M:%S", localtime()) )
print(f'Event: {event}\nTrain-test split: stratified w/shuffling\nSteps: {n_steps}  |  Y_pos: {y_pos}  |  Random state: {random_state}')
print(f'Features: {features}\nOverlap: {overlap}  |  Flatten: {flatten}\n')

# zero_division=0 yields the same scores as the default (0.0 for classes that
# are never predicted) but without the UndefinedMetricWarning.
clf_report_sh = classification_report( y_test_sh, y_pred_sh, output_dict=True, zero_division=0 )
print('Classification report')
print('\tAccuracy:         ', clf_report_sh['accuracy'])
print('\tMacro F1 score:   ', clf_report_sh['macro avg']['f1-score'])
print('\tWeighted F1 score:', clf_report_sh['weighted avg']['f1-score'], '\n')

# NOTE(review): these regression metrics treat class ids as numbers. Because an
# earlier cell relabels the largest target value as 0, errors involving that
# class do not reflect the original numeric distance.
rmse_sh = math.sqrt(mean_squared_error(y_test_sh, y_pred_sh))
mae_sh  = mean_absolute_error(y_test_sh, y_pred_sh)
r2_sh   = r2_score(y_test_sh, y_pred_sh)

print(f'RMSE = {round(rmse_sh, 4)}')
print(f'MAE  = {round(mae_sh, 4)}')
print(f'R2   = {round(r2_sh, 4)}')

print_cm = False
if print_cm:
    labels = list(range(0, 35))
    # Pass the labels explicitly: the matrix is then always 35x35 and lines up
    # with the tick labels even when a class is missing from both arrays.
    cm_sh = confusion_matrix( y_test_sh, y_pred_sh, labels=labels )
    plot_confusion_matrix( cm_sh, labels, figsize=(10,10), )

2022-08-14 19:06:53
Event: texas2step
Train-test split: stratified w/shuffling
Steps: 1  |  Y_pos: 0  |  Random state: 34
Features: ['Num1', 'Num2', 'Num3', 'Num4']
Overlap: True  |  Flatten: True

Classification report
	Accuracy:          0.029345372460496615
	Macro F1 score:    0.02819593180065346
	Weighted F1 score: 0.02842889084451297 

RMSE = 13.5411
MAE  = 10.8916
R2   = -0.7852


In [12]:
# NOTE(review): this name is not defined in any visible cell, so the cell always
# raises NameError (see its output). Presumably a deliberate stop that keeps
# "Run All" from continuing into Part 2 - confirm, and if so state it explicitly.
unknown_variable

NameError: name 'unknown_variable' is not defined

# Part 2. Parameter Search Loop

In [11]:
random_state = 34
overlap      = True
flatten      = True
y_pos        = 1
test_size    = 0.2
print_cm     = False        # set to True to also plot every confusion matrix

# CLASSIFIER
# None of these settings depends on n_steps, so the dict is built once here
# instead of on every loop iteration.
# NOTE(review): both 'seed' and 'random_state' are set, to different values;
# which one XGBoost honours depends on the installed version - confirm.
clf_params_xgb = {
    'n_estimators': 500,
    'max_depth': None,
    'learning_rate': 0.01,                            # eta
    'objective': 'multi:softmax',                     # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'mlogloss',                        # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                              # gbtree, dart
    'tree_method': 'auto',                            # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                        # default "gain", "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                       # larger - more conservative, [0, inf], default 0
    'reg_alpha': 0,                                   # L1 reg., larger - more conservative, default 0
    'reg_lambda': 1,                                  # L2 reg., larger - more conservative, default 1
    'sampling_method': 'uniform',                     # uniform, gradient_based
    'max_delta_step': 1,                              # 1-10
    'min_child_weight': 1,
    'subsample': 1,                                   # 0-1  (lower values prevent overfitting)
    'colsample_bylevel': 1,                           # 0-1
    'colsample_bynode': 1,                            # optimized for higher recall
    'colsample_bytree': 1,                            # 0-1
    'seed': 5,
    'num_class': 35,
    'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,
}

# Class ids 0..num_class-1, used for the confusion matrix and its tick labels.
labels = list(range(clf_params_xgb['num_class']))


def _fit_and_report( X_tr, y_tr, X_te, y_te, split_name, n_steps ):
    """
    Fit a fresh XGBClassifier on one train/test split and print its scores.

    Besides its arguments, the function reads the notebook-level settings
    `clf_params_xgb`, `event`, `features`, `y_pos`, `random_state`, `overlap`
    and `flatten` for the model and the printed header.

    Parameters
    ----------
    X_tr, y_tr : training features and labels
    X_te, y_te : test features and labels
    split_name : str
        Text printed after 'Train-test split: '.
    n_steps : int
        Value printed after 'Steps: '.

    Returns
    -------
    (model, predictions, report dict, rmse, mae, r2)
    """
    model = XGBClassifier( **clf_params_xgb )
    model.fit( X_tr, y_tr )
    preds = model.predict( X_te )

    print( strftime("%Y-%m-%d %H:%M:%S", localtime()) )
    print(f'Event: {event}\nTrain-test split: {split_name}\nSteps: {n_steps}  |  Y_pos: {y_pos}  |  Random state: {random_state}')
    print(f'Features: {features}\nOverlap: {overlap}  |  Flatten: {flatten}\n')

    # zero_division=0 yields the same scores as the default (0.0 for classes
    # that are never predicted) but without the UndefinedMetricWarning.
    report = classification_report( y_te, preds, output_dict=True, zero_division=0 )
    print('Classification report')
    print('\tAccuracy:         ', report['accuracy'])
    print('\tMacro F1 score:   ', report['macro avg']['f1-score'])
    print('\tWeighted F1 score:', report['weighted avg']['f1-score'], '\n')

    # NOTE(review): regression metrics on class ids; the largest target value is
    # relabelled as 0 in the loop below, which distorts distances for that class.
    rmse = math.sqrt(mean_squared_error(y_te, preds))
    mae  = mean_absolute_error(y_te, preds)
    r2   = r2_score(y_te, preds)

    print(f'RMSE = {round(rmse, 4)}')
    print(f'MAE  = {round(mae, 4)}')
    print(f'R2   = {round(r2, 4)}')

    return model, preds, report, rmse, mae, r2


for n_steps in range(1, 11):

    # PREPARE DATA
    ( X,
      y, )  = prepare_data(  df,
                             features,
                             n_steps,
                             overlap=overlap,
                             flatten=flatten,
                             y_pos=y_pos, )

    # Relabel the largest target value as 0 for use with XGBoost.
    max_value = max(y)
    y[y==max_value] = 0
    max_value = max(y)


    # TRAIN-TEST SPLIT
    # stratified implemented only if shuffle=True
    X_sh, y_sh = deepcopy(X), deepcopy(y)
    X_train_sh, X_test_sh, y_train_sh, y_test_sh = train_test_split( X_sh, y_sh,
                                                                     test_size=test_size,
                                                                     random_state=random_state,
                                                                     shuffle=True,
                                                                     # stratify on the array that is actually being split
                                                                     stratify=y_sh )

    # not stratified - time series
    X_train, X_test, y_train, y_test = train_test_seq_split(X, y, test_size=test_size)


    # TRAIN ON TIME SERIES SPLIT (NO SHUFFLING)
    ( clf, y_pred, clf_report,
      rmse, mae, r2 ) = _fit_and_report( X_train, y_train, X_test, y_test,
                                         'time series', n_steps )
    print('\n', '+'*100, '\n', sep='')

    if print_cm:
        # labels= keeps the matrix aligned with the tick labels even when a
        # class is missing from both y_test and y_pred.
        cm = confusion_matrix( y_test, y_pred, labels=labels )
        plot_confusion_matrix( cm, labels, figsize=(10,10), )


    # TRAIN ON STRATIFIED SPLIT (WITH SHUFFLING)
    ( clf_sh, y_pred_sh, clf_report_sh,
      rmse_sh, mae_sh, r2_sh ) = _fit_and_report( X_train_sh, y_train_sh, X_test_sh, y_test_sh,
                                                  'stratified w/shuffling', n_steps )
    print('\n\n', '='*100, '\n\n', sep='')

    if print_cm:
        cm_sh = confusion_matrix( y_test_sh, y_pred_sh, labels=labels )
        plot_confusion_matrix( cm_sh, labels, figsize=(10,10), )

2022-08-14 22:32:00
Event: texas2step
Train-test split: time series
Steps: 1  |  Y_pos: 1  |  Random state: 34
Features: ['Num1', 'Num2', 'Num3', 'Num4']
Overlap: True  |  Flatten: True

Classification report
	Accuracy:          0.02262443438914027
	Macro F1 score:    0.02192872838470208
	Weighted F1 score: 0.0203522117022713 

RMSE = 14.5276
MAE  = 11.8507
R2   = -0.9691

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2022-08-14 22:32:10
Event: texas2step
Train-test split: stratified w/shuffling
Steps: 1  |  Y_pos: 1  |  Random state: 34
Features: ['Num1', 'Num2', 'Num3', 'Num4']
Overlap: True  |  Flatten: True

Classification report
	Accuracy:          0.03611738148984198
	Macro F1 score:    0.033973429665856564
	Weighted F1 score: 0.03416078762942411 

RMSE = 13.8501
MAE  = 11.377
R2   = -0.8289




2022-08-14 22:32:22
Event: texas2step
Train-test split: time series
Steps: 2  |  Y_pos: 1  |  Random state: 34
Features: ['Num1', '

  _warn_prf(average, modifier, msg_start, len(result))


2022-08-14 22:35:44
Event: texas2step
Train-test split: stratified w/shuffling
Steps: 8  |  Y_pos: 1  |  Random state: 34
Features: ['Num1', 'Num2', 'Num3', 'Num4']
Overlap: True  |  Flatten: True

Classification report
	Accuracy:          0.029411764705882353
	Macro F1 score:    0.026520626759940286
	Weighted F1 score: 0.026897814613904388 

RMSE = 14.7378
MAE  = 12.0724
R2   = -1.0788




2022-08-14 22:36:04
Event: texas2step
Train-test split: time series
Steps: 9  |  Y_pos: 1  |  Random state: 34
Features: ['Num1', 'Num2', 'Num3', 'Num4']
Overlap: True  |  Flatten: True

Classification report
	Accuracy:          0.04081632653061224
	Macro F1 score:    0.034923825231386645
	Weighted F1 score: 0.038536984718474075 

RMSE = 14.6592
MAE  = 11.78
R2   = -1.0012

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2022-08-14 22:36:24
Event: texas2step
Train-test split: stratified w/shuffling
Steps: 9  |  Y_pos: 1  |  Random state: 34
Featu

__OVERALL CONCLUSIONS__   
* For all of the experiments, `RMSE~14, MAE~11, R2~-1`  
* The best accuracy is around 0.045 - 0.047; for reference, uniform random guessing over 35 classes would score about 1/35 ≈ 0.029  
* Shuffled vs. non-shuffled data - no clear winner  
* Small vs. large `n_steps` - no clear winner: small values are not considerably worse than large ones, and are sometimes better  

-----


__IN MORE DETAIL__  
__y_pos=0__:
* Better results w/shuffled data
* Best results when n_steps=4 (0.047); also good results for n_steps=1 or 8 (which is suspicious)

__y_pos=1__:
* Better results with non-shuffled time series (especially for larger n_steps)
* Best results when n_steps=7 (0.045); also good for n_steps=8 or 9

__y_pos=2__:
* Better results seem to be w/non-shuffled time series data, but the best result is w/shuffled data
* Best results when n_steps=2 (0.047)

__y_pos=3__:
* No consistent winner: sometimes shuffled, sometimes non-shuffled data gives better results
* Best results when n_steps=6 (0.045) w/non-shuffled time series data and when n_steps=9 (0.045) w/shuffled data!

# Part 3. Other Classifiers
Best out-of-the-box (OOTB) results - RandomForest?

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
# Empty parameter dict; presumably a placeholder for the classifiers tried in
# this part - it is not referenced by the visible cells.
clf_params = dict()

In [13]:
# FIT & TEST
# Train a default SVC on the shuffled, stratified split and predict its
# held-out part.
# NOTE(review): X_train_sh / y_train_sh, n_steps and y_pos come from whichever
# earlier cell ran last (Part 1 or the Part 2 loop) - confirm before comparing.
clf_sh = SVC()
clf_sh.fit( X_train_sh, y_train_sh )
y_pred_sh = clf_sh.predict( X_test_sh )

# PRINT RESULTS
# Timestamp and experiment settings first, so the output is self-describing.
print( strftime("%Y-%m-%d %H:%M:%S", localtime()) )
print(f'Event: {event}\nTrain-test split: stratified w/shuffling\nSteps: {n_steps}  |  Y_pos: {y_pos}  |  Random state: {random_state}')
print(f'Features: {features}\nOverlap: {overlap}  |  Flatten: {flatten}\n')

# zero_division=0 yields the same scores as the default (0.0 for classes that
# are never predicted) but without the UndefinedMetricWarning.
clf_report_sh = classification_report( y_test_sh, y_pred_sh, output_dict=True, zero_division=0 )
print('Classification report')
print('\tAccuracy:         ', clf_report_sh['accuracy'])
print('\tMacro F1 score:   ', clf_report_sh['macro avg']['f1-score'])
print('\tWeighted F1 score:', clf_report_sh['weighted avg']['f1-score'], '\n')

# NOTE(review): these regression metrics treat class ids as numbers. Because an
# earlier cell relabels the largest target value as 0, errors involving that
# class do not reflect the original numeric distance.
rmse_sh = math.sqrt(mean_squared_error(y_test_sh, y_pred_sh))
mae_sh  = mean_absolute_error(y_test_sh, y_pred_sh)
r2_sh   = r2_score(y_test_sh, y_pred_sh)

print(f'RMSE = {round(rmse_sh, 4)}')
print(f'MAE  = {round(mae_sh, 4)}')
print(f'R2   = {round(r2_sh, 4)}')

print_cm = False
if print_cm:
    labels = list(range(0, 35))
    # Pass the labels explicitly: the matrix is then always 35x35 and lines up
    # with the tick labels even when a class is missing from both arrays.
    cm_sh = confusion_matrix( y_test_sh, y_pred_sh, labels=labels )
    plot_confusion_matrix( cm_sh, labels, figsize=(10,10), )

2022-08-14 23:41:17
Event: texas2step
Train-test split: stratified w/shuffling
Steps: 4  |  Y_pos: 0  |  Random state: 34
Features: ['Num1', 'Num2', 'Num3', 'Num4']
Overlap: True  |  Flatten: True

Classification report
	Accuracy:          0.033860045146726865
	Macro F1 score:    0.02381843839085463
	Weighted F1 score: 0.026012751185623323 

RMSE = 14.3492
MAE  = 11.5847
R2   = -1.0046


  _warn_prf(average, modifier, msg_start, len(result))


5

## Appendix

```python
n_steps = 4

clf_params_xgb = {
    'n_estimators': 500,
    'max_depth': None,
    'learning_rate': 0.01,                                # eta
    'objective': 'multi:softmax',                        # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'mlogloss',                           # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                 # gbtree, dart
    'tree_method': 'auto',                             # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                           # default“gain”,“weight”,“cover”,“total_gain”,“total_cover”
    'gamma': 0,                                       # larger - more conservative, [0, inf], default 0
    'reg_alpha': 0,                                    # L1 reg., larger - more conservative, default 0
    'reg_lambda': 1,                                     # L2 rreg., larger - more conservative, default 1
    'sampling_method': 'uniform',                        # uniform, gradient_based
    'max_delta_step': 1,                                 # 1-10
    'min_child_weight': 1,
    'subsample': 1,                                   # 0-1  (lower values prevent overfitting)    
    'colsample_bylevel': 1,                            # 0-1
    'colsample_bynode': 1,                            # optimized for higher recall
    'colsample_bytree': 1,                            # 0-1  
    'seed': 5,
    'num_class': 35,
    'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,    
}

X_train, X_test, y_train, y_test = train_test_split( X_sh, y_sh,
                                                     test_size=0.2,
                                                     random_state=random_state,
                                                     shuffle=True,
                                                     stratify=y, )

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.00      0.00      0.00        13
           2       0.05      0.08      0.06        12
           3       0.00      0.00      0.00        12
           4       0.06      0.07      0.07        14
           5       0.00      0.00      0.00        12
           6       0.00      0.00      0.00        14
           7       0.09      0.14      0.11        14
           8       0.06      0.07      0.06        15
           9       0.00      0.00      0.00        14
          10       0.00      0.00      0.00        13
          11       0.00      0.00      0.00        12
          12       0.00      0.00      0.00        14
          13       0.00      0.00      0.00        12
          14       0.00      0.00      0.00         9
          15       0.00      0.00      0.00        13
          16       0.11      0.09      0.10        11
          17       0.00      0.00      0.00        13
          18       0.11      0.08      0.10        12
          19       0.12      0.08      0.10        12
          20       0.00      0.00      0.00        12
          21       0.04      0.07      0.05        15
          22       0.05      0.07      0.06        15
          23       0.00      0.00      0.00        11
          24       0.00      0.00      0.00         9
          25       0.07      0.12      0.09        17
          26       0.18      0.15      0.17        13
          27       0.00      0.00      0.00        11
          28       0.00      0.00      0.00         9
          29       0.00      0.00      0.00        11
          30       0.00      0.00      0.00        16
          31       0.00      0.00      0.00        13
          32       0.00      0.00      0.00        12
          33       0.00      0.00      0.00        12
          34       0.07      0.07      0.07        14

    accuracy                           0.03       443
   macro avg       0.03      0.03      0.03       443
weighted avg       0.03      0.03      0.03       443
```