# Predicting Unknown Data With Optimized Model Parameters

In this notebook, the following happens: 
- Completely unknown, prepared test data is loaded
- An optimized model is loaded
- y values are predicted
- y values are saved

## Parameters

In [15]:
import os

# Directories of the individual pipeline stages, relative to the working directory
filetraindataresultdirectory = '02_Training_Data'
filetestdataresultdirectory = '03_Test_Prepared_Data'
modelresultdirectory = '04_Model'
finalresultdirectory = '05_Results'

# File name building blocks: prefixes identify the data set, suffixes the variant
filenametrainprefix = 'omxs30'
filenametestprefix = 'omxs30_test'
filenameXsuffix = ''
filenameysuffix = '_prediction_LongTrend'

# Input: prepared features of the unknown (test) data
X_filename_test = filetestdataresultdirectory + "/" + filenametestprefix + "_X" + filenameXsuffix + ".csv"
# Output: file name intended for the predicted y values
y_pred_filename = finalresultdirectory + "/" + filenametestprefix + "_y" + filenameysuffix + ".csv"
# Inputs created together with the training data: class labels and selected feature columns
ylabel_filename = filetraindataresultdirectory + "/" + filenametrainprefix + "_y" + "_labels.csv"
X_filename_selected_feature_column = filetraindataresultdirectory + "/" + filenametrainprefix + "_X" + filenameXsuffix + "_selected_feature_columns.csv"
# Input: original time series of the test period, used for plotting
filename_timegraph = filetestdataresultdirectory + "/" + filenametestprefix + "_timegraph" + ".csv"


# The model path is the only absolute one: it is anchored at the current working directory
model_filename =  os.getcwd() + "/" + modelresultdirectory + "/" + filenametrainprefix + "_final_model" + filenameXsuffix + ".sav"

# The labels now name what is actually shown: the *test* features and the file for the *predicted* y
print("X test: {}\ny file name to be predicted: {}\ny label: {}\nfeature columns: {}\n".format(X_filename_test, y_pred_filename, ylabel_filename, X_filename_selected_feature_column))

X train:03_Test_Prepared_Data/omxs30_test_X.csv
y file name to be protected: 05_Results/omxs30_test_y_prediction_LongTrend.csv
y label: 02_Training_Data/omxs30_y_labels.csv
feature columns: 02_Training_Data/omxs30_X_selected_feature_columns.csv



## Imports

In [16]:
#Imports
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as m
from sklearn.externals import joblib
import matplotlib.dates as mdates
import datetime

import DataSupportFunctions as sup
import DatavisualizationFunctions as vis

from sklearn.model_selection import train_test_split

# Global NumPy print settings: show three decimals and suppress
# scientific notation when printing arrays
np.set_printoptions(precision=3, suppress=True)

#### Pipeline elements

In [17]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
# Column extractor object

class Nosampler(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that performs no sampling at all.

    It serves as the baseline when comparing against common over-, under-
    and combined samplers.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return the instance itself."""
        return self

    def transform(self, X):
        """Hand back ``X`` exactly as it was received."""
        return X

#res = Nosampler().transform(X_train.values)
#display(res)
#X_train.values

In [18]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
# Column extractor object

class ColumnExtractor(BaseEstimator, TransformerMixin):
    '''Feature selector that keeps only the configured column positions of X.

    Sources:
    https://stackoverflow.com/questions/25250654/how-can-i-use-a-custom-feature-selection-function-in-scikit-learns-pipeline,
    http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/.

    Parameters
    ----------
    cols : sequence of int or None, default (0,)
        Column positions to extract. ``None`` disables the selection, so
        ``transform`` returns X unchanged.
    '''

    def __init__(self, cols=(0,)):
        # The default is an immutable tuple: a list default would be one shared
        # object, so mutating ``cols`` on one instance would silently change the
        # default of every later instance.
        # The value is stored unmodified, as expected of estimator parameters.
        self.cols = cols

    def fit(self, X, y=None):
        '''Nothing is learned from the data; return the instance itself.'''
        return self

    def transform(self, X):
        '''Return the selected columns of X, or X itself when ``cols`` is None.

        X must support ``X[:, cols]`` indexing (e.g. a 2-D NumPy array).
        '''
        if self.cols is None:
            return X
        return X[:, self.cols]

#X_train[['MA2Norm', 'MA5Norm']]
#colList = [0, 1]
#col = ColumnExtractor(cols=colList).transform(X_train.values)
#col

In [19]:
def getListFromColumn(df, df_source, col_number):
    """Translate the names listed in one column of ``df`` into column positions of ``df_source``.

    Parameters
    ----------
    df : DataFrame whose column at position ``col_number`` lists column names
        (missing entries are ignored).
    df_source : DataFrame whose columns are searched for those names.
    col_number : position of the column of ``df`` to read.

    Returns
    -------
    list of int
        Positions of the columns of ``df_source`` whose name is listed, in
        the column order of ``df_source``.
    """
    name_column = df.columns[col_number]
    wanted_names = list(df[name_column].dropna().values)

    positions = []
    for position, source_name in enumerate(df_source):
        if source_name in wanted_names:
            positions.append(position)
    return positions

#display(feature_dict)

#Index(pd.Series(data=list(df_X.columns))).get_loc(x)

## Load Unknown Data

In [20]:
# Load the prepared features of the unknown data; the 'id' column becomes the row index
df_X = pd.read_csv(X_filename_test, delimiter=';').set_index('id')
print("Loaded feature names for X={}".format(df_X.columns))
print("X. Shape={}".format(df_X.shape))
print("Indexes of X={}".format(df_X.index.shape))

# Load the original data for visualization
df_timegraph = pd.read_csv(filename_timegraph, delimiter=';').set_index('id')
# 'Time' is kept as a datetime column. A date2num conversion whose result is not
# assigned has no effect, so none is applied here.
df_timegraph['Time'] = pd.to_datetime(df_timegraph['Time'])
print("Loaded feature names for time graph={}".format(df_timegraph.columns))
# Label corrected: this is the shape of the time graph data, not of X
print("Time graph. Shape={}".format(df_timegraph.shape))
display(df_timegraph.head())

# Load the class labels (file has no header row; first column is used as the key)
df_y_classes = pd.read_csv(ylabel_filename, delimiter=';', header=None)
# NOTE(review): sup.inverse_dict presumably swaps keys and values of the mapping - confirm in DataSupportFunctions
y_classes = sup.inverse_dict(df_y_classes.set_index(df_y_classes.columns[0]).to_dict()[1])
print("Loaded classes into dictionary: {}".format(y_classes))

# Load the table of selected feature columns (one column per selection method)
df_feature_columns = pd.read_csv(X_filename_selected_feature_column, delimiter=';')
print("Selected features")
display(df_feature_columns)

# SECURITY: joblib.load unpickles the file and can execute arbitrary code,
# so only load model files from a trusted source.
# Custom pipeline classes stored in the model must already be defined in this notebook.
model = joblib.load(model_filename)
print("Loaded trained model from ", model_filename)
print("Model", model)

Loaded feature names for X=Index(['MA2Norm', 'MA5Norm', 'MA10Norm', 'MA18Norm', 'MA20Norm', 'MA40Norm',
       'MA50Norm', 'MA75Norm', 'MA100Norm', 'MA125Norm', 'MA150Norm',
       'MA175Norm', 'MA200Norm', 'MA225Norm', 'MA250Norm', 'MA275Norm',
       'MA300Norm', 'MA325Norm', 'MA350Norm', 'MA375Norm', 'MA400Norm',
       'MA2NormDiff', 'MA5NormDiff', 'MA10NormDiff', 'MA18NormDiff',
       'MA20NormDiff', 'MA40NormDiff', 'MA50NormDiff', 'MA75NormDiff',
       'MA100NormDiff', 'MA125NormDiff', 'MA150NormDiff', 'MA175NormDiff',
       'MA200NormDiff', 'MA225NormDiff', 'MA250NormDiff', 'MA275NormDiff',
       'MA300NormDiff', 'MA325NormDiff', 'MA350NormDiff', 'MA375NormDiff',
       'MA400NormDiff', 'RSI2', 'RSI5', 'RSI14', 'RSI20', 'RSI25',
       'Stoch_slowk13', 'Stoch_slowd8', 'MACD12_26_9', 'MACD12_26_9Diff',
       'MACD12_26_9DiffDirChange'],
      dtype='object')
X. Shape=(258, 52)
Indexes of X=(258,)
Loaded feature names for time graph=Index(['Time', 'High', 'Low', 'Close'], dty

Unnamed: 0_level_0,Time,High,Low,Close
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
400,2018-08-07,1621.26,1610.52,1619.89
401,2018-08-08,1624.1,1614.73,1622.63
402,2018-08-09,1631.74,1618.72,1631.23
403,2018-08-10,1627.44,1613.39,1616.82
404,2018-08-13,1616.34,1607.81,1611.8


Loaded classes into dictionary: {0: 'negative', 1: 'positive'}
Selected features


Unnamed: 0,Lasso,Tree,Backward_Elimination,RecursiveTop13,RecursiveTop26,RecursiveTop39,Manual,All
0,MA250Norm,MACD12_26_9,MA20Norm,MA225Norm,MA225Norm,MA225Norm,MA100Norm,MA2Norm
1,MA225Norm,RSI14,MA75Norm,MA250Norm,MA250Norm,MA250Norm,MA125Norm,MA5Norm
2,MA5Norm,MA40Norm,MA100Norm,MA200Norm,MA200Norm,MA200Norm,MA150Norm,MA10Norm
3,MA10Norm,RSI20,MA125Norm,MA275Norm,MA275Norm,MA275Norm,MA200Norm,MA18Norm
4,MA20Norm,MA50Norm,MA150Norm,MA175Norm,MA175Norm,MA175Norm,MA20Norm,MA20Norm
5,RSI25,RSI25,MA200Norm,MA300Norm,MA300Norm,MA300Norm,MA225Norm,MA40Norm
6,,MA75Norm,MA225Norm,MA150Norm,MA150Norm,MA150Norm,MA250Norm,MA50Norm
7,,MA100Norm,MA250Norm,MA325Norm,MA325Norm,MA325Norm,MA375Norm,MA75Norm
8,,MA150Norm,MA375Norm,MA125Norm,MA125Norm,MA125Norm,MA75Norm,MA100Norm
9,,MA125Norm,MA75NormDiff,MA400Norm,MA400Norm,MA400Norm,RSI14,MA125Norm


Loaded trained model from  C:\Users\wendt.ICT\Documents\Alexander\03_PythonProjects\04_OmxTemplate/04_Model/omxs30_final_model.sav
Model Pipeline(memory=None,
         steps=[('scaler',
                 QuantileTransformer(copy=True, ignore_implicit_zeros=False,
                                     n_quantiles=1000,
                                     output_distribution='uniform',
                                     random_state=None, subsample=100000)),
                ('sampling', Nosampler()),
                ('feat',
                 ColumnExtractor(cols=[6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20,
                                       30, 45])),
                ('svm',
                 SVC(C=1738.2156298329437, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma=0.2336648982574579, kernel='rbf', max_iter=-1,
                     probability=True, random_state=None, shrinking=True,
                

In [21]:
# For every column of the feature selection table, collect the positions of
# the listed features within df_X
selected_features = [
    getListFromColumn(df_feature_columns, df_X, col_number)
    for col_number in range(df_feature_columns.shape[1])
]
# Column header of the selection table -> list of feature positions
feature_dict = {
    method_name: positions
    for method_name, positions in zip(df_feature_columns.columns, selected_features)
}

## Predict new Trend

In [22]:
# Alias the loaded model and the feature frame under the names used below
optclf = model
X = df_X
features = X.values

print("Predict unknown data")
# Predicted classes, decision function scores and class probabilities
y_test_pred = optclf.predict(features)
y_test_pred_scores = optclf.decision_function(features)
y_test_pred_proba = optclf.predict_proba(features)

# One-column frame 'y' holding the predictions, keyed by the index of X and sorted by it
y_series = pd.Series(data=y_test_pred, index=X.index, name="y")
y_order_test_pred = pd.DataFrame(index=X.index, data=y_series).sort_index()

Predict unknown data


In [23]:
def amplifyForPlot(binaryArray, targetArray, distance):
    """Scale ``targetArray`` by ``binaryArray`` and shrink the result by ``distance``.

    Returns ``binaryArray * targetArray * (1 - distance)``; with a 0/1 signal
    this leaves the target value where the signal is 1 and zero elsewhere.
    """
    shrink_factor = 1 - distance
    masked_target = binaryArray * targetArray
    return masked_target * shrink_factor

In [24]:
%matplotlib notebook

# Plot the closing price together with the predicted signal
time_axis = df_timegraph['Time']
close_values = df_timegraph['Close'].values

fig, ax = plt.subplots(figsize=(11.5, 7), dpi=80, facecolor='w', edgecolor='k')

# Blue: closing price. Green: closing price where the prediction is 1, zero elsewhere.
# NOTE(review): assumes y_order_test_pred and df_timegraph share the same row order - confirm
ax.plot(time_axis, df_timegraph['Close'])
ax.plot(time_axis, amplifyForPlot(y_order_test_pred['y'].values, close_values, 0.0), color='green')

ax.set_title("Plot results")
# Limit the y axis to the price range with a small margin
ax.set_ylim([np.min(close_values)*0.99, np.max(close_values)*1.02])
ax.grid()
plt.show()

<IPython.core.display.Javascript object>

In [25]:
# Attach the predicted class to the price data and show the last five rows
predicted_y = y_order_test_pred['y']
df_timegraph.join(predicted_y).tail(5)

Unnamed: 0_level_0,Time,High,Low,Close,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
653,2019-08-12,1550.96,1528.11,1533.06,1
654,2019-08-13,1548.4,1509.61,1538.53,1
655,2019-08-14,1539.35,1498.41,1500.06,1
656,2019-08-15,1509.58,1476.83,1495.71,0
657,2019-08-16,1519.56,1496.94,1519.41,1
