# Outcome and Feature Construction
Generate the class values, i.e. the y for the data. Construct features

The following dataframes are generated:
- source
- features
- outcomes

## Parameters
Here, all parameters of the notebook are set

In [2]:
#from IPython.core.display import display
#from pylab import *

In [1]:
use_training_settings = True
dataset_name = "omxs30"
class_name = "LongTrend"

if use_training_settings == True:
    #Load stock market files 
    filename = '01_Source/Source_OMX1986-2019.CSV'
    filedataresultdirectory = "02_Training_Data"
    
    filenameprefix = dataset_name + "_" + class_name
    filenamesuffix = ""
    
    #Allow cropping of data, which are longer than moving averages in the future. For training data, this value shall be 
    #true to not make false values. For the test values, which do not use any y values, the value shall be false.
    cut_data = True
else:
    ### FOR TEST DATA ###
    filename = '01_Source/Source_OMX2017-2019_testset.CSV'
    filedataresultdirectory = "03_Test_Prepared_Data"
    
    filenameprefix = dataset_name + "_" + class_name
    filenamesuffix = "_test" #Add test to the test files

    #Allow cropping of data, which are longer than moving averages in the future. For training data, this value shall be 
    #true to not make false values. For the test values, which do not use any y values, the value shall be false.
    cut_data = False

In [2]:
dataset_filename = filedataresultdirectory + "/" + filenameprefix + "_dataset" + filenamesuffix + ".csv"
ylabel_filename = filedataresultdirectory + "/" + filenameprefix + "_y" + "_labels" + filenamesuffix + ".csv"
source_filename = filedataresultdirectory + "/" + filenameprefix + "_timegraph" + filenamesuffix + ".csv"

## Load Raw Data
Stock data will be loaded, train and test values will be calculated to be used as Y values for the system

In [3]:
#%matplotlib notebook

import pandas as pd
import numpy as np
# import pandas_datareader as datareader
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime

import DatavisualizationFunctions as vis
#from matplotlib.finance import candlestick_ohlc
# from mpl_finance import candlestick_ohlc

#%matplotlib inline
#%matplotlib notebook
#%matplotlib ipympl

source = pd.read_csv(filename, sep=';')
source.index.name = "id"
source.columns = ['Time', 'High', 'Low', 'Close']
source['Time'] = pd.to_datetime(source['Time'])
source['Time'].apply(mdates.date2num)
#df.head()

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

plt.interactive(True)

plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
plt.plot(source['Time'],source['Close'])
plt.title(filename)
plt.show()

NameError: name 'filename' is not defined

## Class Generation

Here, 4 classes will be generated:
- LongTerm
- Intermediate term 20d
- Short term 5d
- very short term 1d

#### Create labels

In [4]:
y_labels = {
    'neutral' : 0,
    'positive' : 1,
    'negative':  2
}

### Calculate Tops and Bottoms

In [42]:
m=source.shape[0]
factor = 10000000
topsTemp = np.zeros([m, 4]);
topsTemp
bottomsTemp = np.ones([m, 4])*factor;
bottomsTemp
#close=source['Close']
#close

array([[10000000., 10000000., 10000000., 10000000.],
       [10000000., 10000000., 10000000., 10000000.],
       [10000000., 10000000., 10000000., 10000000.],
       ...,
       [10000000., 10000000., 10000000., 10000000.],
       [10000000., 10000000., 10000000., 10000000.],
       [10000000., 10000000., 10000000., 10000000.]])

In [43]:
# Get tops and bottoms from the chart
#Parameter
maxDecline = 0.02
maxIncrease = 0.02
factor = 10000000

#Format: Time, High, Low, Close
m=source.shape[0]

topsTemp = np.zeros([m, 4])
bottomsTemp = np.ones([m, 4])*factor

high=source['High']
low=source['Low']
close=source['Close']

#Run 1 for the rough tops and bottoms
for i, data in enumerate(source.values):
    #Get top
    if i>3 and i<m-3:
        #Decline close >2% from top high
        decline = (high[i] - min(close[i+1:i+2]))/high[i];
        if decline > maxDecline or high[i]==max(high[i-3:i+3]):
            #Top found
            topsTemp[i, 1]=high[i];
            #print("Top found at i={} value={}".format(i, high[i]));
    
    #%Get bottom
    if i>3 and i<m-3:
    #    %Decline close >2% from top high
        increase = (low[i] - max(close[i+1:i+2]))/low[i];
        if increase > maxIncrease or low[i]==min(low[i-3:i+3]):
            #Top found
            bottomsTemp[i, 1]=low[i];
            #print("Bottom found at i={} value={}".format(i, low[i]));

print("{} tops, {} bottoms found.".format(sum(topsTemp[:, 1]>0), sum(bottomsTemp[:, 1]<factor)));

#%Run 2 for exacter tops and bottoms
iTop = topsTemp[:,1];
iBottom = bottomsTemp[:,1];
for i, data in enumerate(source.values):
    #Tops
    if i>20 and i<m-20:
        if iTop[i]>0 and max(iTop[i-15:i+15])<=iTop[i]:
            topsTemp[i, 2]=iTop[i];
            #%fprintf("Intermediate top found at i=%i value=%.0f\n", i, iTop(i));
    
        if iBottom[i]<factor and min(iBottom[i-15:i+15])>=iBottom[i]:
            bottomsTemp[i, 2]=iBottom[i];
            #%fprintf("Intermediate bottom found at i=%i value=%.0f\n", i, iBottom(i));

bottomsTemp[bottomsTemp==factor]=0
bottoms=bottomsTemp[:, 2]
tops=topsTemp[:, 2]
print("Reduced to {} tops and {} bottoms.".format(sum(tops[:]>0), sum(bottoms[:]>0)));

1739 tops, 1237 bottoms found.
Reduced to 193 tops and 222 bottoms.


In [44]:
#topsTemp[topsTemp[:,1]>0]

In [45]:
#bottomsTemp[0:10,:]

In [46]:
plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
plt.plot(source['Time'],source['Close'])
plt.plot(source['Time'], tops[:])
plt.plot(source['Time'], bottoms[:])
plt.title(filename)
plt.show()

<IPython.core.display.Javascript object>

In [47]:
#Calculate the latest single event from a list of [0 0 0 0 2 0 0 1 0]->[0 0 0 0 2 2 2 2 1 1]
def calculateLatestEvent(eventList):
    previousItem=0;
    result = np.zeros(eventList.shape[0])
    for i in range(len(eventList)):
        if eventList[i]!=previousItem and eventList[i]!= 0:
            result[i]=eventList[i]
            previousItem=eventList[i]
        else: 
            result[i]=previousItem
    return result

latestBottoms = calculateLatestEvent(bottoms)
latestTops = calculateLatestEvent(tops)

In [48]:
#Calculate varios MA
#mov = pd.Series(np.arange(0, 100, 1), name='test')
#print(mov)
#Moving Average 
# mov: close
# n: Number of samples
# shift: shift of the window. shift < 0 future, shift > 0 history
def MA(mov, n, shift):  
    MA = mov.rolling(n).mean()
    #print(MA)
    source=pd.DataFrame(MA)
    source.columns = ['SMA' + str(n) + 'shift' + str(shift)]
    shiftedMA = source.shift(shift)
    
    return shiftedMA

In [49]:
ma50Future = MA(close, 50, -50)

In [50]:
#Use Lowess to create a smoothed trend as an y value for a long trend
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np
from scipy.ndimage.interpolation import shift

#Fraction for the lowess smoothing function
def calculate_lowess(days_to_consider, close, time):
    frac=days_to_consider/len(source['Close'])
    filtered = lowess(source['Close'], source['Time'], frac=frac)
    #Calculate the dlowess/dt to see if it is raising or declining
    shiftCol = filtered[:,1] - shift(filtered[:,1], 1, cval=np.NaN)
    pos_trend = shiftCol>0
    #print(pos_trend[0:5])

    fig = plt.figure(num=None, figsize=(10, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(source['Time'],source['Close'])
    plt.plot(source['Time'], filtered[:, 1], 'r-', linewidth=3)
    plt.plot(source['Time'], filtered[:, 1]*pos_trend, 'g-', linewidth=3)
    #plt.plot(source['Time'], filtered[:, 1]*pos_trend_cleaned, 'y-', linewidth=3)
    
    return pos_trend, fig

pos_trend_long, fig_long = calculate_lowess(300, source['Close'], source['Time'])
plt.gca()
plt.show()

  del sys.path[0]


<IPython.core.display.Javascript object>

In [51]:
pos_trend_short, fig_short = calculate_lowess(10, source['Close'], source['Time'])
plt.gca()
plt.show()

  del sys.path[0]


<IPython.core.display.Javascript object>

### Calculate the Y values for 1d, 5d, 20d and the Long Trend

In [52]:
#Calculte the 1d trend

high=source['High']
low=source['Low']
close=source['Close']

y1day=np.zeros(m)
#Calculate the 5d trend
y5day=np.zeros(m)
#20d trend
y20day=np.zeros(m)
#long term trend
ylong=np.zeros(m)
signalLong=0;

for i in range(m-50):
    #=== 1d trend ===#
    if close[i+1]>close[i]:
        y1day[i]=1
    
    
    
    #=== 5day short trend ===#
    #if (pos_trend_short[i+10]==True) and (pos_trend_short[i+1]==True) and (pos_trend_short[i+2]==True) and (future_difference>0.001) and close[i+1]>close[i]:
    #Positive buy
    if i>5 and np.max(bottoms[i-5:i-1])>0 and np.mean(close[i+1:i+5])>close[i]:
        y5day[i]=1;
    
    #negtive, sell
    if i>5 and np.max(tops[i-5:i-1])>0 and np.mean(close[i+1:i+5])<close[i]:
        y5day[i]=2;
    
    #=== median trend 20d ===#
    if close[i+20]>close[i]:
        y20day[i]=1;
    
    #=== long term trend ===#
    #Trigger positive, buy
    if pos_trend_long[i] == True and close[i]>latestTops[i]:
        signalLong=1;
    #negative, sell
    elif pos_trend_long[i] == False and close[i]<latestBottoms[i]:
        signalLong=2;
    
    if signalLong==1:
        ylong[i] = 1;
    elif signalLong==2:
        ylong[i] = 2;
    else:
        ylong[i] = 0;

    #=== end ===#
print("y1day", sum(y1day))
print("y5day", sum(y5day))
print("y20day", sum(y20day))
print("ylong", sum(ylong))
print("Generated trends 1d, 5d, 20d, long.")

y1day 4254.0
y5day 1733.0
y20day 4771.0
ylong 10717.0
Generated trends 1d, 5d, 20d, long.


In [53]:
#Clean bad signals 1
previousSignalCount = sum(y1day)
for i in range(m-50):
    #If the signal is only valid for one or 2 days the signal was bad and
    #noisy. Only if the signal is valid for 3 days, it can be consideres as
    #a real signal
    if np.mean(y1day[i:i+3])<0.75:
        y1day[i]=0

print("Previous signal count y1day={}. New signal count={}".format(previousSignalCount, sum(y1day)))
print("Cleaned bad signals 1");

Previous signal count y1day=4254.0. New signal count=1215.0
Cleaned bad signals 1


In [54]:
#Clean bad signals 2, filter single days, enhance trend
print("signals y1day=", sum(y1day))
print("signals ylong=", sum(ylong))

#for i in range(m-50):
    # short term +1d
    #if i>1 and y1day[i-1]==0 and y1day[i+1]==0:
    #    y1day[i]=0;
    
    #long term, remove all values < 5 days to remove noise
    #use sliding window
    #if i>5 and ylong[i]==1:
    #    slideresult = np.zeros(5);
    #    for j in range(-5,0):
    #        slideresult[j+5] = np.mean(ylong[i+j:i+j+4])
    #    
    #    if max(slideresult)<1:
    #        ylong[i]=0;

print("signals y1day=", sum(y1day))
print("signals ylong=", sum(ylong))
print("Cleaned bad signals 2");

signals y1day= 1215.0
signals ylong= 10717.0
signals y1day= 1215.0
signals ylong= 10717.0
Cleaned bad signals 2


In [55]:
#Clean bad signals 3, filter single days
print("signals ylong=", sum(ylong))
for i in range(m-50):
    #long term, fill in all values < 5 days to remove noise
    #Fill gaps
    #Use sliding window
    if i>20-1 and ylong[i]==0 and np.mean(ylong[i-20:i+20])>0.5:
        ylong[i]=1;
    
    #Enhance the trend to run as far as possible
    if i>=1 and ylong[i-1]==1 and ylong[i]==0 and close[i]>latestBottoms[i]:
        ylong[i]=1;

print("signals ylong=", sum(ylong))
print("Cleaned bad signals 3.");

signals ylong= 10717.0
signals ylong= 10717.0
Cleaned bad signals 3.


In [56]:
#Merge all y values to the series start
outcomes=source.join(
    pd.Series(y1day, name="1dTrend").astype('int64')).join(
    pd.Series(y5day, name="5dTrend").astype('int64')).join(
    pd.Series(y20day, name="20dTrend").astype('int64')).join(
    pd.Series(ylong, name="LongTrend").astype('int64'))
#result = [source array2table([tops bottoms y1day y5day y20day ylong], 'VariableNames',{'Tops', 'Bottoms', 'answer1d', 'answer5d', 'answer20d', 'answerLong'})];
#result(end-100:end,:)=[];

In [57]:
if cut_data == True:
    #Drop the 50 last values as they cannot be used for prediction as +50 days ahead is predicted
    outcomes_cut = outcomes.drop(outcomes.tail(50).index, inplace=False)
    #Drop from the timerows too
    source_cut = source.drop(source.tail(50).index, inplace=False)
else:
    source_cut = source
    outcomes_cut = outcomes

#fprintf("Finish\n");
display(outcomes_cut.head())
display(outcomes_cut.tail())
#merged_source.shape
#source.shape

Unnamed: 0_level_0,Time,High,Low,Close,1dTrend,5dTrend,20dTrend,LongTrend
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1986-09-30,125.0,125.0,125.0,1,0,1,0
1,1986-10-01,125.93,125.93,125.93,1,0,1,1
2,1986-10-02,126.25,126.25,126.25,1,0,1,1
3,1986-10-03,126.68,126.68,126.68,1,0,1,1
4,1986-10-06,128.15,128.15,128.15,0,0,1,1


Unnamed: 0_level_0,Time,High,Low,Close,1dTrend,5dTrend,20dTrend,LongTrend
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8082,2018-11-27,1505.07,1490.76,1496.41,0,0,0,2
8083,2018-11-28,1505.74,1493.12,1500.45,0,0,0,2
8084,2018-11-29,1518.62,1501.65,1515.38,0,0,0,2
8085,2018-11-30,1519.18,1509.22,1514.63,0,0,0,2
8086,2018-12-03,1544.39,1523.88,1529.78,0,0,0,2


In [58]:
#def amplifyForPlot(binaryArray, targetArray, distance):
#    return binaryArray * targetArray * (1-distance)
#amplifyForPlot(merged_df['LongTrend'], merged_df['Close'], 0.01)

#### Plot 3 class data

%matplotlib notebook

def plot_three_class_graph(y0, y1, y2, y_ref, y_time, offset1, offset2, offset3, legend):
    
    plot_data_OK = amplifyForPlot(y0, y_ref, offset1)
    plot_data_blim = amplifyForPlot(y1, y_ref, offset2)
    plot_data_tlim = amplifyForPlot(y2, y_ref, offset3)
    
    # Plot test data
    plt.figure(num=None, figsize=(11.5, 7), dpi=80, facecolor='w', edgecolor='k')

    plt.plot(y_time, y_ref)
    plt.plot(y_time, plot_data_OK, color='grey')
    plt.plot(y_time, plot_data_blim, color='green')
    plt.plot(y_time, plot_data_tlim, color='red')
    plt.title("Prediction Results")
    plt.ylim([np.min(y_ref)*0.99999, np.max(y_ref)*1.00002])
    plt.grid()
    plt.legend(legend)
    plt.show()

In [59]:
#Present long term term data
#y_order_data_ok = (outcomes_cut['LongTrend'].values==0)*1
#y_order_data_blim = (outcomes_cut['LongTrend'].values==1)*1
#y_order_data_tlim = (outcomes_cut['LongTrend'].values==2)*1
#plot_three_class_graph(y_order_data_ok, y_order_data_blim, y_order_data_tlim, source_cut['Close'], 
#                       source_cut['Time'], 0,0,0,('close', 'neutral', 'positive', 'negative'))
vis.plot_three_class_graph(outcomes_cut['LongTrend'].values, source_cut['Close'], 
                       source_cut['Time'], 0,0,0,('close', 'neutral', 'positive', 'negative'))

<IPython.core.display.Javascript object>

In [60]:
#Present short term data
#y_order_data_ok = (outcomes_cut['5dTrend'].values==0)*1
#y_order_data_blim = (outcomes_cut['5dTrend'].values==1)*1
#y_order_data_tlim = (outcomes_cut['5dTrend'].values==2)*1
#plot_three_class_graph(y_order_data_ok, y_order_data_blim, y_order_data_tlim, source_cut['Close'], 
#                       source_cut['Time'], 0,0,0,('close', 'close', 'bottom', 'top'))

vis.plot_three_class_graph(outcomes_cut['5dTrend'].values, source_cut['Close'], 
                       source_cut['Time'], 0,0,0,('close', 'neutral', 'positive', 'negative'))

<IPython.core.display.Javascript object>

#### Plot 2 class data

In [61]:
def plot_two_class_graph(binclass, y_ref, y_time, offset_binclass, legend):
    plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(y_time, y_ref)
    plt.plot(y_time,vis.amplifyForPlot(binclass, y_ref, offset_binclass), color='orange')
    plt.title(filename)
    plt.title("Prediction Results")
    plt.ylim([np.min(y_ref)*0.99999, np.max(y_ref)*1.00002])
    plt.grid()
    plt.legend(legend)
    plt.show()

plot_two_class_graph(outcomes_cut['LongTrend']-1, source_cut['Close'], source_cut['Time'], 0, ('close', 'Positive Trend'))

<IPython.core.display.Javascript object>

In [62]:
#Outcomes
#Rename cut outcomes to outcomes
outcomes = outcomes_cut
display(outcomes.head(5))

Unnamed: 0_level_0,Time,High,Low,Close,1dTrend,5dTrend,20dTrend,LongTrend
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1986-09-30,125.0,125.0,125.0,1,0,1,0
1,1986-10-01,125.93,125.93,125.93,1,0,1,1
2,1986-10-02,126.25,126.25,126.25,1,0,1,1
3,1986-10-03,126.68,126.68,126.68,1,0,1,1
4,1986-10-06,128.15,128.15,128.15,0,0,1,1


## Generate Features from Raw Data
Generate features based on price data X

Data structure generated: features

In [63]:
#Inputs
close = outcomes['Close']
high = outcomes['High']
low = outcomes['Low']
yCol = outcomes['LongTrend'];

#Define features df
features = pd.DataFrame(index=outcomes.index)