### Genetic Programming - First Model
First implementation of GP using tree depth of 2 to 3 and basic operators. Strongly typed GP is used to ensure final tree node produces required signal string. Fitness function evaluates performance of each trading rule based on buy/sell signals and closing prices on day of financial statement. Uses Robust Scaler and attributes with min. 80% complete data.

In [1]:
import operator
import math
import random
import pandas as pd
import datetime as dt
import numpy as np
import pygraphviz as pgv
import os
import urllib
import json
import requests
from io import StringIO
import sys
from pathlib import Path
from eod import EodHistoricalData

from functools import partial

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

pd.set_option('display.max_rows', None)

In [4]:
#paths for data - set prefix to location of Data folder
path_prefix = r'C:\Users\OEM\GDrive\WQU'
path_att = r'\Data\fundamentals_by_attribute'
path_fun = r'\Data\fundamentals_by_ticker'
path_std = r'\Data\standardised_fundamentals'
path_eda = r'\Data\exploratory_data_analysis'
path_rob = r'\Data\robust_scaling'
path_cor = r'\Data\correlation_matrices'

In [5]:
#read csv of % NaN values by attribute
mean_pct_nans = pd.read_csv(path_prefix + path_eda + "/df_agg.csv", index_col=0).loc['pct_nan']
mean_pct_nans.sort_values(inplace=True)
to_drop = mean_pct_nans[mean_pct_nans > 20].index.to_list()

In [6]:
#read 
filepath = Path(path_prefix + path_fun + '\df_fun_TECK.csv')
df = pd.read_csv(filepath, index_col=0, parse_dates=True)
df.drop(['date', 'filing_date', 'currency_symbol'], inplace=True)
df.drop(to_drop, inplace=True)

In [7]:
#create list of types for when iniitialising Pset (float x num attributes)
num_atts = len(df)
types_list = []
for i in range(num_atts):
    types_list.append(float)

In [8]:
#initialise Pset to receive floats and output string (buy/sell/hold)
pset = gp.PrimitiveSetTyped("main", types_list, str)

In [9]:
#rename input arguments (Terminals) with fundamental attribute names
for i in range(num_atts): 
    ind_name =  df.index[i]
    if ind_name != 'date' and ind_name != 'currency_symbol' and ind_name != 'filing_date' :
        argstring = "ARG{}".format(str(i))
        pset.renameArguments(**{argstring:ind_name})

In [10]:
# Define new functions
def protectedDiv(left, right):
    try:
        return left / right
    except ZeroDivisionError:
        return 1

In [11]:
#define function to provide signal
def buy_sell (A, B):
    if A > B:
        return 'buy'
    elif A < B:
        return 'sell'
    else:
        return 'hold'

In [12]:
#add functions to Primitive set
pset.addPrimitive(buy_sell, [float, float], str)
pset.addPrimitive(operator.add, [float, float], float)
pset.addPrimitive(operator.sub, [float, float], float)
pset.addPrimitive(operator.mul, [float, float], float)
pset.addPrimitive(protectedDiv, [float, float], float)

In [13]:
#create invididual class to maximise fitness
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax, pset=pset)

In [14]:
#instantiate toolbox and register components for evolving trees
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=2, max_=3)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

In [15]:
individual = toolbox.individual()

In [16]:
expr = gp.genFull(pset, min_=1, max_=3)
tree = gp.PrimitiveTree(expr)
str(tree)

'buy_sell(totalAssets, retainedEarnings)'

In [17]:
#load dataframe of robust scaled fundamentals
ticker = "AA"
filepath = Path(path_prefix + path_rob + '\df_rob_{}.csv'.format(ticker))
df_std = pd.read_csv(filepath, index_col=0)
df_std.drop(to_drop, axis=1, inplace=True)
df_std.sort_index(inplace=True)
df_std['signal'] = ""

In [18]:
def eval_trading_rule(individual):
    #transform the tree in to a callable function
    function = toolbox.compile(expr=individual)
    
    #generate signal - pass each value to evolved function as a variable, add signal to df\
    for row in df_std.index:
        var_list = []
        for col in df_std.columns:
            if col != 'signal' and col != 'adjusted_close' and col != 'prev_close':
                var = df_std.loc[row, col]
                var_list.append(var)
        signal = function(*var_list)
        df_std.loc[row, 'signal'] = signal

    #initialise variables
    buy_price = 0
    sell_price = 0
    bank = 10000

    #Loop through df and check entry signal (SELL to BUY) or exit signal (BUY to SELL) generated
    for i in range(len(df_std)):
        loc_ac = df_std.columns.get_loc('adjusted_close') 
        loc_sig = df_std.columns.get_loc('signal') 

        if df_std.iloc[i, loc_sig] == 'buy' and (df_std.iloc[i-1, loc_sig] == 'sell' or df_std.iloc[i-1, loc_ac] == 'hold') and i > 0:
            buy_price = df_std.iloc[i, loc_ac]
            buy_date = df_std.index[i]
            #print('Bought at ${} on {}'.format(buy_price, buy_date))

        elif df_std.iloc[i, loc_sig] == 'sell' and (df_std.iloc[i-1, loc_sig] == 'buy' or df_std.iloc[i-1, loc_sig] == 'hold')  and i > 0:
            sell_price = df_std.iloc[i, loc_ac]
            sell_date = df_std.index[i]
            #print('Sold at ${} on {}'.format(sell_price, sell_date))
            
        else:
            None

        #Check if buy and sell complete and update bank if so
        if buy_price > 0 and sell_price > 0:
            bank = bank + (sell_price - buy_price)
            buy_price = 0
            sell_price = 0
            #print("New bank = {}".format(bank))

    #print("Final bank = {}".format(bank))
    bank = [bank]
    return tuple(bank)

In [19]:
eval_trading_rule(individual)

  return left / right


(10046.874800000001,)

In [20]:
toolbox.register("evaluate", eval_trading_rule)
toolbox.register("select", tools.selTournament, tournsize=5)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genHalfAndHalf, min_=1, max_=1)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

In [21]:
def main():
    random.seed(10)
    pop = toolbox.population(n=250)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    #algorithms.eaSimple(pop, toolbox, 0.5, 0.2, 40, stats, halloffame=None)
    algorithms.eaSimple(pop, toolbox, 0.4, 0.2, 100, stats=stats, halloffame=hof)
    return pop, stats, hof

In [22]:
if __name__ == "__main__":
    pop, stats, hof = main()

  return left / right
  return left / right


gen	nevals	avg    	std    	min    	max    
0  	250   	9991.84	30.8581	9918.27	10085.5
1  	126   	10016.1	30.267 	9903.65	10085.5
2  	124   	10031.2	30.9834	9932.72	10085.5
3  	143   	10037.1	33.5415	9932.61	10089.7




4  	113   	10047.9	35.4257	9929.89	10094.8
5  	121   	10053.2	40.9447	9932.11	10098  
6  	140   	10057.6	40.0978	9923.38	10098  
7  	132   	10060.8	39.2388	9932.34	10098  
8  	124   	10068.8	37.9429	9929.94	10102.4
9  	131   	10067.4	39.4192	9935.92	10102.4




10 	124   	10070.3	40.0239	9939.56	10102.4
11 	127   	10068.2	43.5748	9929.87	10102.4
12 	138   	10066.9	46.4742	9919.45	10102.4
13 	133   	10068  	45.7353	9928.16	10102.4
14 	120   	10073.9	41.1304	9929.87	10102.4
15 	136   	10068.8	43.0271	9935.92	10107.3
16 	129   	10072.2	44.0601	9919.45	10107.3
17 	131   	10066.5	46.8916	9913.11	10107.3
18 	118   	10071.5	42.3209	9936.54	10107.3
19 	128   	10069.9	46.1973	9934.23	10107.3
20 	142   	10067.2	44.9765	9929.87	10107.3
21 	113   	10074  	47.5658	9917.1 	10107.3
22 	106   	10078.2	42.6492	9923.32	10107.3
23 	129   	10073.8	43.3129	9925.99	10107.3
24 	137   	10075.1	42.7266	9922.21	10112.7
25 	138   	10078.3	38.1501	9939.08	10112.7
26 	130   	10083.4	34.3109	9956.99	10112.7
27 	119   	10083.3	37.4023	9936.88	10112.7
28 	119   	10083  	38.0511	9929.9 	10112.7
29 	130   	10085.2	35.5967	9950.97	10113.5
30 	129   	10086.5	35.8799	9966.92	10113.5
31 	114   	10091.7	35.5989	9933.36	10113.5
32 	124   	10090.4	35.2085	9949.39	10113.5
33 	147   	

In [23]:
#show the executable code for the best performing Individual (Hall of Fame)
tree = gp.PrimitiveTree(hof[0])
str(tree)

'buy_sell(add(add(protectedDiv(accumulatedOtherComprehensiveIncome, nonCurrrentAssetsOther), mul(sub(operatingIncome, totalCurrentAssets), costOfRevenue)), totalOtherIncomeExpenseNet), protectedDiv(totalCurrentAssets, nonCurrrentAssetsOther))'

In [215]:
#create tree diagram
nodes, edges, labels = gp.graph(hof[0])

g = pgv.AGraph()
g.add_nodes_from(nodes)
g.add_edges_from(edges)
g.layout(prog="dot")

for i in nodes:
    n = g.get_node(i)
    n.attr["label"] = labels[i]

g.draw("tree.pdf")