In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import sklearn.metrics as metrics
import os
from datetime import datetime
import time
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

Splitting the dataset into features and labels for machine learning and data required for backtesting.

In [0]:
def split_dataset(dataset, random_state=None):
  """Shuffle the labelled dataset and split it into model inputs and backtest data.

  Parameters
  ----------
  dataset : pandas.DataFrame
    Must contain the columns 'Stock Percent Difference',
    'Index Percent Difference' and 'Label'. The caller's frame is not
    modified (all work happens on the reindexed copy).
  random_state : int or None, optional
    Seed for the row shuffle. ``None`` (the default) draws the permutation
    from NumPy's global random state, exactly as before.

  Returns
  -------
  X : numpy.ndarray
    All remaining columns with NaN replaced by 0, min-max scaled per column.
  Y : numpy.ndarray
    Values of the 'Label' column.
  Z : numpy.ndarray
    Two columns per row: stock percent difference, index percent difference.

  X, Y and Z share the same (shuffled) row order.
  """
  # Shuffle the rows; a seed makes the shuffle (and so the later split) repeatable.
  if random_state is None:
    shuffled_index = np.random.permutation(dataset.index)
  else:
    shuffled_index = np.random.RandomState(random_state).permutation(dataset.index)
  dataset = dataset.reindex(shuffled_index)

  # Backtest data and labels are taken before the feature columns are pruned.
  Z = np.array(dataset[['Stock Percent Difference', 'Index Percent Difference']])
  Y = dataset['Label'].values

  # Prices, outcomes and identifiers must not be fed to the model as features.
  columns_to_remove = ['Stock Price',
                       'Index Price',
                       'Stock Percent Difference',
                       'Index Percent Difference',
                       'Ticker',
                       'Report Date',
                       'Fiscal Year',
                       'Publish Date',
                       'Fiscal Period',
                       'Currency']
  for column in columns_to_remove:
    # Explicit membership test instead of a bare `except:` so that unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if column in dataset.columns:
      dataset = dataset.drop(columns=column)
    else:
      print("There is no ", column, " in this dataset.")

  dataset = dataset.fillna(0).drop(columns='Label')

  # NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
  # split, so test-set min/max values influence the training features.
  X = preprocessing.MinMaxScaler().fit_transform(dataset.values)

  return X, Y, Z
  

Dynamically creating the folders where the log files from the experiments are stored.

In [0]:
def path_to_log(base_dir='/content/drive/My Drive/Logs SVM'):
  """Create a timestamped log directory and return its path.

  Parameters
  ----------
  base_dir : str, optional
    Parent directory of the new folder. Defaults to the location that was
    previously hard-coded, so existing calls behave as before.

  Returns
  -------
  str
    '<base_dir>/<dd-mm-YYYY HH:MM:SS>' for the current local time.
  """
  current_time = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
  str_path = '%s/%s' % (base_dir, current_time)
  # exist_ok: two calls within the same second map to the same folder and
  # must not raise FileExistsError.
  os.makedirs(str_path, exist_ok=True)
  return str_path

def spec_path(kernel, gen_path, gamma):
  """Create and return the sub-folder '<gen_path>/<kernel>/<gamma>'.

  Parameters
  ----------
  kernel : str
    Kernel name, used as the first sub-folder.
  gen_path : str
    Existing (or creatable) root folder of the current run.
  gamma : object
    Gamma value, formatted with ``%s`` as the second sub-folder.

  Returns
  -------
  str
    The path of the created folder.
  """
  # Named `full_path` so the local no longer shadows the function itself.
  full_path = gen_path + '/%s/%s' % (kernel, gamma)
  # exist_ok: calling again for the same kernel/gamma must not fail.
  os.makedirs(full_path, exist_ok=True)
  return full_path

Loading the data and splitting it into training and testing datasets.

In [0]:
# Read the labelled fundamentals table from its CSV file into a DataFrame.
big_data = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/Data Sets/labelled_fundamental_data.csv')

In [0]:
# split_dataset returns (X, Y, Z): the scaled feature matrix, the 'Label'
# values, and the stock/index percent differences used later by the backtest.
features, labels, p_change = split_dataset(big_data)

In [0]:
# shuffle=False keeps the row order: the first 75% of the rows become the
# training split and the last 25% the test split (random_state has no effect
# when nothing is shuffled).
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.25, shuffle = False, random_state= 42)

# Keep only the percent changes that belong to the test rows, i.e. everything
# after the training rows. Slicing by len(labels_test) would instead drop the
# first 25% of the rows and leave p_change[0] pointing at a training sample.
# The length guard makes the cell safe to re-run: an already sliced p_change
# is left untouched.
if len(p_change) == len(labels):
  p_change = p_change[len(labels_train):]
assert len(p_change) == len(labels_test), "p_change is not aligned with the test split"

In [0]:
# Report the number of samples in each split, one line per array.
print('\n'.join(
  '%s %d' % (title, len(part))
  for title, part in (
    ('Features train', features_train),
    ('Features test', features_test),
    ('Labels train', labels_train),
    ('Labels test', labels_test),
  )
))

Defining the backtesting method.

In [0]:
def backtest(predicted_labels, labels_test, p_change, path):
  """Simulate investing a fixed amount in every sample predicted as label 1.

  For each prediction equal to 1, 100 units are invested once at the stock's
  percent change ("strategy") and once at the index's percent change
  ("market"). The totals are compared with simply keeping the money, plotted
  as a bar chart and summarised as text.

  Parameters
  ----------
  predicted_labels : sequence
    Predicted class per sample; only entries equal to 1 trigger a trade.
  labels_test : sequence
    True labels. Accepted for call compatibility; not used in the calculation.
  p_change : indexable
    ``p_change[i][0]`` is the stock percent change and ``p_change[i][1]`` the
    index percent change of sample ``i``. Must be aligned with
    ``predicted_labels``.
  path : str
    Directory in which 'backtest.png' is saved.

  Returns
  -------
  str
    Multi-line summary (starts with a newline).
  """
  invest_amount = 100
  total_trades = 0
  if_mark = 0
  if_strat = 0

  for i, label in enumerate(predicted_labels):
    if label == 1:
      if_strat += invest_amount + (invest_amount * (p_change[i][0]/100))
      if_mark += invest_amount + (invest_amount * (p_change[i][1]/100))
      total_trades += 1

  in_bank = total_trades * invest_amount
  compare = if_strat - in_bank
  if total_trades:
    compare_p = ((if_strat-if_mark)/if_mark)*100
    avg_market = ((if_mark-in_bank)/in_bank)*100
    avg_strat = ((if_strat-in_bank)/in_bank)*100
  else:
    # No sample was predicted as 1: nothing was invested, so every relative
    # figure is reported as 0 instead of raising ZeroDivisionError.
    compare_p = avg_market = avg_strat = 0

  string = "\n"
  string += "Total trades: %s \n" %total_trades
  string += "Invest in market: %s \n" %round(if_mark)
  string += "Invest with predictions: %s \n" %round(if_strat)
  string += "Savings: %s \n" %in_bank

  string += "Difference strategy-savings total: %s \n" %round(compare)
  # NOTE(review): compare_p is the strategy's gain relative to the *market*
  # total, although the label says "savings" - confirm the intended wording.
  string += "Difference strategy-savings percent: %s \n" %round(compare_p)

  string += "Average Market Return: %s \n" %round(avg_market)
  string += "Average Strategy return: %s \n" %round(avg_strat)

  x = np.arange(3)
  money = [in_bank, if_mark, if_strat]

  def millions(x, pos):
    """Format a tick value as millions of dollars (args: value, tick position)."""
    return '$%1.1fM' % (x * 1e-6)

  formatter = FuncFormatter(millions)

  fig, ax = plt.subplots()
  ax.yaxis.set_major_formatter(formatter)
  ax.bar(x, money)
  ax.set_xticks(x)
  ax.set_xticklabels(('Savings', 'Market ETF (SPY)', 'Strategy'))
  fig.savefig("%s/backtest.png" %path)
  plt.show()
  # Release the figure; backtest is called once per hyper-parameter combination.
  plt.close(fig)

  return string

Running experiments with different hyper parameters for SVM and logging the results.

In [0]:
path_to_save = path_to_log()

# Hyper-parameter grid: every (kernel, gamma, C) combination is trained once.
kernels = ['rbf']
gammas = [1000]
Cs = [175]

for kernel in kernels:
  for gamma in gammas:
    spec_path_to_save = spec_path(kernel, path_to_save, gamma)
    for C in Cs:

      start_time = time.time()

      file_path = '%s/%s_%s_%s_results.txt' %(spec_path_to_save, kernel, gamma, C)

      # The log-parsing cell further down reads this first line and searches
      # for the exact 'Gammma: ' and ' C:' markers - keep the text unchanged.
      string = 'Tuning parameters for Kernel: %s  Gammma: %s C: %s\n' %(kernel, gamma, C)

      print("Training started.")

      model = svm.SVC(kernel = kernel, C = C, gamma = gamma)
      model.fit(features_train, labels_train)

      predictions = model.predict(features_test)
      print('Training and prediction complete.')

      string += "Accuracy: " + str(metrics.accuracy_score(labels_test, predictions)) + '\n'
      string += "Classification report: \n" + str(metrics.classification_report(labels_test, predictions)) + '\n'
      string += "Confusion matrix: \n" + str(metrics.confusion_matrix(labels_test, predictions)) + '\n'

      end_time = time.time() - start_time

      seconds = 'Time taken (seconds): %.2f \n' %end_time
      print(seconds)

      string += seconds
      string += 'Time taken (minutes): %.2f' %(end_time/60)

      string += backtest(predictions, labels_test, p_change, spec_path_to_save)

      # Write the report as soon as it is complete. The context manager closes
      # the file even on errors, and a failure in the plots below can no
      # longer lose the results.
      with open(file_path, 'w') as f:
        f.write(string + '\n')

      # NOTE(review): the PNG names do not include C, so with several values
      # in Cs each run overwrites the images of the previous one.

      # plot_roc_curve / plot_confusion_matrix were removed in scikit-learn
      # 1.2; prefer the Display.from_estimator API (available since 1.0) and
      # fall back to the old helpers on older installations.
      if hasattr(metrics, 'RocCurveDisplay') and hasattr(metrics.RocCurveDisplay, 'from_estimator'):
        svc_roc = metrics.RocCurveDisplay.from_estimator(model, features_test, labels_test)
      else:
        svc_roc = metrics.plot_roc_curve(model, features_test, labels_test)
      plt.savefig("%s/roc.png" %spec_path_to_save)
      plt.show()

      if hasattr(metrics, 'ConfusionMatrixDisplay') and hasattr(metrics.ConfusionMatrixDisplay, 'from_estimator'):
        svc_conf = metrics.ConfusionMatrixDisplay.from_estimator(model, features_test, labels_test)
      else:
        svc_conf = metrics.plot_confusion_matrix(model, features_test, labels_test)
      plt.savefig("%s/confusion.png" %spec_path_to_save)
      plt.show()

Train a single algorithm and display results from algorithm predictions.

In [0]:
# `time` is already imported in the first cell; no re-import needed here.
start_time = time.time()

SVM = svm.SVC(kernel='rbf', C = 175, gamma = 1000)
print("SVM initalised")

SVM.fit(features_train, labels_train)
print("Training took: ", (time.time()-start_time), "seconds")

predicted_labels = SVM.predict(features_test)

accuracy = metrics.accuracy_score(labels_test, predicted_labels)

print("Accuracy: ", accuracy)
print(metrics.classification_report(labels_test, predicted_labels))
print(metrics.confusion_matrix(labels_test, predicted_labels))

# plot_roc_curve / plot_confusion_matrix were removed in scikit-learn 1.2;
# prefer the Display.from_estimator API (available since 1.0) and fall back
# to the old helpers on older installations.
if hasattr(metrics, 'RocCurveDisplay') and hasattr(metrics.RocCurveDisplay, 'from_estimator'):
  svc_disp = metrics.RocCurveDisplay.from_estimator(SVM, features_test, labels_test)
else:
  svc_disp = metrics.plot_roc_curve(SVM, features_test, labels_test)
plt.show()

if hasattr(metrics, 'ConfusionMatrixDisplay') and hasattr(metrics.ConfusionMatrixDisplay, 'from_estimator'):
  svc_disp = metrics.ConfusionMatrixDisplay.from_estimator(SVM, features_test, labels_test)
else:
  svc_disp = metrics.plot_confusion_matrix(SVM, features_test, labels_test)
plt.show()

# The backtest bar chart is written to /content/backtest.png.
print(backtest(predicted_labels, labels_test, p_change, "/content/"))

Save the trained model.

In [0]:
import joblib

# joblib.dump opens the target file directly and does not create missing
# folders, so make sure the destination directory exists first.
model_path = '/content/logs/SVM_classifier_C_gamma_175.sav'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(SVM, model_path)

Extract data from log files and use it for plotting graphs.

In [0]:
# Accumulators for the values parsed from result log files (one entry per
# parsed file); re-running this cell empties them again.
# Note: `gammas` rebinds the name that held the hyper-parameter list in the
# experiment cell above.
gammas = []
accuracys = []

In [0]:
# Parse one results file written by the experiment loop and append its gamma
# and accuracy to the accumulators. Change the path and re-run the cell to add
# further files; every run appends one more point.
with open('/content/drive/My Drive/Logs SVM/03-05-2020 13:35:49/rbf/rbf_1000_175_results.txt', 'r') as file:
    data = file.readlines()

if len(data) < 2:
    raise ValueError("Results file needs a header line and an accuracy line, got %d line(s)" % len(data))

# Line 0: 'Tuning parameters for Kernel: <kernel>  Gammma: <gamma> C: <C>'
# ('Gammma' with three m's is how the experiment loop writes it).
dat_1 = data[0]
if 'Gammma: ' not in dat_1:
    raise ValueError("No 'Gammma: ' marker in header line: %r" % dat_1)
after_gamma = dat_1.split('Gammma: ', 1)[1]
if ' C:' not in after_gamma:
    raise ValueError("No ' C:' marker after gamma in header line: %r" % dat_1)
gamma_text, c_text = after_gamma.split(' C:', 1)
# float() ignores surrounding whitespace and the trailing newline.
gam = float(gamma_text)
c = float(c_text)

# Line 1: 'Accuracy: <value>'
dat_2 = data[1]
if "Accuracy: " not in dat_2:
    raise ValueError("No 'Accuracy: ' marker in line: %r" % dat_2)
acc = float(dat_2.split("Accuracy: ", 1)[1])

# Append only after everything parsed, so both lists always stay the same length.
gammas.append(gam)
accuracys.append(acc)
print(gam, c, acc)

In [0]:
# Number of gamma values collected so far (shown as the cell output).
len(gammas)

In [0]:
# Create the figure and axes objects
# Create the figure and axes objects
fig, ax = plt.subplots(1, figsize=(10, 6))
fig.suptitle('Accuracy increase based on gamma')

# Plot the data
ax.plot(gammas, accuracys)

# Show the major grid lines as dark grey lines. The on/off flag is passed
# positionally because its keyword was renamed from `b` to `visible` in
# Matplotlib 3.5 and `b=` is rejected by later releases.
ax.grid(True, which='major', color='#666666', linestyle='-')
ax.set_xlabel('Gamma Value')
ax.set_ylabel('Accuracy %')
# Save before plt.show(): afterwards pyplot may no longer hold this figure.
fig.savefig('accuracies.png')
plt.show()

In [0]:
# Save through the figure object created in the plotting cell. After that
# cell's plt.show(), pyplot may no longer have a current figure, and
# plt.savefig would then write a blank image over 'accuracies.png'.
fig.savefig("accuracies.png")