-
Notifications
You must be signed in to change notification settings - Fork 10
/
FundTradingAlgo.py
150 lines (127 loc) · 6.21 KB
/
FundTradingAlgo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 5 12:26:38 2020
@author: ArmelFabrice
"""
## Reference: https://github.com/robertmartin8/MachineLearningStocks
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
import os
pwd = r"YourPath"  # TODO: set to the directory that contains dataset.csv
# Columns kept from the raw dataset, in the order the model expects: the first
# five are identifiers / forward-return targets, everything from index 5 onward
# is used as a model feature (see `features = df.columns[5:]` in backtest()).
cols = ['Ticker', 'Close', 'Forward Semester Returns', 'SPY', 'Forward SPY Semester Returns',
'3-months Volume','Shares Outstanding', 'MA50', 'MA200', 'beta', 'Market Cap',
'Shares (Diluted)', 'Net Income', 'Revenue', 'Gross Profit',
'Operating Income (Loss)', 'Cost of Revenue', 'Net Income (Common)',
'Revenue Qrt', 'Net Income (Common) Qrt', 'Revenue Per Share',
'Diluted EPS', 'Profit Margin', 'Operating Margin', 'Net Profit Margin',
'Quarterly Revenue Growth', 'Quarterly Earnings Growth',
'TTM Earnings Growth', 'Total Current Assets', 'Total Assets',
'Total Current Liabilities', 'Short Term Debt', 'Long Term Debt',
'Total Equity', 'Total Cash', 'Total Debt', 'Total Debt/Equity',
'Total Cash Per Share', 'Current Ratio', 'Operating Cash Flow',
'Free Cash Flow', 'Return on Assets', 'Return on Equity', 'EBITDA',
'Enterprise Value', 'Enterprise value Per Revenue',
'Enterprise value Per EBITDA', 'Earnings Per Share', 'PE',
'Book Value Per Share', 'PB', 'PS', 'PEG']
from sklearn.preprocessing import StandardScaler
# Load the snapshot dataset indexed by snapshot Date.
data_df2 = pd.read_csv(os.path.join(pwd, "dataset.csv"), index_col="Date")
# Select the expected columns, drop incomplete rows, and sort chronologically.
# Chained (non-inplace) calls replace the original `dropna(..., inplace=True)`
# on a column slice, which mutated a derived frame in place and triggered
# pandas' SettingWithCopyWarning with unreliable results.
data_df = data_df2[cols].dropna(axis=0, how="any").sort_index(axis=0)
def status_calc(stock, sp500, outperformance=10):
    """Classify whether a stock outperformed the S&P500.

    :param stock: stock forward return in percent (callers pass the
        'Forward Semester Returns' column)
    :param sp500: S&P500 forward return in percent (callers pass the
        'Forward SPY Semester Returns' column)
    :param outperformance: minimum margin, in percentage points, by which the
        stock must beat the index to be labelled True/1
    :return: ``stock - sp500 >= outperformance`` — a bool for scalar inputs,
        element-wise for array/Series inputs
    :raises ValueError: if ``outperformance`` is negative
    """
    if outperformance < 0:
        # The check accepts 0, so the message now says non-negative
        # (the original said "positive", contradicting the condition).
        raise ValueError("outperformance must be non-negative")
    return stock - sp500 >= outperformance
def backtest(data_df, dates, date_test_ini):
    """
    A simple backtest: slice data_df to the [dates[0], dates[1]] window, split
    it chronologically into train/test at date_test_ini, fit a Random Forest
    classifier on the train slice, print test accuracy and precision, then
    compare the 'buy predicted outperformers' strategy against passive
    investment in the S&P500.

    :param data_df: snapshot DataFrame indexed by Date; columns 5+ are features
    :param dates: [start_date, end_date] labels used with .loc slicing (inclusive)
    :param date_test_ini: first date of the test period (inclusive)

    Please note that there is a methodological flaw in this backtest which will
    give deceptively good results, so the results here should not encourage you
    to live trade.
    """
    df = data_df.loc[dates[0]:dates[1]]
    # Everything from column 5 onward is a model feature; the first five
    # columns are identifiers and the forward-return targets.
    features = df.columns[5:]
    X = df[features].values
    # Labels: True when a stock beats the S&P500 by more than the
    # `outperformance` margin (2 percentage points here) over the semester.
    y = list(
        status_calc(
            df["Forward Semester Returns"], df["Forward SPY Semester Returns"], outperformance=2
        )
    )
    # z keeps the realised forward returns so we can track strategy P&L.
    z = np.array(df[["Forward Semester Returns", "Forward SPY Semester Returns"]])
    # Chronological train/test split: rows before date_test_ini train the model.
    df_test = df.loc[date_test_ini:dates[1]]
    split = len(df) - len(df_test)
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]
    z_test = z[split:]
    # Fit the scaler on the training rows only, then apply it to both sets
    # (avoids leaking test-set statistics into training).
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)
    # Generate the predictions, then print test set accuracy and precision.
    y_pred = clf.predict(X_test)
    print('')
    print('From {} to {}'.format(df_test.index[0], df_test.index[-1]))
    print('')
    print("Classifier performance\n", "=" * 20)
    print(f"Accuracy score: {clf.score(X_test, y_test): .2f}")
    print(f"Precision score: {precision_score(y_test, y_pred): .2f}")
    # y_pred is an array of booleans (the labels are bools), so its sum is the
    # number of 'buy' signals.
    num_positive_predictions = sum(y_pred)
    # Bug fix: the original tested `< 0`, which a sum of booleans can never
    # satisfy, and then divided by num_positive_predictions — a guaranteed
    # ZeroDivisionError when nothing was predicted. Bail out early instead.
    if num_positive_predictions == 0:
        print("No stocks predicted!")
        return
    # z_test column 0 is the stock's forward return, column 1 the S&P500's.
    # y_pred acts as a boolean row mask: we 'buy' each predicted outperformer
    # and simultaneously 'buy' the index for comparison.
    stock_returns = 1 + z_test[y_pred, 0] / 100
    market_returns = 1 + z_test[y_pred, 1] / 100
    # Average growth of the predicted 'buy' basket vs. the index over the
    # same holding periods.
    avg_predicted_stock_growth = sum(stock_returns) / num_positive_predictions
    index_growth = sum(market_returns) / num_positive_predictions
    percentage_stock_returns = 100 * (avg_predicted_stock_growth - 1)
    percentage_market_returns = 100 * (index_growth - 1)
    total_outperformance = percentage_stock_returns - percentage_market_returns
    print("\n Stock prediction performance report \n", "=" * 40)
    print("Total Trades:", num_positive_predictions)
    print(f"Average return for stock predictions: {percentage_stock_returns: .1f} %")
    print(
        f"Average market return in the same period: {percentage_market_returns: .1f}% "
    )
    print(
        f"Compared to the index, our strategy earns {total_outperformance: .1f} percentage points more"
    )
if __name__ == "__main__":
    # Each snapshot date appears once per stock; keep every date exactly once
    # while preserving chronological order (data_df is sorted at load time).
    # dict.fromkeys is an O(n) order-preserving de-duplication, replacing the
    # original O(n^2) membership-test loop with identical output.
    unique_indexes = list(dict.fromkeys(data_df.index))
    # j: step (in snapshot dates) between successive backtest windows.
    # f: window length in steps — the last step of each window is the test period.
    j = 1
    f = 4
    for i in range(0, len(unique_indexes) - f * j, j):
        # Train on [i, i+(f-1)*j), test on [i+(f-1)*j, i+f*j].
        dates = [unique_indexes[i], unique_indexes[i + f * j]]
        date_test_ini = unique_indexes[i + (f - 1) * j]
        backtest(data_df, dates, date_test_ini)