-
Notifications
You must be signed in to change notification settings - Fork 10
/
FundTradingAlgo.py
150 lines (127 loc) · 6.21 KB
/
FundTradingAlgo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 5 12:26:38 2020
@author: ArmelFabrice
"""
## Reference: https://github.com/robertmartin8/MachineLearningStocks
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
import os
pwd = r"YourPath"  # TODO: set to the directory that contains dataset.csv
# Columns kept from the raw dataset, in the order the model expects: the first
# five are identifiers / forward-return targets, everything from index 5 onward
# is used as a model feature (see `features = df.columns[5:]` in backtest()).
cols = ['Ticker', 'Close', 'Forward Semester Returns', 'SPY', 'Forward SPY Semester Returns',
'3-months Volume','Shares Outstanding', 'MA50', 'MA200', 'beta', 'Market Cap',
'Shares (Diluted)', 'Net Income', 'Revenue', 'Gross Profit',
'Operating Income (Loss)', 'Cost of Revenue', 'Net Income (Common)',
'Revenue Qrt', 'Net Income (Common) Qrt', 'Revenue Per Share',
'Diluted EPS', 'Profit Margin', 'Operating Margin', 'Net Profit Margin',
'Quarterly Revenue Growth', 'Quarterly Earnings Growth',
'TTM Earnings Growth', 'Total Current Assets', 'Total Assets',
'Total Current Liabilities', 'Short Term Debt', 'Long Term Debt',
'Total Equity', 'Total Cash', 'Total Debt', 'Total Debt/Equity',
'Total Cash Per Share', 'Current Ratio', 'Operating Cash Flow',
'Free Cash Flow', 'Return on Assets', 'Return on Equity', 'EBITDA',
'Enterprise Value', 'Enterprise value Per Revenue',
'Enterprise value Per EBITDA', 'Earnings Per Share', 'PE',
'Book Value Per Share', 'PB', 'PS', 'PEG']
from sklearn.preprocessing import StandardScaler
# Load the snapshot dataset indexed by snapshot Date.
data_df2 = pd.read_csv(os.path.join(pwd, "dataset.csv"), index_col="Date")
# Select the expected columns, drop incomplete rows, and sort chronologically.
# Chained (non-inplace) calls replace the original `dropna(..., inplace=True)`
# on a column slice, which mutated a derived frame in place and triggered
# pandas' SettingWithCopyWarning with unreliable results.
data_df = data_df2[cols].dropna(axis=0, how="any").sort_index(axis=0)
def status_calc(stock, sp500, outperformance=10):
    """Classify whether a stock outperformed the S&P500.

    :param stock: stock forward return in percent (callers pass the
        'Forward Semester Returns' column)
    :param sp500: S&P500 forward return in percent (callers pass the
        'Forward SPY Semester Returns' column)
    :param outperformance: minimum margin, in percentage points, by which the
        stock must beat the index to be labelled True/1
    :return: ``stock - sp500 >= outperformance`` — a bool for scalar inputs,
        element-wise for array/Series inputs
    :raises ValueError: if ``outperformance`` is negative
    """
    if outperformance < 0:
        # The check accepts 0, so the message now says non-negative
        # (the original said "positive", contradicting the condition).
        raise ValueError("outperformance must be non-negative")
    return stock - sp500 >= outperformance
def backtest(data_df, dates, date_test_ini):
    """
    A simple backtest: slice data_df to the [dates[0], dates[1]] window, split
    it chronologically into train/test at date_test_ini, fit a Random Forest
    classifier on the train slice, print test accuracy and precision, then
    compare the 'buy predicted outperformers' strategy against passive
    investment in the S&P500.

    :param data_df: snapshot DataFrame indexed by Date; columns 5+ are features
    :param dates: [start_date, end_date] labels used with .loc slicing (inclusive)
    :param date_test_ini: first date of the test period (inclusive)

    Please note that there is a methodological flaw in this backtest which will
    give deceptively good results, so the results here should not encourage you
    to live trade.
    """
    df = data_df.loc[dates[0]:dates[1]]
    # Everything from column 5 onward is a model feature; the first five
    # columns are identifiers and the forward-return targets.
    features = df.columns[5:]
    X = df[features].values
    # Labels: True when a stock beats the S&P500 by more than the
    # `outperformance` margin (2 percentage points here) over the semester.
    y = list(
        status_calc(
            df["Forward Semester Returns"], df["Forward SPY Semester Returns"], outperformance=2
        )
    )
    # z keeps the realised forward returns so we can track strategy P&L.
    z = np.array(df[["Forward Semester Returns", "Forward SPY Semester Returns"]])
    # Chronological train/test split: rows before date_test_ini train the model.
    df_test = df.loc[date_test_ini:dates[1]]
    split = len(df) - len(df_test)
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]
    z_test = z[split:]
    # Fit the scaler on the training rows only, then apply it to both sets
    # (avoids leaking test-set statistics into training).
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)
    # Generate the predictions, then print test set accuracy and precision.
    y_pred = clf.predict(X_test)
    print('')
    print('From {} to {}'.format(df_test.index[0], df_test.index[-1]))
    print('')
    print("Classifier performance\n", "=" * 20)
    print(f"Accuracy score: {clf.score(X_test, y_test): .2f}")
    print(f"Precision score: {precision_score(y_test, y_pred): .2f}")
    # y_pred is an array of booleans (the labels are bools), so its sum is the
    # number of 'buy' signals.
    num_positive_predictions = sum(y_pred)
    # Bug fix: the original tested `< 0`, which a sum of booleans can never
    # satisfy, and then divided by num_positive_predictions — a guaranteed
    # ZeroDivisionError when nothing was predicted. Bail out early instead.
    if num_positive_predictions == 0:
        print("No stocks predicted!")
        return
    # z_test column 0 is the stock's forward return, column 1 the S&P500's.
    # y_pred acts as a boolean row mask: we 'buy' each predicted outperformer
    # and simultaneously 'buy' the index for comparison.
    stock_returns = 1 + z_test[y_pred, 0] / 100
    market_returns = 1 + z_test[y_pred, 1] / 100
    # Average growth of the predicted 'buy' basket vs. the index over the
    # same holding periods.
    avg_predicted_stock_growth = sum(stock_returns) / num_positive_predictions
    index_growth = sum(market_returns) / num_positive_predictions
    percentage_stock_returns = 100 * (avg_predicted_stock_growth - 1)
    percentage_market_returns = 100 * (index_growth - 1)
    total_outperformance = percentage_stock_returns - percentage_market_returns
    print("\n Stock prediction performance report \n", "=" * 40)
    print("Total Trades:", num_positive_predictions)
    print(f"Average return for stock predictions: {percentage_stock_returns: .1f} %")
    print(
        f"Average market return in the same period: {percentage_market_returns: .1f}% "
    )
    print(
        f"Compared to the index, our strategy earns {total_outperformance: .1f} percentage points more"
    )
if __name__ == "__main__":
    # Each snapshot date appears once per stock; keep every date exactly once
    # while preserving chronological order (data_df is sorted at load time).
    # dict.fromkeys is an O(n) order-preserving de-duplication, replacing the
    # original O(n^2) membership-test loop with identical output.
    unique_indexes = list(dict.fromkeys(data_df.index))
    # j: step (in snapshot dates) between successive backtest windows.
    # f: window length in steps — the last step of each window is the test period.
    j = 1
    f = 4
    for i in range(0, len(unique_indexes) - f * j, j):
        # Train on [i, i+(f-1)*j), test on [i+(f-1)*j, i+f*j].
        dates = [unique_indexes[i], unique_indexes[i + f * j]]
        date_test_ini = unique_indexes[i + (f - 1) * j]
        backtest(data_df, dates, date_test_ini)