## 1. Setup

### Install and import dependencies

In [1]:
#!pip install numerapi==2.3.8
#!pip install yfinance
#!pip install simplejson

In [2]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numerapi
import numpy as np
import pandas as pd
import requests as re
import simplejson
import yfinance
from dateutil.relativedelta import relativedelta, FR

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

### Numerapi and Signals tickers

In [4]:
# Read the Numerai API credentials from the environment instead of hardcoding them,
# so that secrets are never stored in (or shared with) the notebook.
# Set NUMERAI_PUBLIC_ID / NUMERAI_SECRET_KEY before starting the kernel.
# NOTE(review): the key pair that was previously committed in this cell should be
# treated as compromised and rotated.
NAPI_PUBLIC_KEY = os.environ.get('NUMERAI_PUBLIC_ID')
NAPI_PRIVATE_KEY = os.environ.get('NUMERAI_SECRET_KEY')

# Unset variables yield None here; presumably numerapi then builds an unauthenticated
# client, which is enough for public data only — TODO confirm for the pinned version.
napi = numerapi.SignalsAPI(NAPI_PUBLIC_KEY, NAPI_PRIVATE_KEY)

In [12]:
# Fetch the currently active Signals tickers; the list can change slightly era to era
eligible_tickers = pd.Series(data=napi.ticker_universe(), name='ticker')
print(f"Number of eligble tickers : {eligible_tickers.size}")

Number of eligble tickers : 5412


In [13]:
# Load the yahoo <-> numerai ticker map (h/t wsouza). The map is still a work in
# progress and is not guaranteed to be 100% correct.
ticker_map = pd.read_csv(
    'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv'
)

# keep only the rows whose bloomberg ticker is in the current Signals universe
is_eligible = ticker_map['bloomberg_ticker'].isin(eligible_tickers)
ticker_map = ticker_map.loc[is_eligible]

numerai_tickers = ticker_map.bloomberg_ticker
yfinance_tickers = ticker_map.yahoo
print(f"Number of eligible tickers in map: {ticker_map.shape[0]}")

Number of eligible tickers in map: 5411


### YFinance price feed download
Download price data (adjusted close) using Yahoo Finance Wrapper

In [16]:
n = 600  # number of tickers per chunk
# split the yahoo ticker series into consecutive slices of at most n rows
chunk_starts = range(0, len(yfinance_tickers), n)
chunk_df = [yfinance_tickers.iloc[lo:lo + n] for lo in chunk_starts]

In [17]:
# Download the price history one ticker chunk at a time and collect the resulting
# long-format frames in `concat_dfs` (concatenated into `full_data` afterwards).
concat_dfs = []
for df in chunk_df:
    try:
        # threads=True for faster performance, but tickers will fail, script may hang
        # threads=False for slower performance, but more tickers will succeed
        temp_df = yfinance.download(df.str.cat(sep=' '), start='2002-12-01', threads=False)
        # keep adjusted close only and reshape to one row per (date, ticker)
        temp_df = temp_df['Adj Close'].stack().reset_index()
    except simplejson.errors.JSONDecodeError as err:
        # best effort: skip a chunk that fails to decode, but say so instead of
        # silently dropping its tickers
        print(f"Skipping chunk of {len(df)} tickers (JSON decode error): {err}")
    else:
        # without this append nothing is ever collected and pd.concat fails
        concat_dfs.append(temp_df)

[*********************100%***********************]  599 of 599 completed
[*********************100%***********************]  600 of 600 completed
[*********************100%***********************]  594 of 594 completed
[*********************100%***********************]  594 of 594 completed
[*********************100%***********************]  595 of 595 completed
[*********************100%***********************]  588 of 588 completed
[*********************100%***********************]  591 of 591 completed
[*********************100%***********************]  594 of 594 completed
[*********************100%***********************]  595 of 595 completed
[*********************100%***********************]  11 of 11 completed


In [None]:
# stack the per-chunk frames row-wise into one table
full_data = pd.concat(objs=concat_dfs, axis=0)

## 2. Data wrangle

In [None]:
# Name the three raw columns (adjusted close only was kept) and index rows by date
full_data = (
    full_data
    .set_axis(['date', 'ticker', 'price'], axis=1)
    .set_index('date')
)

# translate yahoo finance tickers back to their numerai tickers
yahoo_to_numerai = dict(zip(yfinance_tickers, numerai_tickers))
full_data['ticker'] = full_data['ticker'].map(yahoo_to_numerai)

print(f"Number of tickers with data: {len(full_data['ticker'].unique())}")
full_data.head()

### Engineer features
Now that we've downloaded the raw price data, we need to build the features we will use to predict the target. For this example, we'll use RSI (Relative Strength Index), a technical indicator that attempts to measure how "oversold" or "overbought" a stock is. A good rule of thumb is that an RSI > 70 indicates a stock is overbought and an RSI < 30 indicates it is oversold.

In [None]:
def RSI(prices, interval=14):
    '''Compute the Relative Strength Index of a price series.

    Gains and losses are averaged with a simple rolling mean over
    `interval` periods, and RSI = 100 - 100 / (1 + avg_gain / avg_loss).

    Parameters
    ----------
    prices : pandas.Series
        Prices in chronological order.
    interval : int, default 14
        Size of the rolling lookback window.

    Returns
    -------
    pandas.Series
        RSI values aligned with `prices`. The first `interval` entries are
        NaN (one from the diff, the rest from the incomplete window). A
        window with no losses yields 100; a window with no price movement
        at all yields NaN (0 / 0).
    '''
    delta = prices.diff()

    # split the period-over-period changes into gains (>= 0) and losses (<= 0)
    gains = delta.clip(lower=0)
    losses = delta.clip(upper=0)

    avg_gain = gains.rolling(interval).mean()
    # losses are non-positive, so take the magnitude of their mean
    avg_loss = losses.rolling(interval).mean().abs()

    relative_strength = avg_gain / avg_loss
    return 100.0 - (100.0 / (1.0 + relative_strength))


In [None]:
# compute RSI separately for each ticker's own price series
ticker_groups = full_data.groupby('ticker')
price_by_ticker = ticker_groups['price']
full_data['RSI'] = price_by_ticker.transform(RSI)

In [None]:
def _era_quintile(era_values):
    '''Label one era's values with quintile bin numbers (duplicate bin edges dropped).'''
    return pd.qcut(era_values, 5, labels=False, duplicates='drop')


# Bucket RSI into quintiles within each era (date); useful for learning relative ranking
date_groups = full_data.groupby(full_data.index)
full_data['RSI_quintile'] = date_groups['RSI'].transform(_era_quintile)
# drop every row that still has a missing value in any column
full_data.dropna(inplace=True)

full_data.head()

In [None]:
# Regroup by ticker so each lag is taken along that ticker's own rows.
# (A DataFrame is not callable; the grouping has to go through .groupby.)
ticker_groups = full_data.groupby('ticker')

# create lagged features: lag 0 is that day's value, lag 1 is the value from the
# ticker's previous row, etc.
num_days = 5
for day in range(num_days + 1):
    full_data[f'RSI_quintile_lag_{day}'] = ticker_groups['RSI_quintile'].shift(day)

In [None]:
# display the last rows of full_data
full_data.tail()

In [None]:
# Difference of consecutive lagged features (change in RSI quintile by day),
# stored both signed and as an absolute value
for day in range(num_days):
    quintile_change = full_data[f'RSI_quintile_lag_{day}'] - full_data[f'RSI_quintile_lag_{day + 1}']
    full_data[f'RSI_diff_{day}'] = quintile_change
    full_data[f'RSI_abs_diff_{day}'] = np.abs(quintile_change)

In [None]:
# display the last rows of full_data
full_data.tail()

In [None]:
# Model inputs: for each feature family, the columns numbered 0 .. num_days - 1.
# The lag_{num_days} column is not listed; it is only used to form the last diff.
feature_names = [
    f'{prefix}_{num}'
    for prefix in ('RSI_quintile_lag', 'RSI_diff', 'RSI_abs_diff')
    for num in range(num_days)
]
# braces (not parentheses) so the list itself is interpolated into the message
print(f'Features for training:\n {feature_names}')

### Targets 
Every era begins on a Friday. The target ignores the first two days of returns (i.e. Monday, Tuesday) and is then based on the subsequent 4 days of returns (i.e. Tuesday close to Monday close). The targets take on values of {0, 0.25, 0.5, 0.75, 1}, but they are not balanced classes.

Try to think of the targets as normalized and neutralized rankings of returns within each era. 10% of the values take on 0 or 1, 40% of the values take on 0.25 or 0.75 and 50% of the values take on 0.5.

In [None]:
TARGET_NAME = 'target'