### Import all necessary libraries

In [1]:
import yfinance as yf
import pandas as pd
import datetime
import calendar
import numpy as np
import matplotlib.pyplot as plt
from calendar import monthrange
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

### Get stock data from Yahoo Finance
I have chosen my favourite stock here Bajaj Finance. It is a high beta stocks and thus provides opportunites for short term trading.
Each entry represents 1 day staring from 1 Jan 2011

In [2]:
ticker='BAJFINANCE.NS'
tickerData = yf.Ticker(ticker)
df = tickerData.history(period='1d', start='2011-1-1')

### Bollinger Bands
Bollinger Bands are a technical analysis tool developed by John Bollinger for generating oversold or overbought signals.
There are three lines that compose Bollinger Bands: A simple moving average (middle band) and an upper and lower band.
The upper and lower bands are typically 2 standard deviations +/- from a 20-day simple moving average, but can be modified.

In [3]:
### Bollinger Bands
window = 21
no_of_std = 2

#Calculate rolling mean and standard deviation using number of days set above
rolling_mean = df['Close'].rolling(window).mean()
rolling_std = df['Close'].rolling(window).std()
rolling_vol = df['Volume'].rolling(window).mean()

#create two new DataFrame columns to hold values of upper and lower Bollinger bands
df['Rolling Mean'] = rolling_mean
df['Bollinger High'] = rolling_mean + (rolling_std * no_of_std)
df['Bollinger Low'] = rolling_mean - (rolling_std * no_of_std)
df['Rolling Vol'] = rolling_vol

### Relative Strength Index (RSI)
The relative strength index (RSI) is a momentum indicator used in technical analysis that measures the magnitude of recent price changes to evaluate overbought or oversold conditions in the price of a stock or other asset. The RSI is displayed as an oscillator (a line graph that moves between two extremes) and can have a reading from 0 to 100. The indicator was originally developed by J. Welles Wilder Jr. and introduced in his seminal 1978 book, "New Concepts in Technical Trading Systems."

Traditional interpretation and usage of the RSI are that values of 70 or above indicate that a security is becoming overbought or overvalued and may be primed for a trend reversal or corrective pullback in price. An RSI reading of 30 or below indicates an oversold or undervalued condition.

In [4]:
#### RSI
window = 14
no_of_std = 2
df['GL'] = (df['Close']-df['Open'])/df['Open']
df['rsi']=None
for row in range(df.shape[0]):
    tmp = df['GL'].iloc[row-14:row]
    avg_gain = (tmp[tmp>0].mean()*13+df['GL'].iloc[row])/14
    avg_loss = (abs(tmp[tmp<0].mean())*13+df['GL'].iloc[row])/14
    rs = avg_gain/avg_loss
    df['rsi'].iloc[row] = 100-100/(1+rs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


### MACD
Moving Average Convergence Divergence (MACD) is a trend-following momentum indicator that shows the relationship between two moving averages of a security’s price. The MACD is calculated by subtracting the 26-period Exponential Moving Average (EMA) from the 12-period EMA.

The result of that calculation is the MACD line. A nine-day EMA of the MACD called the "signal line," is then plotted on top of the MACD line, which can function as a trigger for buy and sell signals. Traders may buy the security when the MACD crosses above its signal line and sell—or short—the security when the MACD crosses below the signal line. Moving Average Convergence Divergence (MACD) indicators can be interpreted in several ways, but the more common methods are crossovers, divergences, and rapid rises/falls.

In [5]:
#### macd
df['macd'] = df['Close'].ewm(span=12, adjust=False).mean()-df['Close'].ewm(span=26, adjust=False).mean()
df['signal'] = df['macd'].ewm(span=9, adjust=False).mean()
df['macd_hist'] = df['macd']-df['signal']

### Target variable
Buy today only if stock moves upwards tomorrow

In [6]:
df['Buy'] = df['GL'].shift(-1)
df['Buy'] = df['Buy']>0
df['Buy']=df['Buy'].apply(lambda x:int(x))

In [7]:
def prev_weekday(d, weekday):
    days_ahead = d.weekday() - weekday
    if days_ahead <= 0:
        days_ahead += 7
    return d - datetime.timedelta(days_ahead)
def days2expiry(dt):
    yr = dt.year
    mth = dt.month
    d = datetime.date(yr,mth,monthrange(yr, mth)[1])
    expiry = prev_weekday(d, 3)
    return (expiry-dt.date()).days

### Feature Engineering
Using ideas from all of the above, let us develop some features.

In [8]:
#### Bollinger band features
df['BreakHigh'] = 0
df['BreakClose'] = 0
df['BreakHigh'][df['High']>df['Bollinger High']] = 1
df['BreakClose'][df['Close']>df['Bollinger High']] = 1

df['BreakLow'] = 0
df['BreakBear'] = 0
df['BreakLow'][df['Low']<df['Bollinger Low']] = 1
df['BreakBear'][df['Close']<df['Bollinger Low']] = 1

#### MACD features
df['MACDmoreSignal'] = 0
df['MACDmoreSignal'][df['macd']>df['signal']] = 1

#### RSI features
df['overbought']=0
df['overbought'][df['rsi']>70]=1
df['oversold']=0
df['oversold'][df['rsi']<30]=1

### Volume based features
df['OperatorEntry']=0
df['OperatorEntry'][df['Volume']>df['Rolling Vol']]=1

## HmO, HmC, HmL, OmL, CmL
df['HmO'] = (df['High'] - df['Open'])/df['Open']
df['HmC'] = (df['High'] - df['Close'])/df['Open']
df['HmL'] = (df['High'] - df['Low'])/df['Open']
df['OmL'] = (df['Open'] - df['Low'])/df['Open']
df['CmL'] = (df['Close'] - df['Low'])/df['Open']

### Simple moving average features
### SMAs 20, 50, 100, 200
df['sma_20'] = df['Close'].rolling(20).mean()
df['sma_50'] = df['Close'].rolling(50).mean()
df['sma_100'] = df['Close'].rolling(100).mean()
df['sma_200'] = df['Close'].rolling(200).mean()
df['20g50'] = df['sma_20'] > df['sma_50']
df['20g100'] = df['sma_20'] > df['sma_100']
df['20g200'] = df['sma_20'] > df['sma_200']
df['Cg20'] = df['Close'] > df['sma_20']
df['Cg50'] = df['Close'] > df['sma_50']
df['Cg100'] = df['Close'] > df['sma_100']
df['Cg200'] = df['Close'] > df['sma_200']

### DOW - Day of Week, DFME - Days from Monthly Expiry, DOM - Day of Month
df['date'] = df.index
df['dow'] = df.index.dayofweek
df['DFME'] = df['date'].apply(lambda x:days2expiry(x))
df['DOM'] = df['date'].apply(lambda x:x.day)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slic

### Remove raw data and keep only useful features

In [9]:
df=df[['macd_hist', 'Buy', 'BreakHigh', 'BreakClose',
       'BreakLow', 'BreakBear', 'MACDmoreSignal', 'overbought', 'oversold',
       'OperatorEntry', 'HmO', 'HmC', 'HmL', 'OmL', 'CmL', '20g50', '20g100',
       '20g200','dow', 'DFME', 'DOM', 'Cg20','Cg50','Cg100','Cg200']]
df=df.dropna()

### Logistic Regresssion vs Random Forest
Let's compare OOTB logistic regression to Random Forest

In [10]:
tt_split_days = 1335
col = [k for k in df.columns if k != 'Buy']
X_train = df[col].iloc[:tt_split_days]
y_train = df['Buy'].iloc[:tt_split_days]
X_test = df[col].iloc[tt_split_days:]
y_test = df['Buy'].iloc[tt_split_days:]

In [11]:
def fit_predict(clf):
    y_pred = clf.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print ('Test set statistics:')
    print ("Precision: {}, Recall :{}, Accuracy :{}".format(tp/(tp+fp), tp/(tp+fn), (tp+tn)/(tp+tn+fp+fn)))
    y_pred = clf.predict(X_train)
    tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
    print ('Train set statistics:')
    print ("Precision: {}, Recall :{}, Accuracy :{}".format(tp/(tp+fp), tp/(tp+fn), (tp+tn)/(tp+tn+fp+fn)))

In [12]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
fit_predict(clf)

Test set statistics:
Precision: 0.5018050541516246, Recall :0.54296875, Accuracy :0.5299539170506913
Train set statistics:
Precision: 0.5455746367239102, Recall :0.612759643916914, Accuracy :0.5468164794007491




In [16]:
for max_d in range(2,10):
    for criterion in []:
        print ('max depth:{}'.format(max_d))
        clf = RandomForestClassifier(random_state=0, max_depth=max_d).fit(X_train, y_train)
        fit_predict(clf)
        print ('\n')

Test set statistics:
Precision: 0.4619354838709677, Recall :0.69921875, Accuracy :0.4737327188940092
Train set statistics:
Precision: 0.5658823529411765, Recall :0.7136498516320475, Accuracy :0.5790262172284644
max depth:2

Test set statistics:
Precision: 0.4676470588235294, Recall :0.62109375, Accuracy :0.4875576036866359
Train set statistics:
Precision: 0.6211699164345403, Recall :0.6617210682492581, Accuracy :0.6254681647940075
max depth:3

Test set statistics:
Precision: 0.46935201401050786, Recall :0.5234375, Accuracy :0.49585253456221196
Train set statistics:
Precision: 0.648068669527897, Recall :0.672106824925816, Accuracy :0.650187265917603
max depth:4

Test set statistics:
Precision: 0.4789915966386555, Recall :0.556640625, Accuracy :0.5050691244239631
Train set statistics:
Precision: 0.6813031161473088, Recall :0.7136498516320475, Accuracy :0.6868913857677903
max depth:5

Test set statistics:
Precision: 0.4758942457231726, Recall :0.59765625, Accuracy :0.49953917050691243
Tra

