In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
%matplotlib inline

# For reading stock data from yahoo
from pandas_datareader.data import DataReader
from pandas_datareader import data as pdr
import yfinance as yf

# For time stamps
from datetime import datetime

yf.pdr_override()
# download dataframe
data = pdr.get_data_yahoo('AAPL', start='2012-01-01', end=datetime.now())
#data = pdr.get_data_yahoo("^GSPC", start="2011-01-01", end="2018-01-01")


[*********************100%***********************]  1 of 1 completed


In [5]:
data=data.drop(['Adj Close','Volume'], axis=1)


In [6]:
# calculate momentum for each day
# 5-day momentum

def momentum(df):
    n = len(df)
    arr = []
    for i in range(0,5):
        arr.append('N')
    for j in range(5,n):
        momentum = df.Close[j] - df.Close[j-5] #Equation for momentum
        arr.append(momentum)
    return arr

momentum = momentum(data)

# add momentum to data
data['Momentum'] = momentum

In [7]:
#Use pct_change() function to add the one day returns to the dataframe 

data_pctchange=data.Close.pct_change()
data['Return'] = data_pctchange

In [8]:
#ROI function

def ROI(df,n):
    m = len(df)
    arr = []
    for i in range(0,n):
        arr.append('N')
    for j in range(n,m):
        roi= (df.Close[j] - df.Close[j-n])/df.Close[j-n] #Equation for ROI
        arr.append(roi)
    return arr

#Run the ROI function for 10, 20, and 30 day periods

ROI10=ROI(data,10)
ROI20=ROI(data,20)
ROI30=ROI(data,30)


#Add all 3 ROI results to dataframe 

data['10 Day ROI']=ROI10
data['20 Day ROI']=ROI20
data['30 Day ROI']=ROI30

In [9]:
# calculate RSI for each day


def RSI(df,period):
    # get average of upwards of last 14 days: Ct - Ct-1
    # get average of downwards of last 14 days: Ct-1 - Ct
    n = len(df)
    arr = []
    for i in range(0,period):
        arr.append('N')
    for j in range(period,n):
        total_upwards = 0
        total_downwards = 0
        # this will find average of upwards
        for k in range(j,j-period,-1):
            if(df.Close[k-1] > df.Close[k]):
                total_downwards = total_downwards + (df.Close[k-1] - df.Close[k])    
        avg_down = total_downwards / period
        for l in range(j,j-period,-1):
            if(df.Close[l] > df.Close[l-1]):
                total_upwards = total_upwards + (df.Close[l] - df.Close[l-1])
        avg_up = total_upwards / period
        RS = avg_up / avg_down
        RSI  = 100 - (100/(1+RS))
        arr.append(RSI)
    return arr


#Run RSI for 10, 14, and 30 day periods

RSI_14 = RSI(data,14)
RSI_10 = RSI(data,10)
RSI_30 = RSI(data,30)

# add RSI to data

data['10_day_RSI'] = RSI_10
data['14_day_RSI'] = RSI_14
data['30_day_RSI'] = RSI_30

In [10]:
# calculate EMA for each day
# formula: EMA = (2/(n+1))*ClosePrice + (1-(2/(n+1)))*previousEMA

def EMA(df, n):
    m = len(df)
    arr = []
    arr.append('N')
    prevEMA = df.Close[0]
    for i in range(1,m):
        close = df.Close[i]
        EMA = ((2/(n+1))*close) + ((1-(2/(n+1)))*prevEMA)
        arr.append(EMA)
        prevEMA = EMA
    return arr

#Calculate EMA with n=12 and n=26

EMA_12 = EMA(data, 12)
EMA_26 = EMA(data, 26)

#add EMA to dataframe 

data['EMA_12'] = EMA_12
data['EMA_26'] = EMA_26

In [11]:
#Function to Classify each day as a 1 or a 0

def clas(df):
    n = len(df)
    arr = []
    for i in range(0,len(df)-1):
        if (100*((df.Close[i+1]-df.Open[i+1])/df.Open[i+1]))>=.3:
            arr.append(1)
        else:
            arr.append(0)
    arr.append('N')
    return arr

clas=clas(data)

#Add Class to our dataframe
data['Class'] = clas

In [12]:
#MACD
# Moving Average of EMA(n) - EMA(m2) for each row
# where n = 12 and m2 = 26
def MACD(df):
    n = 12
    m2 = 26
    arr = []
    arr.append('N')
    ema_12 = EMA(df,n)
    ema_26 = EMA(df,m2)
    m = len(df)
    for i in range(1,m):
        arr.append(ema_12[i] - ema_26[i])
    return arr

MACD = MACD(data)

#Add MACD to our dataframe 
data['MACD_12_26'] = MACD

In [13]:
#SRSI: Stochastic RSI
#SRSI = (RSI_today - min(RSI_past_n)) / (max(RSI_past_n) - min(RSI_past_n))
def SRSI(df,n):
    m = len(df)
    arr = []
    list_RSI = RSI(df,n)
    for i in range(0,n):
        arr.append('N')
    for j in range(n,n+n):
        last_n = list_RSI[n:j]
        if(not(last_n == []) and not(max(last_n) == min(last_n))):
            SRSI = (list_RSI[j] - min(last_n)) / (max(last_n)- min(last_n))
            if SRSI > 1:
                arr.append(1)
            else:
                arr.append(SRSI)
        else:
            arr.append(0)
    for j in range(n+n,m):
        last_n = list_RSI[2*n:j]
        if(not(last_n == []) and not(max(last_n) == min(last_n))):
            SRSI = (list_RSI[j] - min(last_n)) / (max(last_n)- min(last_n))
            if SRSI > 1:
                arr.append(1)
            else:
                arr.append(SRSI)
        else:
            arr.append(0)
    return arr

#Run SRSI for 10, 14, and 30 day periods
SRSI_10 = SRSI(data,10)
SRSI_14 = SRSI(data,14)
SRSI_30 = SRSI(data,30)

#Add SRSI to our dataframe
data['SRSI_10'] = SRSI_10
data['SRSI_14'] = SRSI_14
data['SRSI_30'] = SRSI_30

In [14]:
# calculate Williams %R oscillator for each day

def Williams(df,n):
    m = len(df)
    arr = []
    for i in range(0,n-1):
        arr.append('N')
    for j in range(n-1,m):
        maximum = max(data.High[(j-n+1):j+1])
        minimum = min(data.Low[(j-n+1):j+1])
        val = (-100)*(maximum-df.Close[j])/(maximum-minimum)
        arr.append(val)
    return arr


williams = Williams(data,14)

#Add Williams%R to our dataframe
data['Williams'] = williams

In [15]:
# True Range
# TR = MAX(high[today] - close[yesterday]) - MIN(low[today] - close[yesterday])
def TR(df,n):
    high = df.High[n]
    low = df.Low[n]
    close = df.Close[n-1]
    l_max = list()
    l_max.append(high)
    l_max.append(close)
    l_min = list()
    l_min.append(low)
    l_min.append(close)
    return (max(l_max) - min(l_min))

# Average True Range
# Same as EMA except use TR in lieu of close (prevEMA = TR(dataframe,14days))
def ATR(df,n):
    m = len(df)
    arr = []
    prevEMA = TR(df,n+1)
    for i in range(0,n):
        arr.append('N')
    for j in range(n,m):
        TR_ = TR(df,j)
        EMA = ((2/(n+1))*TR_) + ((1-(2/(n+1)))*prevEMA)
        arr.append(EMA)
        prevEMA = EMA
    return arr

ATR = ATR(data,14)  

#Add ATR to our dataframe
data['ATR_14'] = ATR

In [16]:
# calculate Commodity Channel Index (CCI) for each day

import numpy as np
def CCI(df,n):
    m = len(df)
    arr = []
    tparr = []
    for i in range(0,n-1):
        arr.append('N')
        tp = (df.High[i]+df.Low[i]+df.Close[i])/3
        tparr.append(tp)
    for j in range(n-1,m):
        tp = (df.High[j]+df.Low[j]+df.Close[j])/3
        tparr.append(tp) 
        tps = np.array(tparr[(j-n+1):(j+1)])
        val = (tp-tps.mean())/(0.015*tps.std())
        arr.append(val)
    return arr

cci = CCI(data,20) 

#Add CCI to our dataframe
data['CCI'] = cci

In [17]:
#double check that the dataframe has all 22 features
data.shape

(2801, 22)

In [18]:
#def normalization function to clean data
def normalize(df):
    for column in df:
        df[column]=((df[column]-df[column].mean())/df[column].std())

In [19]:
#def positive values for running Multinomial Naive Bayes
def positivevalues(df):
    for column in df:
        if (df[column].min())<0:
            df[column]=(df[column]-df[column].min())

In [20]:
#Remove the first 30 index which could have a value 'N'
newdata=data.drop(data.index[0:30])

#Remove the last row of data because class has value 'N'
newdata=newdata.drop(newdata.index[-1])

#Remove 'High' and 'Low' columns to improve the algorithm
newdata=newdata.drop(['High','Low'], axis=1)

#Remove our 'Class' column because it acts as y in our algorithms 
newdata=newdata.drop(['Class'], axis=1)

#check the features that remain in our algorithm 
newdata.head()

Unnamed: 0_level_0,Open,Close,Momentum,Return,10 Day ROI,20 Day ROI,30 Day ROI,10_day_RSI,14_day_RSI,30_day_RSI,EMA_12,EMA_26,MACD_12_26,SRSI_10,SRSI_14,SRSI_30,Williams,ATR_14,CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2012-02-15,18.366428,17.77393,0.749643,-0.023142,0.0909271,0.159773,0.210199,80.8632,83.4258,77.5955,17.1681,16.4432,0.724947,0.362332,-11.021,0.0,-34.6824,0.497056,115.624
2012-02-16,17.553572,17.936071,0.322857,0.009122,0.103467,0.174074,0.214711,83.3169,83.8114,77.9239,17.2863,16.5538,0.732507,0.45605,0.0302126,0.0,-29.7651,0.517734,83.952
2012-02-17,17.968214,17.932858,0.310715,-0.000179,0.0923251,0.194671,0.201158,82.0544,82.4845,77.2224,17.3858,16.6559,0.729826,0.407832,-0.0737463,-1.13589,-33.01,0.484275,89.3864
2012-02-21,18.102858,18.387501,0.4375,0.025353,0.109662,0.204581,0.218868,84.0837,84.3961,78.392,17.5399,16.7842,0.755676,0.485338,0.139475,1.0,-15.8207,0.480324,99.8807
2012-02-22,18.324286,18.322857,0.127857,-0.003516,0.0942985,0.220333,0.216513,80.8772,82.9108,77.8469,17.6603,16.8982,0.76216,0.36287,0.0311006,0.53394,-18.3239,0.446852,96.7821


In [21]:
#Normalize the data that we have filtered
normalize(newdata)



In [22]:
#Put the dataframe with our relevant features into X and our class into our y
X=newdata
y=clas[30:-1]


In [23]:
#Split up our test and train by splitting 70%/30%

X_train=X.drop(X.index[1211:])
X_test=X.drop(X.index[0:1211])
y_train=y[0:1211]
y_test=y[1211:]

In [24]:
#Import and run Logistic Regression and run a fit to train the model
from sklearn.linear_model import LogisticRegression

LR=LogisticRegression()
LR.fit(X_train,y_train)

LogisticRegression()

In [25]:
#Predict the y test 
y_pred_LR=LR.predict(X_test)

In [26]:
#Print the accuracy score of our predicted y using metrics from sklearn
from sklearn import metrics
print (metrics.accuracy_score(y_test, y_pred_LR)) 

0.554842847979474


In [27]:
#Import and run Gaussian Naive Bayes and run a fit to train the model
from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB()
GNB.fit(X_train,y_train)

GaussianNB()

In [28]:
#Predict the y test
y_pred=GNB.predict(X_test)

In [29]:
#Print the accuracy score of our predicted y using metrics from sklearn
from sklearn import metrics
print (metrics.accuracy_score(y_test, y_pred)) 

0.5182809493264914


In [30]:


rmse = np.sqrt(np.mean(((y_pred - y_test) ** 2)))
rmse

0.6940598322000119