In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import yfinance as yf
import talib as ta
from datetime import datetime, date

# Disabling warnings
import warnings
warnings.filterwarnings("ignore")

# Stock ticker
ticker = 'AMZN'

# Downloading stock data from yahoo finance
df = yf.download(ticker, start="2021-11-01", end=datetime.today(), period="1h", progress=False)[['Open', 'High', 'Low', 'Close', 'Volume']]

# Check if any zero volumes are available
indexZeros = df[ df['Volume'] == 0 ].index

# Cleaning data for null values
#df.drop(indexZeros , inplace=True)

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-11-01,168.089996,168.792999,164.600998,165.905502,72178000
2021-11-02,165.750504,166.556,164.177505,165.637497,52552000
2021-11-03,165.449997,169.746002,164.876007,169.199997,67944000
2021-11-04,168.5,174.931503,168.25,173.850006,107060000
2021-11-05,173.850006,178.3125,173.848999,175.949493,99940000


In [2]:
# Calculating indicators using TA-Lib
df['MACD'] = ta.MACD(df['Close'])[0]
df['ATR'] = ta.ATR(df['High'], df['Low'], df['Close'], 20)
df['RSI'] = ta.RSI(df['Close'])
df['Average'] = ta.AVGPRICE(df['Open'], df['High'], df['Low'], df['Close']) #midprice
df['MA40'] = ta.SMA(df['Average'], 40)
df['MA80'] = ta.SMA(df['Average'], 80)
df['MA160'] = ta.SMA(df['Average'], 160)

In [3]:
# Number of days to calculate slope
backrolling = 6

# Calculating slope for indicators
df['slopeMA40'] = ta.LINEARREG_SLOPE(df['MA40'], backrolling)
df['slopeMA80'] = ta.LINEARREG_SLOPE(df['MA80'], backrolling)
df['slopeMA160'] = ta.LINEARREG_SLOPE(df['MA160'], backrolling)
df['slopeAverage'] = ta.LINEARREG_SLOPE(df['Average'], backrolling)
df['slopeRSI'] = ta.LINEARREG_SLOPE(df['RSI'], backrolling)
df['slopeMACD'] = ta.LINEARREG_SLOPE(df['MACD'], backrolling)

df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MACD,ATR,RSI,Average,MA40,MA80,MA160,slopeMA40,slopeMA80,slopeMA160,slopeAverage,slopeRSI,slopeMACD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-09-19,138.699997,138.839996,135.559998,137.630005,61482500,1.779987,3.363401,49.789945,137.682499,136.113125,131.775594,116.819594,0.243389,0.318543,0.27112,-0.890572,-2.826545,0.000117
2023-09-20,138.550003,139.369995,135.199997,135.289993,46263700,1.301422,3.40373,45.512965,137.102497,136.31225,132.043094,117.037766,0.230411,0.304632,0.253488,-1.465358,-4.298499,-0.18331
2023-09-21,131.940002,132.240005,129.309998,129.330002,70234800,0.436205,3.532544,36.834181,130.705002,136.393313,132.196688,117.205985,0.199923,0.271132,0.22803,-2.461714,-5.097082,-0.382185
2023-09-22,131.110001,132.029999,128.520004,129.119995,59859500,-0.263395,3.531416,36.569566,130.195,136.40025,132.302844,117.322766,0.151862,0.22588,0.199177,-2.485357,-4.337648,-0.524618
2023-09-25,129.360001,131.779999,128.770004,131.270004,45964600,-0.637002,3.505345,41.224891,130.295002,136.381125,132.421407,117.480938,0.093436,0.180354,0.175525,-2.262071,-3.287907,-0.594677


In [4]:
# Target flexible way
ask = 0.05      #ask
riskRatio = 2   #reward/risk ratio

# Identifying future trend for each day
def mytarget(barsupfront, df1):
    length = len(df1)
    high = list(df1['High'])
    low = list(df1['Low'])
    close = list(df1['Close'])
    open = list(df1['Open'])
    trendcat = [None] * length
    
    for line in range (0,length-barsupfront-2):
        valueOpenLow = 0
        valueOpenHigh = 0
        for i in range(1,barsupfront+2):
            value1 = 1-(low[line+i]/open[line+1])
            value2 = 1-(high[line+i]/open[line+1])
            valueOpenLow = max(value1, valueOpenLow)
            valueOpenHigh = min(value2, valueOpenHigh)

            # Downtrend 
            if ( (valueOpenLow >= ask) and (-valueOpenHigh <= (ask/riskRatio)) ):
                trendcat[line] = 1
                break

            # Uptrend
            elif ( (valueOpenLow <= (ask/riskRatio)) and (-valueOpenHigh >= ask) ):
                trendcat[line] = 2
                break
            
            # No clear trend
            else:
                trendcat[line] = 0
            
    return trendcat

In [5]:
# mytarget(barsfront to take into account, dataframe)
df['mytarget'] = mytarget(21, df)
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MACD,ATR,RSI,Average,MA40,MA80,MA160,slopeMA40,slopeMA80,slopeMA160,slopeAverage,slopeRSI,slopeMACD,mytarget
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-11-01,168.089996,168.792999,164.600998,165.905502,72178000,,,,166.847374,,,,,,,,,,2.0
2021-11-02,165.750504,166.556,164.177505,165.637497,52552000,,,,165.530376,,,,,,,,,,2.0
2021-11-03,165.449997,169.746002,164.876007,169.199997,67944000,,,,167.318001,,,,,,,,,,2.0
2021-11-04,168.5,174.931503,168.25,173.850006,107060000,,,,171.382877,,,,,,,,,,2.0
2021-11-05,173.850006,178.3125,173.848999,175.949493,99940000,,,,175.49025,,,,,,,,,,2.0


In [6]:
# Attributes to train and test the model
attributes=['ATR', 'RSI', 'MACD', 'Average', 'MA40', 'MA80', 'MA160', 'slopeMA40', 'slopeMA80', 'slopeMA160', 'slopeAverage', 'slopeRSI', 'slopeMACD']

df      = df[['Volume', 'MACD', 'ATR', 'RSI', 'Average', 'MA40', 'MA80', 'MA160', 'slopeMA40', 'slopeMA80', 'slopeMA160', 'slopeAverage', 'slopeRSI', 'slopeMACD', 'mytarget']] 
df_model= df[df.index<datetime.strptime("2022-11-01", "%Y-%m-%d")]
df_test = df[df.index>=datetime.strptime("2022-11-01", "%Y-%m-%d")]

# Separating today's data for prediction
x_today = df_model[attributes][-1:]

# Dropping all rows with null values
df_model=df_model.dropna()

# Creating Input and Output dataset
X_train = df_model[attributes]
y_train = df_model["mytarget"]

X_test = df_test[attributes]
y_test = df_test["mytarget"]

X_test.tail()

Unnamed: 0_level_0,ATR,RSI,MACD,Average,MA40,MA80,MA160,slopeMA40,slopeMA80,slopeMA160,slopeAverage,slopeRSI,slopeMACD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-09-19,3.363401,49.789945,1.779987,137.682499,136.113125,131.775594,116.819594,0.243389,0.318543,0.27112,-0.890572,-2.826545,0.000117
2023-09-20,3.40373,45.512965,1.301422,137.102497,136.31225,132.043094,117.037766,0.230411,0.304632,0.253488,-1.465358,-4.298499,-0.18331
2023-09-21,3.532544,36.834181,0.436205,130.705002,136.393313,132.196688,117.205985,0.199923,0.271132,0.22803,-2.461714,-5.097082,-0.382185
2023-09-22,3.531416,36.569566,-0.263395,130.195,136.40025,132.302844,117.322766,0.151862,0.22588,0.199177,-2.485357,-4.337648,-0.524618
2023-09-25,3.505345,41.224891,-0.637002,130.295002,136.381125,132.421407,117.480938,0.093436,0.180354,0.175525,-2.262071,-3.287907,-0.594677


## KNN Classification w/ Random sampling 

In [7]:
# KNN Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Splitting training and testing data
#X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Training the model
model1 = KNeighborsClassifier(n_neighbors=200, weights='uniform', algorithm='kd_tree', leaf_size=30, p=1, metric='minkowski', metric_params=None, n_jobs=1)
model1.fit(X_train, y_train)

# Predictions
y_pred_train = model1.predict(X_train)
y_pred_test = model1.predict(X_test)

# Accuracy
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Accuracy train: %.2f%%" % (accuracy_train * 100.0))
print("Accuracy test: %.2f%%" % (accuracy_test * 100.0))


print("Frequency:\n", df_model['mytarget'].value_counts()*100/df_model['mytarget'].count())

# Random Model / Gambling
pred_test = np.random.choice([0, 1, 2], len(y_pred_test))
accuracy_test = accuracy_score(y_test, pred_test)
print("Gambler Accuracy: %.2f%%" % (accuracy_test * 100.0))

# mytarget:
# 1: Downtrend
# 2: Uptrend
# 3: No Trend

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 88, n_neighbors = 200