In [19]:
# Importing libraries
import numpy as np
import pandas as pd
import yfinance as yf
import talib as ta

# Disabling warnings
import warnings
warnings.filterwarnings("ignore")

# Stock ticker
ticker = 'AMZN'

# Downloading stock data from yahoo finance
df = yf.download(ticker, interval="1h", period="730d", progress=False)[['Open', 'High', 'Low', 'Close', 'Volume']]

# Check if any zero volumes are available
indexZeros = df[ df['Volume'] == 0 ].index

# Cleaning data for null values
#df.drop(indexZeros , inplace=True)

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-30 09:30:00,160.25,161.419495,158.002502,158.055511,1106347
2020-11-30 10:30:00,158.030243,158.11879,156.277496,157.288498,692002
2020-11-30 11:30:00,157.324997,157.488495,156.910004,156.998993,362257
2020-11-30 12:30:00,157.006989,158.02449,156.892502,157.792511,354356
2020-11-30 13:30:00,157.834991,158.689499,157.706757,158.511993,326376


In [20]:
# Calculating indicators using TA-Lib
df['MACD'] = ta.MACD(df['Close'])[0]
df['ATR'] = ta.ATR(df['High'], df['Low'], df['Close'], 20)
df['RSI'] = ta.RSI(df['Close'])
df['Average'] = ta.AVGPRICE(df['Open'], df['High'], df['Low'], df['Close']) #midprice
df['MA40'] = ta.SMA(df['Average'], 40)
df['MA80'] = ta.SMA(df['Average'], 80)
df['MA160'] = ta.SMA(df['Average'], 160)

In [21]:
# Number of days to calculate slope
backrolling = 6

# Calculating slope for indicators
df['slopeMA40'] = ta.LINEARREG_SLOPE(df['MA40'], backrolling)
df['slopeMA80'] = ta.LINEARREG_SLOPE(df['MA80'], backrolling)
df['slopeMA160'] = ta.LINEARREG_SLOPE(df['MA160'], backrolling)
df['slopeAverage'] = ta.LINEARREG_SLOPE(df['Average'], backrolling)
df['slopeRSI'] = ta.LINEARREG_SLOPE(df['RSI'], backrolling)
df['slopeMACD'] = ta.LINEARREG_SLOPE(df['MACD'], backrolling)

df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MACD,ATR,RSI,Average,MA40,MA80,MA160,slopeMA40,slopeMA80,slopeMA160,slopeAverage,slopeRSI,slopeMACD
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-10-23 12:30:00,127.330002,127.879997,127.160004,127.419998,4637426,-1.004605,1.244884,47.929611,127.4475,129.273364,129.484587,128.63422,-0.105062,0.001932,-0.072208,0.371964,3.654211,0.059747
2023-10-23 13:30:00,127.455002,127.724998,127.150002,127.3601,4214494,-0.877644,1.211389,47.588036,127.422525,129.217177,129.506556,128.576804,-0.083901,0.009828,-0.067061,0.545897,3.934531,0.10604
2023-10-23 14:30:00,127.370003,127.790001,127.199997,127.349998,4504683,-0.768977,1.18032,47.526523,127.4275,129.119802,129.516198,128.525265,-0.072842,0.016146,-0.061595,0.559502,3.059667,0.132212
2023-10-23 15:30:00,127.339996,127.339996,126.43,126.559998,5673721,-0.738096,1.167304,42.860375,126.917498,128.995708,129.511573,128.496953,-0.077046,0.01621,-0.053855,0.265429,0.516365,0.121125
2023-10-24 09:30:00,127.739998,128.539993,126.900902,128.089996,8309568,-0.583439,1.207939,52.572252,127.817722,128.89512,129.513794,128.47671,-0.088639,0.01164,-0.044334,0.118316,0.310673,0.108049


In [22]:
# Target flexible way
ask = 0.05      #ask
riskRatio = 2   #reward/risk ratio

# Identifying future trend for each day
def mytarget(barsupfront, df1):
    length = len(df1)
    high = list(df1['High'])
    low = list(df1['Low'])
    close = list(df1['Close'])
    open = list(df1['Open'])
    trendcat = [None] * length
    
    for line in range (0,length-barsupfront-2):
        valueOpenLow = 0
        valueOpenHigh = 0
        for i in range(1,barsupfront+2):
            value1 = 1-(low[line+i]/open[line+1])
            value2 = 1-(high[line+i]/open[line+1])
            valueOpenLow = max(value1, valueOpenLow)
            valueOpenHigh = min(value2, valueOpenHigh)

            # Downtrend 
            if ( (valueOpenLow >= ask) and (-valueOpenHigh <= (ask/riskRatio)) ):
                trendcat[line] = 1
                break

            # Uptrend
            elif ( (valueOpenLow <= (ask/riskRatio)) and (-valueOpenHigh >= ask) ):
                trendcat[line] = 2
                break
            
            # No clear trend
            else:
                trendcat[line] = 0
            
    return trendcat

In [23]:
# mytarget(barsfront to take into account, dataframe)
df['mytarget'] = mytarget(21, df)
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MACD,ATR,RSI,Average,MA40,MA80,MA160,slopeMA40,slopeMA80,slopeMA160,slopeAverage,slopeRSI,slopeMACD,mytarget
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-11-30 09:30:00,160.25,161.419495,158.002502,158.055511,1106347,,,,159.431877,,,,,,,,,,0.0
2020-11-30 10:30:00,158.030243,158.11879,156.277496,157.288498,692002,,,,157.428757,,,,,,,,,,0.0
2020-11-30 11:30:00,157.324997,157.488495,156.910004,156.998993,362257,,,,157.180622,,,,,,,,,,0.0
2020-11-30 12:30:00,157.006989,158.02449,156.892502,157.792511,354356,,,,157.429123,,,,,,,,,,0.0
2020-11-30 13:30:00,157.834991,158.689499,157.706757,158.511993,326376,,,,158.18581,,,,,,,,,,0.0


In [24]:
from datetime import datetime
# Attributes to train and test the model
attributes=['ATR', 'RSI', 'MACD', 'Average', 'MA40', 'MA80', 'MA160', 'slopeMA40', 'slopeMA80', 'slopeMA160', 'slopeAverage', 'slopeRSI', 'slopeMACD']

df      = df[['Volume', 'MACD', 'ATR', 'RSI', 'Average', 'MA40', 'MA80', 'MA160', 'slopeMA40', 'slopeMA80', 'slopeMA160', 'slopeAverage', 'slopeRSI', 'slopeMACD', 'mytarget']] 
df_model= df[df.index<datetime.strptime("2022-11-01", "%Y-%m-%d")]
df_test = df[df.index>=datetime.strptime("2022-11-01", "%Y-%m-%d")]

# Separating today's data for prediction
x_today = df_model[attributes][-1:]

# Dropping all rows with null values
df_model=df_model.dropna(inplace=True)

# Creating Input and Output dataset
X_train = df_model[attributes]
y_train = df_model["mytarget"]

X_test = df_test[attributes]
y_test = df_test["mytarget"]

X_test.tail()

TypeError: 'NoneType' object is not subscriptable

## KNN Classification w/ Random sampling 

In [25]:
# KNN Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Splitting training and testing data
#X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
y_test.dropna(inplace=True)
X_test.dropna(inplace=True)

# Training the model
model1 = KNeighborsClassifier(n_neighbors=200, weights='uniform', algorithm='kd_tree', leaf_size=30, p=1, metric='minkowski', metric_params=None, n_jobs=1)
model1.fit(X_train, y_train)

# Predictions
y_pred_train = model1.predict(X_train)
y_pred_test = model1.predict(X_test)

# Accuracy
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Accuracy train: %.2f%%" % (accuracy_train * 100.0))
print("Accuracy test: %.2f%%" % (accuracy_test * 100.0))


print("Frequency:\n", df_model['mytarget'].value_counts()*100/df_model['mytarget'].count())

# Random Model / Gambling
pred_test = np.random.choice([0, 1, 2], len(y_pred_test))
accuracy_test = accuracy_score(y_test, pred_test)
print("Gambler Accuracy: %.2f%%" % (accuracy_test * 100.0))

# mytarget:
# 1: Downtrend
# 2: Uptrend
# 3: No Trend

ValueError: Found input variables with inconsistent numbers of samples: [1687, 1710]