In [39]:
# import required libraries
import pandas as pd
import numpy as np
import requests as rq
import plotly.graph_objects as go
import time
import datetime
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from joblib import dump

In [2]:
# Define your stock and time period
start_timestamp = '2000-01-01'
end_timestamp = '2025-01-01'
symbol = 'AAPL'

# now convert time into unix format
def to_unix_timestamp(date_str):
    dt = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return int(time.mktime(dt.timetuple()))

# now define the periods
period1 = to_unix_timestamp(start_timestamp)
period2 = to_unix_timestamp(end_timestamp)

# Build the URL
url = f"https://query1.finance.yahoo.com/v8/finance/chart/{symbol}"

params = {
    "period1": period1,
    "period2": period2,
    "interval": "1d",
    "events": "history",
    "includeAdjustedClose": "true"
}

headers = {
    "User-Agent": "Mozilla/5.0"
}

# Send request
response = rq.get(url, headers=headers, params=params)

# Process response
if response.status_code == 200:
    data = response.json()
    timestamps = data['chart']['result'][0]['timestamp']
    indicators = data['chart']['result'][0]['indicators']['quote'][0]
    
    dataset = pd.DataFrame({
        'Datetime': pd.to_datetime(timestamps, unit='s'),
        'Open': indicators['open'],
        'High': indicators['high'],
        'Low': indicators['low'],
        'Close': indicators['close'],
        'Volume': indicators['volume']
    })
    
    print(dataset.head())
    
else:
    print(f"Failed to fetch data {response.status_code}")

             Datetime      Open      High       Low     Close     Volume
0 1999-12-31 14:30:00  0.901228  0.918527  0.888393  0.917969  163811200
1 2000-01-03 14:30:00  0.936384  1.004464  0.907924  0.999442  535796800
2 2000-01-04 14:30:00  0.966518  0.987723  0.903460  0.915179  512377600
3 2000-01-05 14:30:00  0.926339  0.987165  0.919643  0.928571  778321600
4 2000-01-06 14:30:00  0.947545  0.955357  0.848214  0.848214  767972800


In [3]:
# find dataset
dataset.head()

Unnamed: 0,Datetime,Open,High,Low,Close,Volume
0,1999-12-31 14:30:00,0.901228,0.918527,0.888393,0.917969,163811200
1,2000-01-03 14:30:00,0.936384,1.004464,0.907924,0.999442,535796800
2,2000-01-04 14:30:00,0.966518,0.987723,0.90346,0.915179,512377600
3,2000-01-05 14:30:00,0.926339,0.987165,0.919643,0.928571,778321600
4,2000-01-06 14:30:00,0.947545,0.955357,0.848214,0.848214,767972800


In [4]:
# find shape of dataset
dataset.shape

(6290, 6)

In [5]:
# check for any missing or null values
dataset.isnull().sum()

Datetime    0
Open        0
High        0
Low         0
Close       0
Volume      0
dtype: int64

In [6]:
# as there are no values which is a good thing
# now check the datatypes
dataset.dtypes

Datetime    datetime64[ns]
Open               float64
High               float64
Low                float64
Close              float64
Volume               int64
dtype: object

In [7]:
# Feature engineering


In [8]:
# scaling and normalization


In [9]:
# encoding for time series data, specifically for models like LSTM


In [10]:
# now i just want to predict using simple data and simple models
# lets drop the time column as we do not need to predict the volume
final_dataset = dataset.drop(columns = ['Datetime'], axis = 1)

In [11]:
final_dataset.head(5)

Unnamed: 0,Open,High,Low,Close,Volume
0,0.901228,0.918527,0.888393,0.917969,163811200
1,0.936384,1.004464,0.907924,0.999442,535796800
2,0.966518,0.987723,0.90346,0.915179,512377600
3,0.926339,0.987165,0.919643,0.928571,778321600
4,0.947545,0.955357,0.848214,0.848214,767972800


In [24]:
# split the dataset into depedent and independent
x = final_dataset.iloc[:, :-1]
y = final_dataset.iloc[:, -1]
y

0       163811200
1       535796800
2       512377600
3       778321600
4       767972800
          ...    
6285     23234700
6286     27237100
6287     42355300
6288     35557500
6289     39480700
Name: Volume, Length: 6290, dtype: int64

In [25]:
# now split into training and testing

# specify split index
split_index = int(len(final_dataset)*0.85)

# split x into testing and training
x_train = x.iloc[:split_index, :]
x_test = x.iloc[split_index:, :]

# now split y into testing and training
y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]
print(x_train.shape)
print(y_train.shape)

(5346, 4)
(5346,)


In [34]:
# now apply knn model and predict the outcome

# model
knn_model = KNeighborsRegressor(n_neighbors = 29, metric='euclidean', weights='distance')

# fit data
knn_model.fit(x_train, y_train)

In [35]:
# predict the output
y_pred = knn_model.predict(x_test)

In [36]:
# check accuracy
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
score_r2 = r2_score(y_test, y_pred)

print(mae)
print(mse)
print(score_r2)

42625223.75181042
2255179784502675.5
-1.7440839388207752


In [37]:
# hyperparameter tunning

# set parameters
params = {
    'n_neighbors': np.arange(1, 30),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsRegressor()
grid_search = GridSearchCV(knn, params, cv = 5, scoring = 'r2')
grid_search.fit(x_train, y_train)

print('Best score: ', grid_search.best_score_)
print('Best params: ', grid_search.best_params_)

Best score:  -0.8187832623100324
Best params:  {'metric': 'euclidean', 'n_neighbors': 29, 'weights': 'distance'}


In [40]:
# even though this model is not performing well, lets save it
dump(knn_model, 'knn_model.joblib')

['knn_model.joblib']