Basic setup

In [20]:
import pandas as pd
import yfinance as yf
import numpy as np

Import Apple dataset and look at them.

In [21]:
appl = yf.Ticker('AAPL')
appl.info

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '408 996 1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'industryKey': 'consumer-electronics',
 'industryDisp': 'Consumer Electronics',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and pod

In [22]:
hist = appl.history(period='1y')

appl.history_metadata

{'currency': 'USD',
 'symbol': 'AAPL',
 'exchangeName': 'NMS',
 'fullExchangeName': 'NasdaqGS',
 'instrumentType': 'EQUITY',
 'firstTradeDate': 345479400,
 'regularMarketTime': 1716580800,
 'hasPrePostMarketData': True,
 'gmtoffset': -14400,
 'timezone': 'EDT',
 'exchangeTimezoneName': 'America/New_York',
 'regularMarketPrice': 189.98,
 'fiftyTwoWeekHigh': 190.58,
 'fiftyTwoWeekLow': 188.04,
 'regularMarketDayHigh': 190.58,
 'regularMarketDayLow': 188.04,
 'regularMarketVolume': 35941665,
 'chartPreviousClose': 171.84,
 'priceHint': 2,
 'currentTradingPeriod': {'pre': {'timezone': 'EDT',
   'start': 1716537600,
   'end': 1716557400,
   'gmtoffset': -14400},
  'regular': {'timezone': 'EDT',
   'start': 1716557400,
   'end': 1716580800,
   'gmtoffset': -14400},
  'post': {'timezone': 'EDT',
   'start': 1716580800,
   'end': 1716595200,
   'gmtoffset': -14400}},
 'dataGranularity': '1d',
 'range': '1y',
 'validRanges': ['1d',
  '5d',
  '1mo',
  '3mo',
  '6mo',
  '1y',
  '2y',
  '5y',
  '1

In [23]:
end_date = pd.Timestamp.now()
start_date = end_date - pd.DateOffset(years=1)

apple_data = appl.history(start = start_date, end = end_date)

apple_data.head()


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-05-25 00:00:00-04:00,171.499219,172.981338,170.783021,172.076157,56058300,0.0,0.0
2023-05-26 00:00:00-04:00,172.404411,174.841466,172.195514,174.50325,54835000,0.0,0.0
2023-05-30 00:00:00-04:00,176.025173,178.044447,175.637233,176.363373,55964400,0.0,0.0
2023-05-31 00:00:00-04:00,176.393239,178.402572,175.826243,176.31366,99625300,0.0,0.0
2023-06-01 00:00:00-04:00,176.761237,179.168451,175.995301,179.138611,68901800,0.0,0.0


In [24]:
apple_data.reset_index(inplace=True)
apple_data['Date'] = pd.to_datetime(apple_data['Date'])

apple_data['Day_of_Week'] = apple_data['Date'].dt.dayofweek
apple_data['Day_of_Month'] = apple_data['Date'].dt.day
apple_data['Month'] = apple_data['Date'].dt.month
apple_data['Day_of_Year'] = apple_data['Date'].dt.dayofyear

We have got the data set of Apple in the past year. Now we want to generate a dataset of missing values in 5 days.

In [25]:
random_state = 27
np.random.seed(random_state)


missing_dates = np.random.choice(apple_data.index, size=5, replace=False)
apple_missing = apple_data.copy()
apple_missing.loc[missing_dates, 'Close'] = np.nan

Let us look at this new dataset. 

In [26]:
apple_missing.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Day_of_Week,Day_of_Month,Month,Day_of_Year
0,2023-05-25 00:00:00-04:00,171.499219,172.981338,170.783021,172.076157,56058300,0.0,0.0,3,25,5,145
1,2023-05-26 00:00:00-04:00,172.404411,174.841466,172.195514,174.50325,54835000,0.0,0.0,4,26,5,146
2,2023-05-30 00:00:00-04:00,176.025173,178.044447,175.637233,176.363373,55964400,0.0,0.0,1,30,5,150
3,2023-05-31 00:00:00-04:00,176.393239,178.402572,175.826243,176.31366,99625300,0.0,0.0,2,31,5,151
4,2023-06-01 00:00:00-04:00,176.761237,179.168451,175.995301,179.138611,68901800,0.0,0.0,3,1,6,152


In [27]:
len(apple_missing)

252

In [28]:
apple_missing.dtypes

Date            datetime64[ns, America/New_York]
Open                                     float64
High                                     float64
Low                                      float64
Close                                    float64
Volume                                     int64
Dividends                                float64
Stock Splits                             float64
Day_of_Week                                int32
Day_of_Month                               int32
Month                                      int32
Day_of_Year                                int32
dtype: object

In [29]:
missing_index = apple_missing[apple_missing['Close'].isna()].index
print(missing_index)


Index([10, 95, 149, 178, 205], dtype='int64')


In [30]:
data_non_missing = apple_missing.dropna(subset = ['Close'])
len(data_non_missing)

247

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [32]:
X = data_non_missing[['Open', 'Day_of_Week', 'Day_of_Month', 'Month', 'Day_of_Year']]
y = data_non_missing['Close']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

In [33]:
model = LinearRegression()
model.fit(X_train,y_train)

In [34]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 3.1783488596365164


In [38]:
data_missing = apple_missing[apple_missing['Close'].isnull()]
X_missing = data_missing[['Open', 'Day_of_Week', 'Day_of_Month', 'Month', 'Day_of_Year']]

predicted_close = model.predict(X_missing)


close_missing = apple_data.loc[missing_dates, 'Close']

print(predicted_close)
print(close_missing)

[180.66456434 177.84500158 193.46169268 188.26796436 175.48467107]
95     179.091675
178    188.594208
10     180.004059
149    193.071426
205    178.427994
Name: Close, dtype: float64


In [36]:
mse_close = mean_squared_error(predicted_close, close_missing)
print(f'Mean Squared Error: {mse_close}')

Mean Squared Error: 66.17274396138502
