### First load: the dataframe from "jpx-tokyo-stock-exchange-prediction/stock_list.csv"
##### Overview of the unique stocks in the dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the stock metadata table, give the awkwardly named columns short
# aliases, and convert TradeDate to a datetime column.
# NOTE(review): unit='s' interprets TradeDate as epoch seconds — confirm
# against the CSV; a YYYYMMDD-coded column would need format='%Y%m%d'.
column_aliases = {
    'Section/Products': 'Section',
    '33SectorName': 'Sector_one',
    '17SectorName': 'Sector_two',
}
df = pd.read_csv('../../jpx-tokyo-stock-exchange-prediction/stock_list.csv').rename(columns=column_aliases)
df['TradeDate'] = pd.to_datetime(df['TradeDate'], unit='s')

In [None]:
# Summarise the metadata table: number of distinct values in the key
# categorical columns, plus a small sample of the sector names.
unique_counts = {
    column: len(df[column].unique())
    for column in ('SecuritiesCode', 'Section', 'NewMarketSegment', 'Sector_one', 'Sector_two')
}
n_rows, n_cols = df.shape
sector_one_sample = list(df['Sector_one'].unique())[:5]
sector_two_sample = list(df['Sector_two'].unique())[:5]

print(f'We have {unique_counts["SecuritiesCode"]} unique stocks in our dataset all with metadata in 15 additional columns : {n_rows} x {n_cols}.')
print(f'We have {unique_counts["Section"]} unique sections in our dataset.')
print(f'We have {unique_counts["NewMarketSegment"]} unique market segments in our dataset.')
print(f'We have {unique_counts["Sector_one"]} unique 33 Sector Names in our dataset.Some are None values, for example : {sector_one_sample}')
print(f'We have {unique_counts["Sector_two"]} unique 17 Sector Names in our dataset.Some are None values, for example : {sector_two_sample}')

### Loading train_files/stock_prices.csv, the main stock trading data

In [None]:
# Read the daily price table and parse the Date column into datetimes.
df_sp = pd.read_csv(
    '../../jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv'
).assign(Date=lambda prices: pd.to_datetime(prices['Date']))

In [None]:
# Preview the first five rows of the price table.
df_sp.head(n=5)

## Investigate a single stock to see how PACF works

In [None]:
# Inspect one security: plot its closing prices and its partial
# autocorrelation function (PACF) for lags 0..20.
security_code = 1376

# .copy() gives `security` its own data, so the imputation below lands on this
# frame. The previous `security['Close'].fillna(..., inplace=True)` operated on
# a temporary Series taken from a filtered slice of df_sp; with pandas
# Copy-on-Write that leaves `security` unchanged and the NaNs reach pacf().
security = df_sp[df_sp["SecuritiesCode"] == security_code].copy()
print(f'Security {security_code} with number of closing prices : {security.shape[0]} and missing values : {security.Close.isnull().sum()}')

# Fill missing closing prices with the mean close of this security.
security['Close'] = security['Close'].fillna(security['Close'].mean())
security['Close'].plot(figsize=(16,4),legend=True)

# PACF estimates together with their 95% confidence intervals (alpha=0.05).
correlations, conf_intervals = sm.tsa.stattools.pacf(security['Close'], nlags=20, method='ywm', alpha=0.05)
# Lag 0 is the series correlated with itself, so it is always 1; keep lags 1..20 only.
correl = correlations.tolist()[1:]
sm.graphics.tsa.plot_pacf(security['Close'], method="ywm",lags=20,alpha=0.05)
# Optional ACF plot, left disabled:
# sm.graphics.tsa.plot_acf(security['Close'],lags=100)
plt.show()


In [None]:
# Display the PACF values for the single security (lag-0 entry already dropped).
correl

## Bootstrap betas from the first 1000 stocks

In [None]:
# Calculate the PACF for the first 1000 stocks with 20 lags.
# correlation_matrix has one row per stock and one column per lag 1..20
# (the lag-0 value, which is always 1, is dropped).
n_stocks = 1000
n_lags = 20
# unique() keeps first-appearance order; compute it once instead of twice.
stock_codes = df_sp.SecuritiesCode.unique()[:n_stocks]
correlation_matrix = np.zeros((len(stock_codes), n_lags))
# NOTE: the loop variable rebinds `security`, which the single-stock cell
# uses for a DataFrame; re-run that cell before reusing `security` there.
for i, security in enumerate(stock_codes):
    # .copy() + plain assignment: the former chained
    # `stock['Close'].fillna(..., inplace=True)` acted on a temporary Series
    # and does not update `stock` under pandas Copy-on-Write, letting NaNs
    # flow into pacf().
    stock = df_sp[df_sp["SecuritiesCode"] == security].copy()
    # Fill missing closing prices with this stock's mean close.
    stock['Close'] = stock['Close'].fillna(stock['Close'].mean())
    correlations, conf_intervals = sm.tsa.stattools.pacf(stock['Close'], nlags=n_lags, method='ywm', alpha=0.05)
    correlation_matrix[i, :] = correlations[1:]
    

In [None]:
# Bootstrap the mean and the standard deviation of one PACF coefficient
# across stocks (95% confidence level) and plot both bootstrap distributions.
from scipy.stats import bootstrap
import seaborn as sns
rng = np.random.default_rng()

# Column index 1 of correlation_matrix, i.e. the lag-2 coefficient (column 0
# is lag 1 because lag 0 was dropped). bootstrap() takes its samples wrapped
# in a sequence, hence the one-element tuple.
# NOTE(review): the heading mentions "each betas" but only this single column
# is bootstrapped — confirm that lag 2 is the intended one.
data = (correlation_matrix[:,1],)

res_mean = bootstrap(data, np.mean, confidence_level=0.95, random_state=rng)
res_std = bootstrap(data, np.std, confidence_level=0.95, random_state=rng)

# One panel per statistic, side by side.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    (res_mean, 'blue', 'Bootstrap Distribution of Mean', 'Mean Value'),
    (res_std, 'green', 'Bootstrap Distribution of Standard Deviation', 'Standard Deviation Value'),
)
for ax, (result, colour, title, x_label) in zip(axs, panels):
    # Histogram with a KDE overlay of the resampled statistic.
    sns.histplot(result.bootstrap_distribution, bins=25, kde=True, color=colour, alpha=0.7, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Display the plots
plt.tight_layout()
plt.show()
