In [6]:
import pandas as pd
import numpy as np
import yfinance as yf

import glob

In [7]:
start_date = "2020-01-01"
end_date = "2020-12-01"

In [13]:
# jpn_daily = yf.download("^N225", start=start_date, end=end_date) # Japan 225

In [91]:
# TODO: download daily open and close for each index

#jpn_opens = pd.read_csv("datasets_stocks/....csv")
#jpn_closes = pd.read_csv("datasets_stocks/....csv")

intraday_root = "datasets_stocks/intraday"

jpn_path =f"{intraday_root}/JPNIDX"
us_path = f"{intraday_root}/SP500"
che_path = f"{intraday_root}/CHEIDX"
chi_path = f"{intraday_root}/CHIIDX"

In [63]:
def process_intraday_data(path):

    df = pd.DataFrame()

    for file in sorted(glob.glob(f"{path}/*.json")):
        df_month = pd.read_json(file)
        df = pd.concat([df, df_month])

    df = df.set_index("timestamp")
    
    dates = sorted(list(set(df.index.date)))
    dates = sorted(list(set(df.index.date)))
    df_open = pd.DataFrame(index=dates, columns=["price"])
    df_close = pd.DataFrame(index=dates, columns=["price"])
    
    for date in dates:
        sub_df = df[df.index==date]
        df_open.at[date, "price"] = df.head(1).close.item()
        df_close.at[date, "price"] = df.tail(1).close.item()
        
    return df, df_open, df_close

In [92]:
jpn, jpn_opens, jpn_closes = process_intraday_data(jpn_path)
us, us_opens, us_closes = process_intraday_data(us_path)
che, che_opens, che_closes = process_intraday_data(che_path)
chi, chi_opens, chi_closes = process_intraday_data(chi_path)

In [87]:
def calculate_annualised_rv(opens, closes, intraday): # rv = realised variance
    
    opens = opens.astype(np.float64)
    closes = closes.astype(np.float64)
    # 1. calculate j_t: diff between closing on day t-1 and opening on day t
    
    # j_t = 100*(ln(P_t_open) - ln(P_{t-1}_close))
    j = 100*np.log(opens.iloc[1:] / closes.shift(1).iloc[1:])
    #j = j[j.index <= "2020-12-01"] # truncate
    
    # 2. continuous return within a day
    # r_{i,t} = 100*(ln(P_{i,t}) - ln(P_{i,t-1}))
    # we want R_t = sum(r_{i,t}^2)

    intraday.index = pd.DatetimeIndex(intraday.index)
    dates_avail = sorted(list(set(intraday.index.date)),reverse=True) # <= unique dates available in data
    R = pd.DataFrame(index=dates_avail, columns=["R"])

    for date in dates_avail:
        prices = intraday[intraday.index.date == date].copy()["close"] # prices observed on this date
        rs = 100*np.log(prices.iloc[1:]/prices.shift(1).iloc[1:])# all the rs calculated on this date
        R.loc[date]["R"] = sum(rs**2)
    R.index = pd.DatetimeIndex(R.index)   
    
    # 3. Annualised daily variance RV (return variance)
    # RV_t = 252*(j_t^2 + R_t)

    merged = pd.merge(j, R, how="inner", left_index=True, right_index=True)
    merged.columns = ["j", "R"]

    merged["RV"] = 252*(merged["j"]**2 + merged["R"])
    
    return merged

In [93]:
jpn_rvs = calculate_annualised_rv(jpn_opens, jpn_closes, jpn)
us_rvs = calculate_annualised_rv(us_opens, us_closes, us)
che_rvs = calculate_annualised_rv(che_opens, che_closes, che)
chi_rvs = calculate_annualised_rv(chi_opens, chi_closes, chi)

In [94]:
jpn_rvs.head(20)

Unnamed: 0,j,R,RV
2020-11-30,-12.633936,1.62648,40633.2
2020-11-29,-12.633936,0.0823478,40244.1
2020-11-27,-12.633936,0.769584,40417.3
2020-11-26,-12.633936,0.592744,40372.7
2020-11-25,-12.633936,1.58137,40621.8
2020-11-24,-12.633936,0.949023,40462.5
2020-11-23,-12.633936,0.884451,40446.2
2020-11-22,-12.633936,0.0442057,40234.5
2020-11-20,-12.633936,0.661861,40390.1
2020-11-19,-12.633936,1.09734,40499.8


In [97]:
names = ["jpn_rvs", "us_rvs", "che_rvs", "chi_rvs"]
dfs_rvs = [jpn_rvs, us_rvs, che_rvs, chi_rvs]

for i in range(len(names)):
    dfs_rvs[i].to_csv(f"./computed/realised_variance/{names[i]}.csv")