## Startup

In [2]:
import numpy as np
import pandas as pd
import pandas_datareader as web
import matplotlib.pyplot as plt

import os
import pickle

In [3]:
import yfinance as yf
yf.pdr_override()

In [4]:
np.random.seed(42)

In [5]:
dataroute=os.path.join("..",  "data")
resultsroute=os.path.join("..",  "results")

## Data Retrieval

In [5]:
tickerlist=["^MERV", 
            "GGAL", "GGAL.BA", 
            "YPF", "YPFD.BA",
            "EDN", "EDN.BA",
            "BMA", "BMA.BA"] 
# sumar tamb BBAR/BBAR? TEO/TECO2?

In [6]:
factordict={"GGAL": 10, "YPF":1, "EDN":20, "BMA":10, "BBAR":3, "TEO":5}

In [7]:
stocks=tickerlist.copy()
stocks.remove("^MERV")
stocklist=[]

for i in range(0, len(stocks), 2):
    stocklist.append((stocks[i], stocks[i+1]))
del stocks
stocklist

In [8]:
ohlclist=["Open", "High", "Low", "Close"]

In [9]:
objectlist=[]

for ticker in tickerlist:
    objectlist.append(yf.Ticker(ticker))    

In [10]:
# get historical market data
data={}
start='2013-01-01'
end="2023-06-01"

In [11]:
name=f'dataset_{start}_{end}.pickle'
filename=os.path.join(dataroute, name)

In [12]:
if not os.path.exists(filename):
    for ticker in objectlist:
        # descargo data en un diccionario[ticker]
        data[ticker.ticker] = ticker.history(start=start, end=end)
        # guardo en un pickle
    with open(filename, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
else:
    with open(filename, 'rb') as handle:
        data=pickle.load(handle)

## Data quality deletion

In [13]:
data_quality_dates=["2022-07-14"]

In [14]:
for ticker in tickerlist:
    data[ticker]=data[ticker].loc[~data[ticker].index.isin(pd.to_datetime(data_quality_dates))]

## Implicit USD calculation

In [15]:
def _reindex_refill_dfs(df1, df2):
    """
    The function returns two dataframes with an index as the union of the two.
    The dataframes are then forward filled.
    """
    index3=df1.index.union(df2.index)
    # reindex both con index3
    df3=df1.reindex(index3)
    df4=df2.reindex(index3)
    # fillna con previous value
    df3.fillna(method="ffill")
    df4.fillna(method="ffill")
    return df3, df4

In [16]:
def calculate_usd(usd_df, ars_df, conversion_factor):
    """
    The function returns a dataframe with an index the size of the union between the two.
    Missing values in dates (stemming from, for example, holidays in one country) are
    forward filled to create the last  
    """
    usd_df_r, ars_df_r = _reindex_refill_dfs(usd_df, ars_df)
    implicit_usd = ars_df_r.divide(usd_df_r)*conversion_factor
    return implicit_usd

In [17]:
usdlist=[]
for stocktuplo in stocklist:
    us, ba = stocktuplo
    usdlist.append(f"USD_{us}")
    data[f"USD_{us}"]=calculate_usd(data[us][ohlclist], data[ba][ohlclist], factordict[us])
    data[f"USD_{us}"]["Average"]=data[f"USD_{us}"].mean(axis=1)

In [18]:
data["USD"]=pd.DataFrame(columns=ohlclist)

for i in ohlclist:
    df=pd.concat([data[col][i] for col in usdlist], axis=1)
    data["USD"][i]=df.mean(axis=1)
    
data["USD"]["Average"]=data["USD"].mean(axis=1)

In [19]:
for key in data.keys():
    data[key].fillna(method="ffill", inplace=True)
    # revisar esto

In [20]:
data["USD"][[*ohlclist, "Average"]].plot(figsize=(10,10), logy=True, grid=True)

## USD Denominated Index

In [21]:
data["USD_^MERV"]=pd.DataFrame(columns=ohlclist)

for col in ohlclist:
    data["USD_^MERV"][col] = data["^MERV"][col]/data["USD"]["Average"]

In [22]:
data["USD_^MERV"].fillna(method="ffill", inplace=True)

## Intraday Volatility

Vamos a usar para medir intraday volatility el estimador de Garman and Klass (1980):

$$V_{ohlc}=0.5*[log(H)-log(L)]^2+(2*log(2)-1)*[log(C)-log(O)]^2$$ 
Donde H es el precio mas alto del día, L el bajo, C el cierre y O su apertura

Garman, M. B. and M. J. Klass (1980). On the estimation of security price volatilities from historical data. Journal of Business 53, 67–78.

In [23]:
def gk_vol(o, h, l, c):
    "Returns Garman Klass (1980) intraday volatility estimator"
    return 0.5*(np.log(h)-np.log(l))**2+(2*np.log(2)-1)*(np.log(c)-np.log(o))**2

## Returns Calculation

In [24]:
for ticker in data.keys():
    view=data[ticker]
    view["rets"] = view["Close"]/view["Close"].shift()-1
    view["log_rets"] = np.log(view["Close"]/view["Close"].shift())
    view["norm_range"] = (view["High"]-view["Low"])/view["Open"]
    # chequear si esto tiene asidero
    # alternativa (view["High"]-view["Low"])/view["Close"]
    view["gk_vol"] = gk_vol(o=view["Open"], h=view["High"], l=view["Low"], c=view["Close"])
    # delete first observation to eliminate nans
    data[ticker]=data[ticker][1:].copy()

## Save dataset

In [25]:
processedname="processed_"+name
with open(os.path.join(dataroute, processedname), 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Process into single dataframe, matching dates and forward filling
Véase https://github.com/alfsn/regime-switching-hmm/issues/9

In [10]:
df=pd.DataFrame()

for key, value in data.items():
    for column in ["rets", "log_rets", "gk_vol"]:
        df[key+"_"+column]=value[column]

In [18]:
df.loc[df.isna().any(axis=1), df.isna().any(axis=0)]

Unnamed: 0_level_0,GGAL_rets,GGAL_log_rets,GGAL_gk_vol,GGAL.BA_rets,GGAL.BA_log_rets,GGAL.BA_gk_vol,YPF_rets,YPF_log_rets,YPF_gk_vol,YPFD.BA_rets,...,EDN.BA_gk_vol,BMA_rets,BMA_log_rets,BMA_gk_vol,BMA.BA_rets,BMA.BA_log_rets,BMA.BA_gk_vol,USD_EDN_rets,USD_EDN_log_rets,USD_EDN_gk_vol
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-10,0.011817,0.011748,0.000385,,,,-0.005822,-0.005839,0.000326,,...,,-0.007131,-0.007156,0.000449,,,,0.000000,0.000000,0.001563
2013-01-21,,,,0.014583,0.014478,0.000131,,,,0.044843,...,0.000634,,,,0.015385,0.015267,0.000122,0.000000,0.000000,0.001101
2013-02-18,,,,0.000000,0.000000,0.000000,,,,0.000000,...,0.000000,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.002106
2013-04-10,-0.001686,-0.001688,0.000545,0.000000,0.000000,0.000000,-0.040645,-0.041494,0.003459,0.000000,...,0.000000,-0.001296,-0.001297,0.000357,,,,-0.013453,-0.013544,0.000000
2013-04-19,0.011342,0.011279,0.006057,0.000000,0.000000,0.000000,0.003159,0.003154,0.000309,0.000000,...,0.000000,0.017065,0.016921,0.000759,,,,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-24,,,,0.010238,0.010185,0.000133,,,,0.026714,...,0.000630,,,,0.028935,0.028524,0.000454,0.000000,0.000000,0.000285
2022-12-26,,,,0.011345,0.011281,0.001117,,,,0.015120,...,0.000763,,,,0.013233,0.013146,0.002052,0.000000,0.000000,0.000004
2023-01-02,,,,0.015663,0.015541,0.000362,,,,0.039941,...,0.004389,,,,0.030126,0.029681,0.001014,0.000000,0.000000,0.000042
2023-01-16,,,,0.018600,0.018429,0.000974,,,,0.040827,...,0.001625,,,,0.036447,0.035799,0.003793,0.000000,0.000000,0.000011


In [19]:
df.fillna(0, inplace=True)

In [20]:
finaldfname="finaldf_"+name
with open(os.path.join(dataroute, finaldfname), 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)