In [13]:
from typing import Optional, List


import pandas as pd
import xarray as xr
xr.set_options(keep_attrs=True,
               display_expand_data=False)

import yfinance as yf

from util import xr_pct_change, safe_reindex
from data import get_factor_master, get_portfolios
from stats import get_volatility_set, get_correlation_set


In [2]:
def out(df):
    """Copy ``df`` to the system clipboard, sorted by index in descending order.

    xarray ``DataArray`` / ``Dataset`` inputs are first converted through their
    ``to_pandas()`` method; any other object is used as-is and must provide
    ``sort_index`` and ``to_clipboard``.

    Returns ``None``: the only effect is the clipboard write.
    """
    is_xarray = isinstance(df, (xr.DataArray, xr.Dataset))
    frame = df.to_pandas() if is_xarray else df
    descending = frame.sort_index(ascending=False)
    descending.to_clipboard()


In [3]:
def get_yahoo_data(ticker, field_name, cache=None):
    """Return one field (e.g. ``'Adj Close'``) of the price history for ``ticker``.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    field_name : str
        Column to extract from the downloaded (or cached) frame.
    cache : pandas.DataFrame, optional
        Previously downloaded frame. It is consulted only when its columns are
        a two-level MultiIndex containing ``(field_name, ticker)`` --
        presumably the layout of a multi-ticker ``yf.download`` result; verify
        against the installed yfinance version.

    Returns
    -------
    The cached column when it is available, otherwise
    ``yf.download(ticker)[field_name]``.
    """
    if cache is not None:
        cached_columns = cache.columns
        # Only a (field, ticker) MultiIndex is understood; any other layout,
        # or a ticker missing from the cache, falls through to a download.
        if cached_columns.nlevels == 2 and (field_name, ticker) in cached_columns:
            return cache[field_name][ticker]
    return yf.download(ticker)[field_name]


def get_yahoo_data_set(tickers, field_name, asset_names=None):
    """Fetch ``field_name`` for several tickers and combine them into one frame.

    ``asset_names`` supplies the column labels and is paired with ``tickers``
    positionally via ``zip`` (so the shorter of the two decides how many
    columns are built). When it is omitted the tickers themselves are used as
    labels.

    Returns a DataFrame whose index axis is named ``'date'`` and whose column
    axis is named ``'factor_name'``.
    """
    labels = tickers if asset_names is None else asset_names
    series_by_asset = {}
    for label, symbol in zip(labels, tickers):
        series_by_asset[label] = get_yahoo_data(symbol, field_name)
    combined = pd.DataFrame(series_by_asset)
    return combined.rename_axis(index='date', columns='factor_name')


def fill_returns(df):
    """Forward-fill gaps: every NaN takes the last valid value before it.

    Leading NaNs, which have nothing earlier to copy from, stay NaN.
    """
    filled = df.ffill()
    return filled


def get_business_days(df, factor_names):
    """Return the index labels on which every column in ``factor_names`` has a value."""
    selected = df[factor_names]
    fully_observed = selected.dropna(how='any')
    return fully_observed.index


def align_dates(df, business_day_factors):
    """Forward-fill ``df`` and keep only the rows that count as business days.

    A business day is any index label on which all ``business_day_factors``
    columns are populated (see ``get_business_days``). The frame is reindexed
    to the union of its own index and those dates, gaps are filled with
    ``fill_returns``, and the result is restricted to the business days.
    """
    business_dates = get_business_days(df, business_day_factors)
    all_dates = df.index.union(business_dates)
    expanded = df.reindex(all_dates)
    filled = fill_returns(expanded)
    return filled.loc[business_dates]


def calculate_returns(cret, diffusion_type):
    """Turn a level series into period-over-period returns.

    Parameters
    ----------
    cret : pandas.Series
        Level series; its ``name`` is used in the error message.
    diffusion_type : str
        ``'lognormal'`` -> ``pct_change()`` multiplied by 10,000
        (presumably basis points -- confirm);
        ``'normal'`` -> ``diff()`` multiplied by 100.

    Raises
    ------
    ValueError
        For any other ``diffusion_type``, including a missing (NaN) one.
    """
    if diffusion_type == 'lognormal':
        relative_change = cret.pct_change()
        return relative_change.mul(10_000)
    if diffusion_type == 'normal':
        absolute_change = cret.diff()
        return absolute_change.mul(100)
    # A 'normal10' variant (diff().div(10)) and a dedicated message for a
    # missing diffusion_type were sketched here but are not enabled.
    raise ValueError(f'Unsupported diffusion_type of {diffusion_type} for {cret.name}')


def calculate_returns_set(df, diffusion_map):
    """Apply ``calculate_returns`` column by column.

    ``diffusion_map`` is indexed with each column label of ``df`` to look up
    that column's diffusion type. The result has its index axis named
    ``'date'`` and its column axis named ``'factor_name'``.
    """
    returns_by_factor = {}
    for name in df.columns:
        returns_by_factor[name] = calculate_returns(df[name], diffusion_map[name])
    result = pd.DataFrame(returns_by_factor)
    return result.rename_axis(index='date', columns='factor_name')
    

def accumulate_returns(ret, diffusion_type, level=None):
    """Rebuild a level path from returns, anchored so its last value equals ``level``.

    Parameters
    ----------
    ret : pandas.Series
        Returns; divided by 10,000 (``'lognormal'``) or 100 (``'normal'``)
        before accumulation.
    diffusion_type : str
        ``'lognormal'`` compounds ``1 + r`` with ``cumprod`` and rescales the
        path multiplicatively; ``'normal'`` sums with ``cumsum`` and shifts
        the path additively.
    level : scalar, optional
        Value the final observation is pinned to. When omitted, the last
        element of ``ret`` itself is used.

    Raises
    ------
    ValueError
        For any other ``diffusion_type``.
    """
    # TODO: This drops the first observation
    anchor = ret.iloc[-1] if level is None else level
    if diffusion_type == 'lognormal':
        growth = ret.div(10_000).add(1).cumprod()
        return growth / growth.iloc[-1] * anchor
    if diffusion_type == 'normal':
        drift = ret.div(100).cumsum()
        return drift - drift.iloc[-1] + anchor
    raise ValueError(f'Unsupported diffusion_type of {diffusion_type} for {ret.name}')


def accumulate_returns_set(ret, diffusion_map, level_map=None):
    """Apply ``accumulate_returns`` column by column.

    ``diffusion_map`` is indexed with each column label of ``ret``.
    ``level_map`` supplies the anchor level per column through ``.get``:

    * omitted -> every column is passed ``None`` and ``accumulate_returns``
      applies its own default;
    * supplied but lacking a column -> that column is anchored at 100.

    The result has its index axis named ``'date'`` and its column axis named
    ``'factor_name'``.
    """
    if level_map is None:
        level_map = dict.fromkeys(ret.columns)
    paths = {}
    for name in ret.columns:
        paths[name] = accumulate_returns(ret[name], diffusion_map[name], level_map.get(name, 100))
    combined = pd.DataFrame(paths)
    return combined.rename_axis(index='date', columns='factor_name')

In [None]:
# Load the factor definitions from the 'read_new' sheet; the frame's index is
# used as the list of factor names.
factor_master = get_factor_master('factor_master.xlsx', 'read_new')
factor_list = factor_master.index
diffusion_map = factor_master['diffusion_type']

# Download adjusted closes, one column per factor.
levels_raw = get_yahoo_data_set(
    tickers=factor_master.loc[factor_list, 'ticker'],
    field_name='Adj Close',
    asset_names=factor_list,
)
# Keep only the dates on which both SPX and USD10 have data.
levels = align_dates(levels_raw, ['SPX', 'USD10'])
levels_latest = levels.iloc[-1]

# Returns, then level paths re-anchored to the latest observed levels.
ret = calculate_returns_set(levels, diffusion_map)
cret = accumulate_returns_set(ret, diffusion_map, levels_latest)

# Build Dataset

In [None]:
def build_factor_data2(halflifes: List[int]) -> xr.Dataset:
    """Build the factor Dataset: yfinance factors plus portfolios derived from them.

    Steps:

    1. Read the factor master ('read' sheet) and keep rows whose ``source``
       is ``"yfinance"``.
    2. Download adjusted closes for those factors and align them on the dates
       for which the ``'SPY'`` column has data.
    3. Convert levels to returns using each factor's ``diffusion_type``.
    4. Compute portfolio returns as ``ret_yf @ portfolios_weights``.
    5. Assemble ``ret``, ``cret``, ``vol`` and ``corr`` into an ``xr.Dataset``
       and attach the factor master as attributes of ``factor_name``.

    Parameters
    ----------
    halflifes : list of int
        Passed through to ``get_volatility_set`` / ``get_correlation_set``.

    Returns
    -------
    xr.Dataset
    """
    # TODO: Check vol units
    factor_master = get_factor_master('factor_master.xlsx', 'read')

    factor_list_yf = factor_master.query('source=="yfinance"').index
    # Names and tickers are zipped positionally by get_yahoo_data_set, so both
    # must come from the same yfinance-only subset; taking tickers from the
    # full factor list could pair a name with another factor's ticker.
    # NOTE(review): 'SPY' must be a factor name in this sheet; the other
    # loaders in this notebook align on ['SPX', 'USD10'] -- confirm.
    levels_yf = (get_yahoo_data_set(asset_names = factor_list_yf.tolist(),
                                    tickers = factor_master.loc[factor_list_yf, 'ticker'],
                                    field_name = 'Adj Close')
                 .pipe(align_dates, ['SPY'])
                 )

    diffusion_map = factor_master['diffusion_type']
    ret_yf = calculate_returns_set(levels_yf, diffusion_map)

    # Weights are reindexed against the factor master, missing weights become
    # zero, and only the yfinance rows are kept so they line up with ret_yf.
    portfolios_weights = (get_portfolios()
                          .pipe(safe_reindex, factor_master)
                          .fillna(0)
                          .loc[factor_list_yf]
                          )
    portfolios_ret = ret_yf @ portfolios_weights
    levels_latest = levels_yf.iloc[-1]

    factor_data = xr.Dataset()
    factor_data['ret']  = pd.concat([ret_yf, portfolios_ret], axis=1).rename_axis(columns='factor_name')
    factor_data['cret'] = accumulate_returns_set(factor_data['ret'].to_pandas(), diffusion_map, levels_latest)
    factor_data['vol']  = get_volatility_set(factor_data['ret'], halflifes)
    factor_data['corr'] = get_correlation_set(factor_data['ret'], halflifes)
    factor_data['factor_name'].attrs = factor_master.T.to_dict()

    return factor_data #, diffusion_map, levels_latest

# Half-lives handed to the volatility / correlation estimators
# (presumably in trading days -- confirm against the stats module).
halflifes = [21, 63, 121, 252]
# factor_data, diffusion_map, levels_latest = build_factor_data2(halflifes)
factor_data = build_factor_data2(halflifes)


In [None]:
# Display the volatility slice labelled 63 on the `vol_type` coordinate as a pandas object.
factor_data.vol.sel(vol_type=63).to_pandas()

In [None]:
def build_factor_data_yf(halflifes: List[int]) -> xr.Dataset:
    """Build a factor Dataset purely from Yahoo Finance data.

    Reads the factor master ('read_short' sheet), downloads adjusted closes
    for every factor, aligns them on the dates where both ``'SPX'`` and
    ``'USD10'`` have data, and derives returns and re-anchored level paths.

    Parameters
    ----------
    halflifes : list of int
        Passed through to ``get_volatility_set`` / ``get_correlation_set``.

    Returns
    -------
    xr.Dataset
        Variables ``levels``, ``ret``, ``cret``, ``vol`` and ``corr``; the
        factor master is attached as attributes of ``factor_name``.
    """
    factor_master = get_factor_master('factor_master.xlsx', 'read_short')
    factor_list = factor_master.index

    levels_raw = get_yahoo_data_set(asset_names = factor_list,
                                    tickers = factor_master.loc[factor_list, 'ticker'],
                                    field_name = 'Adj Close')
    levels = align_dates(levels_raw, ['SPX', 'USD10'])

    diffusion_map = factor_master['diffusion_type']
    levels_latest = levels.iloc[-1]

    ret = calculate_returns_set(levels, diffusion_map)
    cret = accumulate_returns_set(ret, diffusion_map, levels_latest)

    # Each frame is stacked to a (date, factor_name) series before conversion.
    factor_data = xr.Dataset({'levels': levels.stack().to_xarray(),
                              'ret':  ret.stack().to_xarray(),
                              'cret': cret.stack().to_xarray()})
    factor_data['vol']   = get_volatility_set(factor_data['ret'], halflifes)
    factor_data['corr']  = get_correlation_set(factor_data['ret'], halflifes)
    factor_data['factor_name'].attrs = factor_master.T.to_dict()
    return factor_data

halflifes = [21, 63, 121, 252]
# NOTE(review): this calls build_factor_data2, not the build_factor_data_yf
# defined earlier in this cell -- confirm which builder is intended.
ds = build_factor_data2(halflifes)

# Check duration

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm


def regression_plot(df, x, y):
    """Regress column ``y`` on column ``x`` by OLS, print the fitted line and plot it.

    Rows with a missing value in either column are dropped first. The
    intercept and slope are printed to two decimals, then a seaborn
    ``lmplot`` of the same data is shown. Returns ``None``.
    """
    pair = df[[x, y]].dropna()
    design = sm.add_constant(pair[x])  # Adds a constant term to the predictor
    fit = sm.OLS(pair[y], design).fit()

    # Display the equation of the line
    intercept, slope = fit.params
    print(f'Equation of line: Y = {intercept:.2f} + {slope:.2f}X')

    # Create scatter plot with line of best fit
    sns.lmplot(x=x, y=y, data=pair)
    plt.show()
    
    
# Uses the module-level `ret` frame, which must contain 'USD10' and 'TY' columns.
regression_plot(ret, 'USD10', 'TY')


# Junk

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError; presumably placed here
# on purpose so that "Run All" halts before the scratch cells under "Junk" --
# confirm before removing.
break

In [None]:
def get_yahoo_data(ticker, field_name, cache=None):
    """Return one field (e.g. ``'Adj Close'``) of the price history for ``ticker``.

    Note: this definition has the same name as one earlier in the notebook
    and replaces it once this cell runs.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    field_name : str
        Column to extract from the downloaded (or cached) frame.
    cache : pandas.DataFrame, optional
        Previously downloaded frame. It is consulted only when its columns are
        a two-level MultiIndex containing ``(field_name, ticker)`` --
        presumably the layout of a multi-ticker ``yf.download`` result; verify
        against the installed yfinance version.

    Returns
    -------
    The cached column when it is available, otherwise
    ``yf.download(ticker)[field_name]``.
    """
    if cache is not None:
        cached_columns = cache.columns
        # Only a (field, ticker) MultiIndex is understood; any other layout,
        # or a ticker missing from the cache, falls through to a download.
        if cached_columns.nlevels == 2 and (field_name, ticker) in cached_columns:
            return cache[field_name][ticker]
    return yf.download(ticker)[field_name]

# cache = yf.download(factor_master['ticker'].iloc[:-1].to_list())

# One adjusted-close series per factor, keyed by factor name.
lvl_raw_dict = {
    factor: get_yahoo_data(factor_master.loc[factor, 'ticker'], 'Adj Close')
    for factor in factor_master.index
}
lvl_raw = pd.DataFrame(lvl_raw_dict)
# TODO: Confirm outer join


In [None]:
business_days = None
# NOTE(review): in the visible notebook, a module-level `df` is first assigned
# in a later cell, so this line depends on leftover kernel state.
df.index

In [None]:
def my_ffill(df, business_days, method=None, na_tolerance=None, diffusion_type=None):
    """Reindex ``df`` so that its index also contains ``business_days``.

    Work in progress: despite the name, no forward fill is performed yet --
    only step 1 of the plan below is implemented, and ``method``,
    ``na_tolerance`` and ``diffusion_type`` are currently unused.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to extend.
    business_days : index-like or None
        Dates to add to the index. ``None`` leaves ``df`` untouched.

    Returns
    -------
    ``df`` reindexed to the union of its index and ``business_days`` (new rows
    are NaN), or ``df`` itself when ``business_days`` is ``None``.
    """
    # 1) Include business_days in df index
    # Only reindex when there is something to merge in; the union call cannot
    # take None.
    if business_days is not None:
        full_dates = df.index.union(business_days)
        df = df.reindex(full_dates)
    # 2) Count consecutive NAs
    # 3) Extract dates with NAs and their count (build replacement dataframe)
    # 4) If consecutive NAs < na_tolerance, replacement data = 0 (for now) else nan
    # 5) Forward fill
    # 6) Add replacement (respecting diffusion_type)
    return df


def get_business_days(df, factors=['SPX']):
    """Return the index labels on which every column in ``factors`` has a value.

    With the default argument only the ``'SPX'`` column is checked. The
    default list is never modified here.
    """
    selected = df[factors]
    fully_observed = selected.dropna(how='any')
    return fully_observed.index


def calculate_returns(levels, diffusion_type):
    """Return first differences (``'diff'``) or percentage changes (``'pct'``) of ``levels``.

    Note: this definition has the same name as one earlier in the notebook
    but accepts a different set of ``diffusion_type`` values and applies no
    scaling.

    Raises
    ------
    ValueError
        When ``diffusion_type`` is neither ``'diff'`` nor ``'pct'``.
    """
    if diffusion_type not in ('diff', 'pct'):
        raise ValueError('diffusion_type must be either "diff" or "pct"')
    return levels.diff() if diffusion_type == 'diff' else levels.pct_change()

In [None]:
# Extend lvl_raw's index with the dates returned by get_business_days (called with its default factors).
my_ffill(lvl_raw, get_business_days(lvl_raw))

In [None]:
import pandas as pd
import numpy as np

# Toy example: four consecutive daily observations ending today, with a
# two-day gap of missing values in the middle.
data = [100, np.nan, np.nan, 101]
date_range = pd.date_range(end=pd.Timestamp.today().date(), periods=len(data))

# Wrap the values in a Series indexed by those dates (named `df` although it
# is a Series, not a DataFrame).
df = pd.Series(data, index=date_range) #, columns=['Value'])

# pd.concat([df, df.pct_change()], axis=1)

df

In [None]:


asset_list = ['SPY', 'IWM', 'IEF', '^TNX', '^FCHI']
# NOTE(review): only two diffusion types are listed for five assets, and this
# list is not used below.
diffusion_types = ['lognormal', 'lognormal', ]
data = yf.download(asset_list)

# Adjusted-close levels, copied to the clipboard.
lvl = data['Adj Close']
lvl.pipe(out)

# `out` has no return value, so piping straight into the assignment would bind
# `ret` to None. Keep the returns in `ret` and copy them to the clipboard in a
# separate step (this replaces the levels copied above).
ret = lvl.pct_change()
ret.pipe(out)
# Union any other data sources




In [None]:
# Extend lvl_raw's index with the dates returned by get_business_days (called with its default factors).
my_ffill(lvl_raw, get_business_days(lvl_raw))

In [None]:
import pandas as pd
import numpy as np

# Toy example: four consecutive daily observations ending today, with a
# two-day gap of missing values in the middle.
data = [100, np.nan, np.nan, 101]
date_range = pd.date_range(end=pd.Timestamp.today().date(), periods=len(data))

# Wrap the values in a Series indexed by those dates (named `df` although it
# is a Series, not a DataFrame).
df = pd.Series(data, index=date_range) #, columns=['Value'])

# pd.concat([df, df.pct_change()], axis=1)

df

In [None]:
import pandas as pd

# Print the installed pandas version.
print(pd.__version__)


In [None]:
import pandas as pd
# Today's date (time of day dropped via .date()), printed for reference.
current_date = pd.Timestamp.today().date()
print(current_date)
