In [None]:
import xarray as xr
# Session-wide xarray options: keep_attrs=True asks xarray to carry `.attrs`
# through operations, and display_expand_data=False keeps the data section
# collapsed in object reprs.
# NOTE(review): factor metadata is read from `data.asset.attrs` further down,
# which is presumably why keep_attrs is switched on — confirm.
xr.set_options(keep_attrs=True,
               display_expand_data=False)
import pandas as pd

import plotly.express as px
import plotly.io as pio
# pio.renderers.default='png'

from stats import get_factor_data


In [None]:
# Load the factor dataset once; `data` and `factor_data` are two names for
# the same object.
data = factor_data = get_factor_data()

# Metadata table built from the attrs stored on the `asset` coordinate
# (presumably one row per asset after transposing — confirm against the attrs layout).
factor_master = pd.DataFrame(data.asset.attrs).transpose()


[*********************100%%**********************]  30 of 30 completed


In [3]:
# halflifes = [21, 63, 126, 252, 512]
# data = get_factor_data(halflifes)
# factor_master = pd.DataFrame(data['asset'].attrs).T
# factor_list = factor_master.index.to_list()

# MDS

In [4]:
from sklearn.manifold import MDS
from numpy import sqrt

def multidimensional_scaling(correlation_matrix: pd.DataFrame, init=None) -> pd.DataFrame:
    """
    Perform multidimensional scaling on a correlation matrix.

    Correlations are turned into dissimilarities with ``sqrt(1 - corr**2)``,
    so a correlation of +1 or -1 maps to distance 0 and a correlation of 0
    maps to distance 1.

    Parameters
    ----------
    correlation_matrix : pd.DataFrame
        The input correlation matrix as a pandas DataFrame.
    init : array-like, optional
        Initial positions of the points in the embedding space, forwarded to
        ``MDS.fit_transform``. If None, random initialization is used.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the MDS results with dimensions 'dim1' and 'dim2',
        indexed like ``correlation_matrix``.
    """
    # Round-off can leave |corr| marginally above 1, which makes 1 - corr**2
    # slightly negative and sqrt() return NaN (rejected by MDS). Clip at 0 first.
    squared_dissimilarity = (1 - correlation_matrix**2).clip(lower=0)
    dissimilarity_matrix = sqrt(squared_dissimilarity)

    # Pass n_init explicitly to suppress warning when init is not None:
    n_init = 4 if init is None else 1
    embedding = MDS(dissimilarity='precomputed', random_state=42, n_init=n_init)
    coordinates = embedding.fit_transform(dissimilarity_matrix, init=init)

    return pd.DataFrame(coordinates,
                        index=dissimilarity_matrix.index,
                        columns=pd.Index(['dim1', 'dim2'], name='dimension'))


def mds_ts_df(corr: xr.DataArray, start_date = None, corr_type = 63) -> pd.DataFrame:
    """
    Compute a 2-D MDS embedding for every date and stack them into one frame.

    Each date's embedding is initialised with the coordinates found for the
    previous date (the first date starts from MDS's own initialisation),
    presumably so that consecutive embeddings stay comparable.

    Parameters
    ----------
    corr : xr.DataArray
        Correlations selectable by ``date`` and ``corr_type``; each
        ``(date, corr_type)`` selection is converted with ``to_pandas()`` and
        handed to ``multidimensional_scaling``.
    start_date : optional
        Lower bound of the label-based date slice. None keeps every date.
    corr_type : optional
        Label selected on the ``corr_type`` dimension. Defaults to 63, the
        value that used to be hard-coded.

    Returns
    -------
    pd.DataFrame
        Coordinates with columns 'dim1' and 'dim2', indexed by
        ('date', <index name of the per-date correlation frame>).
        Note this is a DataFrame, not an xarray object.

    Raises
    ------
    ValueError
        If the date slice selects no dates.
    """
    dates = corr.date.sel(date=slice(start_date, None)).values
    if len(dates) == 0:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No dates available from start_date={start_date!r}")

    coordinates = None  # no warm start for the first date
    inner_index_name = None
    mds_by_date = {}
    for date in dates:
        corr_frame = corr.sel(date=date, corr_type=corr_type).to_pandas()
        # Warm-start from the previous date's solution.
        coordinates = multidimensional_scaling(corr_frame, init=coordinates)
        mds_by_date[date] = coordinates
        inner_index_name = corr_frame.index.name

    return pd.concat(mds_by_date).rename_axis(index=['date', inner_index_name])


# corr_df = data.corr.sel(date=data.date.max(), corr_type=63).to_pandas()
# df = multidimensional_scaling(corr_df)


def draw_mds(df: pd.DataFrame) -> px.scatter:
    """
    Scatter-plot a single MDS embedding.

    ``df`` is joined on its index with the notebook-level ``factor_master``
    table, then plotted as dim1 vs dim2 with one text label per 'asset' and
    colours taken from 'asset_class'.

    Returns the figure produced by ``px.scatter`` after styling.
    """
    plot_frame = df.join(factor_master).reset_index()

    figure = px.scatter(
        plot_frame,
        x='dim1',
        y='dim2',
        text='asset',
        color='asset_class',
        template='plotly_white',
        height=750,
        width=750,
    )
    # Labels sit to the right of each marker in a muted colour.
    figure.update_traces(textposition='middle right', textfont={'color': 'lightgray'})
    # The MDS axes carry no units, so drop their titles.
    figure.update_xaxes(title=None)
    figure.update_yaxes(title=None)
    return figure


def draw_mds_ts(df: pd.DataFrame) -> px.scatter:
    """
    Animated scatter of MDS embeddings, one animation frame per 'date'.

    ``df`` must already hold the columns used here: 'dim1', 'dim2', 'asset',
    'asset_class' and 'date'. Unlike ``draw_mds`` no join is performed.

    Returns the figure produced by ``px.scatter`` after styling.
    """
    figure = px.scatter(
        df,
        x='dim1',
        y='dim2',
        text='asset',
        color='asset_class',
        animation_frame='date',
        template='plotly_white',
        height=750,
        width=750,
    )
    # Labels sit to the right of each marker in a muted colour.
    figure.update_traces(textposition='middle right', textfont={'color': 'lightgray'})
    # The MDS axes carry no units, so drop their titles.
    figure.update_xaxes(title=None)
    figure.update_yaxes(title=None)
    return figure

# mds_series = mds_da.to_series().reset_index().join(factor_master, on='asset')
# draw_mds_ts(mds_series)
# mds_series

# mds_da.to_series().reset_index().pivot(index='date', columns='dimension')

# mds_df

In [5]:
# MDS coordinates for every date from 2024 onwards.
df = mds_ts_df(data.corr, start_date='2024')

# Flatten the index, attach the factor metadata by asset, and turn the dates
# into strings (presumably so each date becomes a discrete animation frame label).
df1 = (
    df.reset_index()
    .join(factor_master, on='asset')
    .assign(date=lambda frame: frame['date'].astype(str))
)
draw_mds_ts(df1)

In [6]:
# Static view: the embedding for the first date present in `df`.
draw_mds(df.xs(df.index.get_level_values(0)[0]))

In [7]:
# NOTE(review): `break` outside a loop is a SyntaxError (see this cell's output).
# Presumably a deliberate stop so that "Run All" halts before the scratch cells
# below — confirm, and consider an explicit, self-explaining stop instead.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Scratch

In [None]:
# NOTE(review): in the cells shown, `fig` only exists as a local inside
# draw_mds / draw_mds_ts, so this raises NameError unless `fig` was assigned
# interactively — confirm or remove.
type(fig)

In [None]:
# One-off embedding: correlation frame for a single date piped into multidimensional_scaling.
# NOTE(review): `date` is not assigned at notebook level before this cell (only
# as a loop variable in a later scratch cell), so this depends on leftover
# kernel state — confirm which date is intended.
data.corr.sel(date=date, corr_type=63).to_pandas().pipe(multidimensional_scaling)

In [None]:
# Abandoned draft: build one MDS frame per date with a dict comprehension.
# mds_ts = {date: data.corr.sel(date=date, corr_type=63).to_pandas().pipe(multidimensional_scaling)
#           for date in data.corr.date}
# for date in data.corr.date:
# Display the date coordinate the draft above would iterate over.
data.corr.date

In [None]:
# Scratch: single-date MDS written inline. It uses the same
# sqrt(1 - corr**2) dissimilarity as multidimensional_scaling, but lets MDS
# use its default n_init.
# NOTE(review): `ds` is not assigned in any cell shown; the cells above use
# `data` for what looks like the same dataset — confirm before running.
from sklearn.manifold import MDS
from numpy import sqrt


# Correlation frame for one hard-coded date.
correlation_matrix = ds.corr.sel(date='2024-10-25', corr_type=63).to_pandas()
dissimilarity_matrix = sqrt(1 - correlation_matrix**2)


mds = MDS(dissimilarity='precomputed', random_state=42)
mds_results = mds.fit_transform(dissimilarity_matrix)
mds_df = pd.DataFrame(mds_results, index=dissimilarity_matrix.index, columns=['dim1', 'dim2'])


# Three views of the same result: plain text, a pandas scatter plot, and plotly.
print(mds_df)
mds_df.plot.scatter(x='dim1', y='dim2')

px.scatter(mds_df.reset_index(), x='dim1', y='dim2', hover_name='asset')

In [None]:
# Scratch: single-date MDS using `1 - corr` as the dissimilarity. Unlike the
# sqrt(1 - corr**2) form used in multidimensional_scaling, this places
# negatively correlated series far apart (distance up to 2) instead of close.
# NOTE(review): `ds` is not assigned in any cell shown; the cells above use
# `data` for what looks like the same dataset — confirm before running.
from sklearn.manifold import MDS

corr_df = ds.corr.sel(date='2024-10-25', corr_type=63).to_pandas()
mds = MDS(dissimilarity='precomputed', random_state=42)
mds_results = mds.fit_transform(1 - corr_df)
mds_df = pd.DataFrame(mds_results, index=corr_df.index, columns=['MDS1', 'MDS2'])
# Three views of the same result: plain text, a pandas scatter plot, and plotly.
print(mds_df)
mds_df.plot.scatter(x='MDS1', y='MDS2')

px.scatter(mds_df.reset_index(), x='MDS1', y='MDS2', hover_name='asset')

In [None]:
# Scratch: MDS for every date, collected into a (date, asset, mds_dim) DataArray.
# NOTE(review): `ds` is not assigned in any cell shown; the cells above use
# `data` for what looks like the same dataset — confirm before running.
# `MDS` comes from the earlier `from sklearn.manifold import MDS` cells.
import numpy as np
from sklearn.impute import SimpleImputer

# Both estimators are refitted by every fit_transform call, so build them once.
imputer = SimpleImputer(strategy='mean')
mds = MDS(dissimilarity='precomputed', random_state=42)

mds_results_list = []
for date in ds.date.values:
    corr_df = ds.corr.sel(date=date, corr_type=63).to_pandas()
    # Fill missing dissimilarities with column means so MDS receives no NaN.
    corr_df_imputed = imputer.fit_transform(1 - corr_df)
    # Fit on the imputed matrix only. A second fit on the raw `1 - corr_df`
    # used to overwrite this result; it doubled the work and gave the same
    # result whenever the raw matrix had no NaN (imputation is then a no-op).
    mds_results = mds.fit_transform(corr_df_imputed)
    mds_results_list.append(mds_results)

mds_results_array = np.stack(mds_results_list)
mds_da = xr.DataArray(mds_results_array, coords=[ds.date, corr_df.index, ['MDS1', 'MDS2']], dims=['date', 'asset', 'mds_dim'])
print(mds_da)
