# Synthetic Data Generation

## Preliminaries

In [None]:
# imports 
from ev_scoring import ExtremeValueScoring
from market_utils import MarketUtilities
from ydata.connectors import GCSConnector
from ydata.dataset.filetype import FileType
from ydata.metadata import Metadata
from ydata.synthesizers.timeseries.model import TimeSeriesSynthesizer
from ydata.utils.data_types import VariableType
from ydata.utils.formats import read_json

## Information/Notes

Below are resources on generative adversarial networks (GANs) for time series:
- https://github.com/ydataai/ydata-synthetic/blob/master/data/stock_data.csv
- https://papers.nips.cc/paper/2019/file/c9efe5f26cd17ba6216bbe2a7d26d490-Paper.pdf: *Time Series Generative Adversarial Networks*

## Preliminaries

In [None]:
ev = ExtremeValueScoring(wrds_username='audreymcmillion')
db = ev.wrds_db
conn = ev.sqlite_conn
mkt_utils = MarketUtilities(wrds_username='audreymcmillion', wrds_db = db, sqlite_conn = "auto")

In [None]:
from dotenv import load_dotenv
load_dotenv()

## Data Extraction

In [None]:
with open("sql_lib/interday_highlow_query.sql", "r") as file:
    interday_hl_template = file.read()

In [None]:
# Fill the inter-day high/low SQL template for the test ticker (CZR) and run it
# through the WRDS connection held by `mkt_utils`.
czr_query_params = {'symbol': 'CZR', 'start_dt': '2015-06-01', 'end_dt': '2020-02-01'}
czr_df = mkt_utils.wrds_db.raw_sql(interday_hl_template.format(**czr_query_params))

In [None]:
czr_df['dlyclose'].plot() # plotting the price

In [None]:
czr_df

## Neural Network Model

In [None]:
sub_czr_df = czr_df[['dlycaldt', 'dlyopen', 'dlyclose', 'dlyhigh', 'dlylow', 'dlynumtrd', 'dlyvol']]

In [None]:
synth = TimeSeriesSynthesizer()

In [None]:
# czr_df[['dlycaldt', 'dlyopen', 'dlyclose', 'dlyhigh', 'dlylow', 'dlyvol']].to_csv("test.csv", index=False)

In [None]:
# Load a CSV through ydata's LocalConnector and preview it.
# NOTE(review): `datetime` and `connectors` are imported here but not used in this cell.
from ydata.connectors import LocalConnector
import pandas as pd
import datetime
from ydata import connectors

connector = LocalConnector()

# Read the data
# NOTE(review): the only visible writer of 'test.csv' is the `to_csv` call that is
# commented out in an earlier cell, so on a fresh run this read appears to depend on
# a file left over from a previous session — confirm the file is expected to exist.
data = connector.read_file('test.csv')
data.head()

In [None]:
print(data)

In [None]:
from ydata.dataset import Dataset
my_data = Dataset(sub_czr_df)

dataset_attrs = {"sortbykey": "dlycaldt"}
m = Metadata(my_data, dataset_attrs=dataset_attrs)

In [None]:
# data.schema
# m.summary

In [None]:
synth = TimeSeriesSynthesizer()
synth.fit(my_data, m)

In [None]:
# my_sample = synth.sample(n_entities = 200)

In [None]:
my_data.to_pandas()

In [None]:
sub_czr_df['dlycaldt'].nunique()

## Extreme Value Theory

Methodology:
1. Generate an entire return series using a baseline AR(1)-GARCH(1,1) model.
2. Replace the extremes generated by this simple model with draws from a parameterized generalized extreme value (GEV) distribution.
   - We assume the block-maxima method is used to isolate the extremes within these periods.
   - A challenge with this method is the IID assumption. Perhaps we can implement some controls that prevent the sampled extreme value from being too far from the prior generated value.

### CZR: A Test Case

In [None]:
# Load the inter-day high/low SQL template from sql_lib.
with open("sql_lib/interday_highlow_query.sql", "r") as sql_file:
    interday_hl_template = sql_file.read()

# Fill the template for the test ticker (CZR) and run it through the WRDS
# connection held by `mkt_utils`.
czr_query_params = {'symbol': 'CZR', 'start_dt': '2015-06-01', 'end_dt': '2020-02-01'}
czr_df = mkt_utils.wrds_db.raw_sql(interday_hl_template.format(**czr_query_params))

In [None]:
# use a more "stable" time period
hl_series = czr_df[(czr_df.dlycaldt >= '2019-03-25') & (czr_df.dlycaldt <= '2020-01-07')]["log_highlow_diff"].reset_index(drop=True)

In [None]:
from arch import arch_model
import numpy as np
am = arch_model(hl_series, mean='ARX', lags=1, vol='GARCH', p=1, q=1, dist='t')
res = am.fit(disp='off')

In [None]:
res

In [None]:
hl_series

In [None]:
# simulate data using the parameters from the AR-GARCH model
sim_data = am.simulate(
    params=res.params, 
    nobs=400,
    initial_value=hl_series.iloc[-1], 
    x=None,                       
    burn=100                      
)

In [None]:
sim_data = sim_data.reset_index().rename(columns={"index": "t"})

In [None]:
czr_df

In [None]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

fig = px.line(czr_df[(czr_df.dlycaldt >= '2019-01-01') & (czr_df.dlycaldt <= '2020-01-07')], x='dlycaldt', y="log_highlow_diff", title="Original Diff[Log(High/Low)] Data")
fig.show()

In [None]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

fig = px.line(sim_data[:200], x='t', y="data", title="AR-GARCH-Generated Synthetic Data")
fig.show()

**Extreme value injection**

In [None]:
from pyextremes import EVA

In [None]:
import pandas as pd
def flag_max_n(series: pd.Series, n: int) -> pd.Series:
    """Flag the maximum of each consecutive block of ``n`` observations.

    The series is split into non-overlapping blocks of length ``n`` in its
    existing order (the last block may be shorter). Within each block, the
    first occurrence of the largest non-NaN value is flagged.

    Args:
        series: Values to scan.
        n: Block length; must be a positive integer.

    Returns:
        An integer Series sharing ``series``' index, with 1 at each block
        maximum and 0 elsewhere.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    flags = pd.Series(0, index=series.index)

    for start in range(0, len(series), n):
        chunk = series.iloc[start:start + n]

        # A block with no valid observations has no maximum; leave it unflagged.
        if not chunk.notna().any():
            continue

        # Locate the maximum by position rather than by label so that duplicate
        # index labels cannot cause more than one row per block to be flagged.
        flags.iloc[start + int(chunk.argmax())] = 1

    return flags
    
sim_data['max_flag'] = flag_max_n(sim_data['data'], n=10)

In [None]:
sim_data['max_flag_str'] = sim_data['max_flag'].astype(str)

In [None]:
import plotly.express as px

def plot_extremes(sim_data, x, y, color_col):
    """Show a scatter of ``y`` against ``x`` with flagged points highlighted.

    Points are coloured by ``color_col`` ('0' -> gray, '1' -> red) and a gray
    line through all points is drawn on top without a legend entry.

    Args:
        sim_data: DataFrame holding the columns named by the other arguments.
        x: Name of the column plotted on the horizontal axis.
        y: Name of the column plotted on the vertical axis.
        color_col: Name of the column whose string values select the colour.
    """
    flag_palette = {'0': 'gray', '1': 'red'}
    connector_style = {'color': 'gray'}

    figure = px.scatter(
        sim_data, x=x, y=y, color=color_col, color_discrete_map=flag_palette
    )

    # Join the markers so the series reads as a path through time.
    figure.add_scatter(
        x=sim_data[x],
        y=sim_data[y],
        mode='lines',
        line=connector_style,
        showlegend=False,
    )

    figure.show()

plot_extremes(sim_data, x='t', y='data', color_col='max_flag_str')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import genextreme

def plot_genextreme_distributions(results, colors=None, labels=None):
    """Plot the densities of one or more ``scipy.stats.genextreme`` distributions.

    Args:
        results: Iterable of dicts, each with a ``'parameters'`` dict containing
            ``'c'`` and optionally ``'loc'`` (default 0) and ``'scale'`` (default 1).
        colors: Optional sequence of line colours, matched to ``results`` by position.
        labels: Optional sequence of legend labels, matched by position; entries
            without a label fall back to ``'<position>: c=<shape>'``.
    """
    plt.figure(figsize=(10, 6))

    for idx, spec in enumerate(results):
        params = spec['parameters']
        c = params['c']
        loc = params.get('loc', 0)
        scale = params.get('scale', 1)

        # Evaluate the density between the 1% and 90% quantiles so the grid
        # stays inside the distribution's support.
        lower = genextreme.ppf(0.01, c, loc=loc, scale=scale)
        upper = genextreme.ppf(0.9, c, loc=loc, scale=scale)
        grid = np.linspace(lower, upper, 300)
        density = genextreme.pdf(grid, c, loc=loc, scale=scale)

        if colors and idx < len(colors):
            color = colors[idx]
        else:
            color = None

        if labels and idx < len(labels):
            label = labels[idx]
        else:
            label = f'{idx}: c={c:.2f}'

        plt.plot(grid, density, lw=2, color=color, label=label)

    plt.title('Generalized Extreme Value (GEV) Distributions')
    plt.xlabel('x')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Reference extreme-value distributions.
# Each spec names the model and gives its shape (`c`), `loc` and `scale` parameters.
reference_ev_orig = dict(
    model='genextreme',
    parameters=dict(c=0.5, loc=1.5, scale=1),
)

reference_ev_new = dict(
    model='genextreme',
    parameters=dict(c=-1.5, loc=1, scale=0.75),
)
plot_genextreme_distributions([reference_ev_orig, reference_ev_new])

In [None]:
# TODO: add a plot here to compare the parameters of the two distributions -> are they significantly different from each other?

In [None]:
from scipy.stats import genextreme
def simple_accept_reject_sample(c, loc, scale, conditional_std, max_deviation = 3, max_tries=100,
                                max_batches=1000):
    """Draw one GEV sample that does not exceed ``max_deviation * conditional_std``.

    Candidates are drawn from ``scipy.stats.genextreme`` in batches of
    ``max_tries``; the first candidate at or below the threshold is returned.

    Args:
        c: Shape parameter, as defined by ``scipy.stats.genextreme``.
        loc: Location parameter.
        scale: Scale parameter.
        conditional_std: Conditional standard deviation used to set the cap.
        max_deviation: Multiplier applied to ``conditional_std`` to form the cap.
        max_tries: Number of candidates drawn per batch.
        max_batches: Maximum number of batches to draw before giving up.

    Returns:
        A single accepted sample (numpy float).

    Raises:
        ValueError: If no sample can ever be accepted, i.e. the threshold lies
            below the distribution's support or is NaN.
        RuntimeError: If no candidate is accepted within ``max_batches`` batches.
    """
    threshold = max_deviation * conditional_std

    # Probability that a single draw is accepted. When it is zero (or NaN) the
    # sampling loop could never succeed, so fail fast instead of spinning forever.
    accept_prob = genextreme.cdf(threshold, c, loc=loc, scale=scale)
    if not accept_prob > 0:
        raise ValueError(
            f"No GEV sample can satisfy x <= {threshold!r} "
            f"(c={c!r}, loc={loc!r}, scale={scale!r}); acceptance probability is {accept_prob!r}."
        )

    for _ in range(max_batches):
        samples = genextreme.rvs(c, loc=loc, scale=scale, size=max_tries)
        valid_samples = samples[samples <= threshold]
        if valid_samples.size:
            return valid_samples[0]

    raise RuntimeError(
        f"No sample accepted after {max_batches} batches of {max_tries} draws "
        f"(acceptance probability {accept_prob:.3g})."
    )

In [None]:
# Build the extreme-injected series: every flagged block maximum is replaced by a
# draw from a reference GEV distribution, capped via the row's conditional
# volatility; all other observations are carried over unchanged.
# Rows at positions up to `max_orig` use `reference_ev_orig`; later rows use
# `reference_ev_new`.
new_values = []
max_orig = len(sim_data) / 2
for i, (_, row) in enumerate(sim_data.iterrows()):
    if row["max_flag"] != 1:
        new_values.append(row['data'])
        continue

    # Pick the reference distribution for this half of the series.
    ev_params = (reference_ev_orig if i <= max_orig else reference_ev_new)['parameters']
    gen_sample = simple_accept_reject_sample(ev_params['c'],
                                             ev_params['loc'],
                                             ev_params['scale'],
                                             row["volatility"], max_deviation = 3)
    new_values.append(gen_sample)

In [None]:
sim_data['data_ext']  = new_values

In [None]:
import plotly.express as px
import plotly.graph_objects as go

def plot_extremes_compare(sim_data, x, y, y2, color_col):
    """Overlay two series on one figure, highlighting flagged points in each.

    The ``y`` series is drawn as a scatter coloured by ``color_col``
    ('0' -> gray, '1' -> red) plus a solid gray line. The ``y2`` series is drawn
    as markers coloured by the same column ('0' -> gray, '1' -> green) plus a
    dotted gray line.

    Args:
        sim_data: DataFrame holding the columns named by the other arguments.
        x: Name of the column plotted on the horizontal axis.
        y: Name of the first series' column.
        y2: Name of the second series' column.
        color_col: Name of the column whose string values select the colours.
    """
    # Base layer: scatter of `y`, coloured by flag.
    figure = px.scatter(
        sim_data,
        x=x,
        y=y,
        color=color_col,
        color_discrete_map={'0': 'gray', '1': 'red'},
    )

    x_values = sim_data[x]
    y2_marker_colors = sim_data[color_col].map({'0': 'gray', '1': 'green'})

    # Remaining layers, in drawing order.
    overlays = [
        # Line through the `y` points.
        go.Scatter(
            x=x_values,
            y=sim_data[y],
            mode='lines',
            line=dict(color='gray'),
            name='original line',
            showlegend=True,
        ),
        # Markers for `y2`, coloured by flag.
        go.Scatter(
            x=x_values,
            y=sim_data[y2],
            mode='markers',
            marker=dict(color=y2_marker_colors),
            name='simulated extremes',
            showlegend=True,
        ),
        # Dotted line through the `y2` points.
        go.Scatter(
            x=x_values,
            y=sim_data[y2],
            mode='lines',
            line=dict(color='gray', dash='dot'),
            name='simulated line',
            showlegend=True,
        ),
    ]
    for trace in overlays:
        figure.add_trace(trace)

    figure.show()

In [None]:
plot_extremes_compare(sim_data, x='t', y='data', y2= 'data_ext', color_col='max_flag_str')

**Transform this back to a candlestick-type chart to visualize the feasibility of such a series**

1. Get a mean value for the log(H/L) ratio.

In [None]:
czr_subset = czr_df[(czr_df.dlycaldt >= '2019-03-25') & (czr_df.dlycaldt <= '2020-01-07')].copy().reset_index(drop=True)

In [None]:
czr_subset["hl_ratio"] = np.exp(czr_subset["log_highlow"])

In [None]:
low_mean = czr_subset["dlylow"].mean()
high_mean = czr_subset["dlyhigh"].mean()

In [None]:
initial_ratio = round(high_mean,2)/round(low_mean,2) # round to two decimal points to be realistic

In [None]:
initial_ratio # get the inital ratio

In [None]:
czr_subset.hl_ratio.mean() # compare this to the true mean

In [None]:
czr_subset.hl_ratio.describe()

Process:
1. Divide ``log_highlow_diff`` by 100 (since it is scaled by 100).
2. Exponentiate the result to remove the log; this gives us $\frac{H_{t}/L_{t}}{H_{t-1}/L_{t-1}}$.
3. Set an initial $H_{0}/L_{0}$ value and use it to recover the remaining $H/L$ ratios recursively.

In [None]:
sim_data

In [None]:
transformed_srs = np.exp(sim_data.data_ext/100)
untransformed_srs = np.exp(sim_data.data/100)

In [None]:
transformed_srs # value can

In [None]:
# Rebuild the H/L ratio path: start from the initial ratio and compound each
# period-over-period change, flooring every value at 1
# (presumably because a high/low ratio should not fall below 1 — confirm).
prev_val = float(initial_ratio)
result_series = [prev_val]
for val in untransformed_srs[1:]:
    # val = (H_t / L_t)/(H_{t-1} / L_{t-1})
    prev_val = max(1, float(val) * prev_val)
    result_series.append(prev_val)

In [None]:
pd.Series(result_series[:200]).plot()

In [None]:
czr_subset.hl_ratio.plot()

**Transform the series back to enforce the minimum**

**Fit the extreme value distributions iteratively to the simulated data as a quality check**

Beyond this simple accept-reject algorithm based on the conditional standard deviation, we should improve the procedure by incorporating:
- (a) The conditional mean component of the AR(1)-GARCH(1,1) model.
- (b) A control on deviations between consecutive time steps (although this is already partly captured by the GARCH model).