# Extreme Value Experimentation

In [None]:
import datetime as dt
import os
import pprint
import sqlite3
import time
from datetime import datetime, timedelta

import feedparser
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from pyextremes import get_extremes
from pyextremes.plotting import plot_extremes
from tqdm import tqdm

# Never truncate the column list when a frame is rendered.
pd.options.display.max_columns = None

# Render floats with four fixed decimals instead of scientific notation.
pd.options.display.float_format = '{:.4f}'.format

Potential Python packages include:  
- ``pyextremes``
- ``scipy.stats``

## Read in Anomaly Datasets

In [None]:
# Load the two anomaly CSVs from the local test-data folder.
low_anom, high_anom = (
    pd.read_csv("test_data/low_anomalies.csv"),
    pd.read_csv("test_data/high_anomalies.csv"),
)

In [None]:
# Show the high anomalies with the latest `current_date` first.
high_anom.sort_values(by="current_date", ascending=False)

In [None]:
# imports
import pandas_market_calendars as mcal
import wrds
# Open the WRDS session that the raw SQL query further down runs against.
wrds_user = 'audreymcmillion'
db = wrds.Connection(wrds_username=wrds_user)

We will look into one particular anomalous symbol day: PALI on 2023-09-06.

In [None]:
# Load the halt records from the local test-data folder.
halt_records_path = "test_data/halt_records.csv"
luld_data = pd.read_csv(halt_records_path)

## Test a single (symbol, date)

In [None]:
# Halt records for PALI on the day under study (halt_date is compared as an MM/DD/YYYY string).
is_pali = luld_data["symbol"] == 'PALI'
is_halt_day = luld_data["halt_date"] == '09/06/2023'
luld_data[is_pali & is_halt_day]

In [None]:
# Per-second PALI trade bars from TAQ, 2023-08-15 through 2023-09-27.
# The query:
#   1. keeps trades between 09:30:00 and 16:00:00 whose sale condition does not match [OPQ65],
#      truncating each trade time to the second;
#   2. aggregates per (date, symbol, second): avg/max/min price, summed size, max sequence number;
#   3. adds, within each date, the change in average price and the elapsed seconds
#      relative to the previous populated second (NULL on the first bar of each date).
pali_query = """
    with rw_trades as (
    	select *,
    	       date_trunc('second', time_m) as trunc_time
    	from taqm_2023.ctm_2023 c 
    	where c.date between date('2023-08-15') and date('2023-09-27')
    	and c.sym_root = 'PALI'
    	and time_m >= '09:30:00'
    	and time_m <= '16:00:00'
    	and tr_scond !~ '[OPQ65]'
    ),

    aggregated as (
        select r.date, 
               r.sym_root, 
               r.trunc_time, 
               avg(r.price) as avg_price,
               max(r.price) as max_price,
               min(r.price) as min_price,
               sum(r.size) as volume, 
               max(r.tr_seqnum) as tr_seqnum
        from rw_trades r 
        group by r.date, r.sym_root, r.trunc_time
    )

    select a.*,
           (a.avg_price - lag(a.avg_price) over (
                        partition by a.date 
                        order by a.date, a.trunc_time
                        )) AS avg_price_diff,
            EXTRACT(EPOCH FROM (a.trunc_time - LAG(a.trunc_time) OVER (
                        PARTITION BY a.date 
                        ORDER BY a.date, a.trunc_time))) AS time_delta
    from aggregated a
    order by a.date, a.trunc_time
    """
pali_trades = db.raw_sql(pali_query)

In [None]:
# Turn the time-of-day offsets into `datetime.time` values by anchoring them at midnight.
# NOTE(review): this overwrites the column in place, so the cell is not safe to run twice.
midnight = pd.to_datetime('00:00:00')
pali_trades['trunc_time'] = (midnight + pali_trades['trunc_time']).dt.time

In [None]:
# Combine the date and time-of-day columns into one timestamp column.
date_text = pali_trades['date'].astype(str)
time_text = pali_trades['trunc_time'].astype(str)
pali_trades['datetime'] = pd.to_datetime(date_text + ' ' + time_text)

In [None]:
# Index the bars by timestamp so the time-series tools below can consume them.
# NOTE(review): rebinding `pali_trades` means this cell fails on a second run
# (the "datetime" column has already been moved into the index).
pali_trades = pali_trades.set_index(keys="datetime")

In [None]:
# All per-second bars for the halt day.
pali_trades.loc[pali_trades["date"] == dt.date(2023, 9, 6)]

In [None]:
# Halt-day bars at or before 09:38:34.
on_halt_day = pali_trades["date"] == dt.date(2023, 9, 6)
up_to_cutoff = pali_trades["trunc_time"] <= dt.time(9, 38, 34)
# NOTE(review): `[:-50]` DROPS the final 50 rows; if the 50 bars leading up to the
# cutoff were the intent, this should be `.tail(50)` instead — confirm.
pali_trades[on_halt_day & up_to_cutoff].iloc[:-50]

In [None]:
# Per-second traded volume on the halt day (rows with any missing value removed).
halt_day_bars = pali_trades.loc[pali_trades["date"] == dt.date(2023, 9, 6)]
halt_day_bars.dropna()["volume"].plot()

In [None]:
# Second-over-second change in average price on the halt day.
halt_day_diffs = pali_trades.loc[pali_trades["date"] == dt.date(2023, 9, 6)].dropna()["avg_price_diff"]
halt_day_diffs.plot()
plt.show()

In [None]:
# Same series across the whole query window.
window_diffs = pali_trades.dropna()["avg_price_diff"]
window_diffs.plot()
plt.show()

In [None]:
# Block-maxima (BM) extraction: take the largest price change in each one-hour block,
# then overlay the selected extremes on the series.
price_diffs = pali_trades.dropna().avg_price_diff
extremes = get_extremes(price_diffs, "BM", block_size="1H", errors="ignore")

bm_plot_args = dict(extremes_method="BM", extremes_type="high", block_size="1H")
plot_extremes(ts=price_diffs, extremes=extremes, **bm_plot_args)
plt.show()

In [None]:
# Inspect the block-maxima values returned by get_extremes.
extremes

In [None]:
from pyextremes import get_extremes, get_return_periods

# Empirical return periods for the one-hour block maxima (Weibull plotting position),
# expressed in units of one hour; show the five rarest.
bm_settings = dict(extremes_method="BM", extremes_type="high", block_size="1H")
price_diffs = pali_trades.dropna().avg_price_diff

return_periods = get_return_periods(
    ts=price_diffs,
    extremes=extremes,
    return_period_size="1H",
    plotting_position="weibull",
    **bm_settings,
)
return_periods.sort_values(by="return period", ascending=False).head()

From this, could we use the exceedance probability or the return period to "score" price movements?

## Fit an extreme value model and extract parameters

In [None]:
from pyextremes import EVA
# Wrap the price-change series in a pyextremes EVA object.
price_diffs = pali_trades.dropna()["avg_price_diff"]
model = EVA(price_diffs)

In [None]:
# Select extremes on the model with the block-maxima method and one-hour blocks
# (same settings as the standalone get_extremes call earlier in the notebook).
model.get_extremes(method="BM", block_size="1H", errors="ignore")

In [None]:
# Plot the extremes currently selected on the model.
model.plot_extremes()

In [None]:
# Fit the extreme value model to the selected extremes using pyextremes' default settings
# (no fitting method or distribution is specified here).
model.fit_model()

In [None]:
# Diagnostic plots for the fitted model.
# NOTE(review): alpha=0.95 is presumably the confidence-interval width — confirm against pyextremes docs.
model.plot_diagnostic(alpha=0.95)

## Assigning Extreme Value Scores 

In [None]:
# Connect to the local SQLite database holding the halt tables queried below.
halt_db_path = 'databases/halt_data.db'
conn = sqlite3.connect(halt_db_path)

In [None]:
# Build the set of (ticker, date) rows to score:
#   - start from before_after_results rows whose daily price/trade fields are all present;
#   - attach a halt time from nsdq_halts (halt_code 'LUDP') or, failing that,
#     nyse_halts (halt_code 'LULD Pause'), matched on ticker and date;
#   - keep only rows that actually have a halt time, ordered by date.
anom_query = """
        with merged as (
        	select bar.*, coalesce(nh.halt_time, ny.halt_time) as halt_time
        	from before_after_results bar
        	left join nsdq_halts nh
        	on (bar.ticker, bar.current_date) = (nh.symbol, nh.halt_date) and nh.halt_code = 'LUDP'
        	left join nyse_halts ny
        	on (bar.ticker, bar.current_date) = (ny.symbol, ny.halt_date) and ny.halt_code = 'LULD Pause'
        	where bar.open_pr is not null
        	and bar.close_pr is not null
        	and bar.dlyhigh is not null
        	and bar.dlylow is not null
        	and bar.dlynumtrd is not null
        ) 
        select ticker,
               m.current_date,
               before_date,
               after_date,
               open_pr,
               close_pr,
               dlyhigh,
               dlylow,
               dlynumtrd,
               dlyvol,
               before_avg_open,
               before_avg_close,
               before_avg_dlyhigh,
               before_avg_dlylow,
               before_avg_dlynumtrd,
               before_avg_dlyvol,
               after_avg_open,
               after_avg_close,
               after_avg_dlyhigh,
               after_avg_dlylow,
               after_avg_dlynumtrd,
               after_avg_dlyvol,
               vlty_estimate,
               vix_close,
               halt_time
        from merged m
        where halt_time is not null
        order by m.current_date
    """
anom_df = pd.read_sql(anom_query, conn)

Now, score extreme values from this dataset:

In [None]:
from ev_scoring import ExtremeValueScoring
# Instantiate the project's scoring helper with the WRDS account it should use.
scoring_user = 'audreymcmillion'
ev = ExtremeValueScoring(wrds_username=scoring_user)

In [None]:
# Keep one row per (ticker, date) and renumber the index from zero.
anom_keys = ["ticker", "current_date"]
anom_df = anom_df.drop_duplicates(subset=anom_keys, ignore_index=True)

In [None]:
# anom_df

In [None]:
# Load the (ticker, date) pairs scored by earlier runs, one row per pair.
# On a fresh checkout the results file does not exist yet (or may be empty); in that
# case nothing has been processed, so fall back to an empty frame that still carries
# the key columns the anti-join below selects.
try:
    already_processed = (
        pd.read_csv("processed_data.csv")
        .drop_duplicates(subset=["ticker", "current_date"])
        .reset_index(drop=True)
    )
except (FileNotFoundError, pd.errors.EmptyDataError):
    already_processed = pd.DataFrame(columns=["ticker", "current_date"])

In [None]:
# already_processed

In [None]:
# Anti-join: tag every anomaly row with whether its (ticker, date) key already
# appears in the processed set ...
join_keys = ['ticker', 'current_date']
merged_df = anom_df.merge(
    already_processed[join_keys],
    on=join_keys,
    how='left',
    indicator=True,
)

# ... then keep only the rows found on the left side alone, i.e. not yet processed.
not_yet_processed = merged_df['_merge'] == 'left_only'
filtered_df = merged_df[not_yet_processed].drop(columns=['_merge'])

In [None]:
# Rows still waiting to be scored.
filtered_df

In [None]:
# Score the unprocessed rows in 100-row chunks, appending each scored chunk to the
# results CSV so that progress survives an interrupted run.
chunk_size = 100
output_file = 'processed_data.csv'

for start in tqdm(range(0, len(filtered_df), chunk_size)):
    # Positional slice: the index of filtered_df is not contiguous after filtering.
    chunk = filtered_df.iloc[start:start + chunk_size]
    processed_chunk = ev.process_data(chunk)

    # Write the column header only when starting a new (missing or empty) file.
    # Appending headerless rows to a fresh file would make the next read_csv of this
    # file treat the first data row as the header.
    needs_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    processed_chunk.to_csv(output_file, mode='a', index=False, header=needs_header)

## Exploring Results

In this section, we'll explore our extreme value-scored results collected above.

In [None]:
# Join the complete before/after rows to their extreme-value results (only rows where
# both scores are present), rescale the scores by 100, and list them by ascending
# raw high score.
scored_query = """
    with before_aft as (
    	select bar.*
    	from before_after_results bar
    	where bar.open_pr is not null
    	and bar.close_pr is not null
    	and bar.dlyhigh is not null
    	and bar.dlylow is not null
    	and bar.dlynumtrd is not null
    ),
    
    ev_vals as (
    	select *
    	from ev_results er 
    	where high_score is not null
    	and low_score is not null
    )
    
    select b.*, e.high_extreme, e.low_extreme, (e.high_score * 100) as high_score, (e.low_score * 100) as low_score
    from before_aft b
    join ev_vals e
    on (b.ticker, b.current_date) = (e.ticker, e.current_date)
    order by e.high_score
"""
pd.read_sql(scored_query, conn)

In [None]:
from market_utils import MarketUtilities
# Instantiate the project's market helper with the WRDS account it should use.
market_user = 'audreymcmillion'
mkt_utils = MarketUtilities(wrds_username=market_user)

In [None]:
# Chart one scored symbol around its event date.
chart_symbol, chart_date = "MNPR", "2020-06-16"
mkt_utils.multiday_chart(chart_symbol, chart_date)

## Misc. Work

In [None]:
# Fetch the trade series for a single (symbol, date) window through the scoring helper.
# `get_daily_trades` is called on the `ev` instance, so `self` is bound automatically;
# passing it explicitly raised NameError because no `self` exists at notebook scope.
# NOTE(review): `current_dt`, `before_dt`, `after_dt` and `symbol` are not assigned
# anywhere above this cell — define them (e.g. from one row of `anom_df`) before running.
test_trades = ev.get_daily_trades(current_dt, before_dt, after_dt, symbol)

In [None]:
from pyextremes import EVA
# Fit a model to the LOW extremes (one-hour block minima) of the test series.
# NOTE(review): this rebinds `model`, replacing the PALI model fitted earlier.
test_diffs = test_trades.dropna()["avg_price_diff"]
model = EVA(test_diffs)
model.get_extremes(method="BM", extremes_type="low", block_size="1H", errors="ignore")
model.fit_model()

In [None]:
# Plot the extremes currently selected on the model.
model.plot_extremes()

In [None]:
# Evaluate the fitted model's CDF at a sample price change.
probe_value = -0.9
model.model.cdf(probe_value)

In [None]:
# Probability, under the fitted model, of exceeding a chosen threshold: 1 - CDF(value).
# The threshold is held in `value` so changing it changes the result; previously `value`
# was assigned but never used while 0.5205 was hard-coded in the call. The number
# computed here is unchanged.
value = 0.5205  # Replace with your threshold of interest
exceedance_prob = 1 - model.model.cdf(value)

In [None]:
# Show the exceedance probability computed in the previous cell.
exceedance_prob

In [None]:
# Date and halt time of the row labelled 0 in the anomaly frame.
current_dt = anom_df.loc[0, 'current_date']
halt_time = anom_df.loc[0, 'halt_time']

In [None]:
# Parse "<date> <halt time>" into a single timestamp.
halt_stamp_text = "{} {}".format(current_dt, halt_time)
datetime_obj = pd.to_datetime(halt_stamp_text)

In [None]:
# Date of the first row in the test series.
test_trades["date"].iloc[0]

In [None]:
# Price changes on 2013-05-10 only.
on_test_day = test_trades["date"] == dt.date(2013, 5, 10)
test_trades.loc[on_test_day, :].avg_price_diff.plot()

In [None]:
# Row with the largest price change on 2013-05-10: sort ascending, drop incomplete rows,
# then take the last one.
test_day_bars = test_trades.loc[test_trades["date"] == dt.date(2013, 5, 10)]
ranked_bars = test_day_bars.sort_values(by="avg_price_diff").dropna()
ranked_bars.iloc[-1]

In [None]:
# Bars on 2013-05-10 whose max price is exactly 37.98.
hits_price = test_trades["max_price"] == 37.98
on_test_day = test_trades["date"] == dt.date(2013, 5, 10)
test_trades[hits_price & on_test_day]

In [None]:
# Time of day jotted down during exploration, kept as a valid expression so the cell
# runs (a bare `15:21:08` is a SyntaxError).
# NOTE(review): presumably the timestamp of the row inspected in the previous cell — confirm.
dt.time(15, 21, 8)

In [None]:
# Price changes across the whole test window.
test_trades["avg_price_diff"].plot()

In [None]:
# Show the parsed halt timestamp.
datetime_obj