# Collecting Anomalies

In [1]:
import feedparser
import pprint
from datetime import datetime, timedelta, date
import pandas as pd
from tqdm import tqdm
import requests
import time
import numpy as np
from market_utils import MarketUtilities

# Display all columns
pd.set_option('display.max_columns', None)

# Display numbers in full (no scientific notation)
pd.set_option('display.float_format', '{:.4f}'.format)

In [2]:
url = "http://www.nasdaqtrader.com/rss.aspx?feed=tradehalts&haltdate={mmddyyyy}"
def get_halt_data(start_date: date, end_date: date, halt_codes: "list | None" = None) -> pd.DataFrame:
    """Collect trade-halt RSS entries for every calendar day in a date range.

    One request is made per day against the module-level ``url`` template
    (``haltdate`` is filled in as MMDDYYYY).

    Parameters
    ----------
    start_date, end_date : date
        Inclusive bounds of the range to download.
    halt_codes : list or None
        Reason codes to keep. ``None`` keeps every entry.

    Returns
    -------
    pd.DataFrame
        One row per feed entry with the halt/resumption dates and times, symbol,
        issue name, market center and halt code. If a request times out or fails,
        a message is printed and the rows collected *so far* are returned, so the
        result may cover only part of the requested range.
    """
    records = []

    # number of days in the range, both endpoints included
    num_days = (end_date - start_date).days + 1

    for i in tqdm(range(num_days), desc="Processing dates"):
        current_date = start_date + timedelta(days=i)

        # pause for two seconds on every 100th request (including the first one)
        if i % 100 == 0:
            time.sleep(2)

        str_date = current_date.strftime("%m%d%Y")
        formatted_url = url.format(mmddyyyy=str_date)

        try:
            response = requests.get(formatted_url, timeout=10)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
            feed = feedparser.parse(response.content)
        except requests.exceptions.Timeout:
            print('The request timed out, current_date =', str_date)
            return pd.DataFrame(records)
        except requests.exceptions.RequestException as e:
            print(f'An error occurred: {e}, current_date =', str_date)
            return pd.DataFrame(records)

        # a day without halts simply has no entries, so no explicit emptiness check is needed
        for entry in feed.entries:
            # keep only the requested halt codes (all of them when halt_codes is None)
            if halt_codes is not None and entry.ndaq_reasoncode not in halt_codes:
                continue

            records.append({
                "halt_date": entry.ndaq_haltdate,
                "resumption_date": entry.ndaq_resumptiondate,
                "symbol": entry.title,
                "issue_name": entry.ndaq_issuename,
                "mkt_center": entry.ndaq_mkt,
                "halt_code": entry.ndaq_reasoncode,
                "halt_time": entry.ndaq_halttime,
                "resumption_quote_time": entry.ndaq_resumptionquotetime,
                "resumption_trade_time": entry.ndaq_resumptiontradetime,
            })

    return pd.DataFrame(records)

In [3]:
run_collection = False

Dates already covered: ``date(2015, 1, 1)`` to ``date(2025, 1, 15)``

In [4]:
# Only download when `run_collection` is switched on.
# The downloaded rows are appended to the existing CSV without a header row, so
# running this cell twice for the same date range writes the same rows twice.
if run_collection:
    halt_df = get_halt_data(date(2024, 6, 19), date(2025, 1, 15), None)
    halt_df.to_csv("test_data/halt_records.csv", mode='a', index=False, header=False)

In [5]:
halt_df = pd.read_csv("test_data/halt_records.csv")

In [6]:
halt_df

Unnamed: 0,halt_date,resumption_date,symbol,issue_name,mkt_center,halt_code,halt_time,resumption_quote_time,resumption_trade_time
0,01/02/2015,01/02/2015,PWRD,Perfect World Co Ltd ADS,Q,T3,07:58:48,08:50:00,08:55:00
1,01/02/2015,01/05/2015,FFKY,First Financial Service Corp,Q,D,08:13:17,00:00:01,00:00:01
2,01/02/2015,01/02/2015,BARL,Morgan Stanley S&P 500 Crude Oil Linked ETNs d...,P,M,09:30:18,09:36:00,09:36:00
3,01/02/2015,01/02/2015,RGRC,RBS Rogers Enhanced Commodity ETN,P,M,09:30:38,09:36:00,09:36:00
4,01/02/2015,01/02/2015,SMACU,Sino Mercury Acquisition Unit,Q,LUDP,09:30:49,09:30:49,09:35:49
...,...,...,...,...,...,...,...,...,...
96801,01/15/2025,01/15/2025,XRTX,XORTX Therapeutics Inc. Cm,Q,LUDP,13:46:52,13:46:52,13:51:52
96802,01/15/2025,01/15/2025,PTLE,PTL LTD Ordinary Shares,Q,LUDP,14:02:40,14:02:40,14:07:40
96803,01/15/2025,01/15/2025,PTLE,PTL LTD Ordinary Shares,Q,LUDP,15:29:16,15:29:16,15:34:16
96804,01/15/2025,01/16/2025,MBIO,"Mustang Bio, Inc. CM",Q,T3,19:50:00,08:55:00,09:00:00


In [7]:
halt_df.halt_code.unique()

array(['T3', 'D', 'M', 'LUDP', 'T2', 'T12', 'T1', 'H11', 'T7'],
      dtype=object)

In [8]:
halt_df[halt_df.halt_code == 'H11']

Unnamed: 0,halt_date,resumption_date,symbol,issue_name,mkt_center,halt_code,halt_time,resumption_quote_time,resumption_trade_time
57911,03/04/2022,03/14/2022,RUSL,Direxion Daily Russia Bull 2X Shares,P,H11,03:54:19,00:00:01,00:00:01
91893,08/07/2024,08/19/2024,SCPX,"Scorpius Holdings, Inc.",A,H11,09:53:26,09:35:00,09:35:00
93288,10/04/2024,10/15/2024,UAVS,Ageagle Aerial Systems Inc,N,H11,13:51:59,09:00:00,09:00:00


In [9]:
# Number of halts per (halt_date, symbol, halt_code), largest counts first.
# size() + reset_index(name="count") names the count column explicitly, so the
# result does not depend on which label value_counts() produces in the installed
# pandas version.
halt_key_cols = ["halt_date", "symbol", "halt_code"]
nsdq_sorted = (
    halt_df
    .groupby(halt_key_cols)
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

In [10]:
# Rewrite halt_date from MM/DD/YYYY text to ISO YYYY-MM-DD text, overwriting the column.
# NOTE(review): not re-runnable as written — a second run would pass the ISO strings
# to a parser that is pinned to format='%m/%d/%Y'.
nsdq_sorted['halt_date'] = pd.to_datetime(nsdq_sorted['halt_date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')

In [11]:
nsdq_sorted.sort_values("halt_date")

Unnamed: 0,halt_date,symbol,halt_code,count
156,2012-01-03,RVSN,T3,1
154,2012-01-03,APKT,T3,1
155,2012-01-03,OSH,T7,2
329,2012-01-04,TRID,T3,1
328,2012-01-04,SATC,T3,1
...,...,...,...,...
2326,2025-01-15,MYNZ,LUDP,2
2323,2025-01-15,JXG,LUDP,4
2330,2025-01-15,SONM,LUDP,4
2322,2025-01-15,ALVR,T3,1


In [12]:
nsdq_sorted[nsdq_sorted["count"] >= 5].sort_values("count", ascending=False)[:50]

Unnamed: 0,halt_date,symbol,halt_code,count
11313,2020-03-12,AMCIU,LUDP,79
27419,2016-06-02,TRTLU,LUDP,70
28018,2016-06-06,TRTLU,LUDP,70
27572,2016-06-03,TRTLU,LUDP,70
42634,2015-09-15,SMACU,LUDP,69
27179,2015-06-01,FNTC,LUDP,69
42603,2015-09-15,AAPC,LUDP,69
31298,2016-06-27,OACQU,LUDP,69
29777,2015-06-17,FNTC,LUDP,67
721,2016-01-06,FNTC,LUDP,67


### NYSE Trade Halt Data

In [13]:
# Load the NYSE halt file and give its key columns the same names the NASDAQ frame uses.
nyse_column_names = {
    'Halt Date': 'halt_date',
    'Symbol': 'symbol',
    'Reason': 'halt_code',
}
nyse_halts = pd.read_csv("test_data/nyse_halts.csv")
nyse_halts = nyse_halts.rename(columns=nyse_column_names)

In [14]:
# Number of halts per (halt_date, symbol, halt_code), largest counts first.
# size() + reset_index(name="count") names the count column explicitly, so the
# result does not depend on which label value_counts() produces in the installed
# pandas version.
halt_key_cols = ["halt_date", "symbol", "halt_code"]
nyse_sorted = (
    nyse_halts
    .groupby(halt_key_cols)
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

In [15]:
nyse_sorted.halt_code.unique()

array(['LULD pause', 'News pending', 'Regulatory Concern',
       'Intraday Indicative Value Not Available', 'Corporate Action',
       'News Released', 'Merger Effective', 'Intraday Ind Val NA',
       'New Security Offering', 'News dissemination'], dtype=object)

In [16]:
nyse_sorted[nyse_sorted["count"] > 5].sort_values("count", ascending=False)[:50]

Unnamed: 0,halt_date,symbol,halt_code,count
1094,2020-02-10,FMCIU,LULD pause,60
7426,2020-06-08,HVT A,LULD pause,59
1874,2020-03-12,AMCIU,LULD pause,53
26942,2024-12-23,DRCT,LULD pause,49
17990,2023-03-13,WAL,LULD pause,46
20396,2023-10-11,TPST,LULD pause,46
16672,2022-12-09,AMAM,LULD pause,44
5199,2020-03-24,IMAC,LULD pause,42
26882,2024-12-19,NVNI,LULD pause,41
26864,2024-12-18,XCH,LULD pause,41


### Combining the Two Halt Datasets

In [17]:
# Stack the NASDAQ rows on top of the NYSE rows and renumber the index from zero.
stacked_df = pd.concat([nsdq_sorted, nyse_sorted], ignore_index=True)

In [18]:
# Keep one row per (halt_date, symbol) — the first one in stacked_df order — then sort by date.
# NOTE(review): halt_code is not part of the key, so a symbol halted under several
# codes on the same day keeps only one of them; confirm this is intended before
# filtering stacked_dedup on halt_code.
stacked_dedup = stacked_df.drop_duplicates(subset=['halt_date', 'symbol'], keep='first').sort_values('halt_date').reset_index(drop=True)

In [19]:
# stacked_dedup.to_csv('stacked_dedup_halts.csv')

In [20]:
stacked_dedup[stacked_dedup.symbol == 'EVLO']

Unnamed: 0,halt_date,symbol,halt_code,count
30261,2018-10-29,EVLO,LUDP,1
32264,2019-07-16,EVLO,LUDP,1
33216,2019-12-02,EVLO,LUDP,1
33749,2020-02-18,EVLO,LUDP,1
34172,2020-03-09,EVLO,LUDP,2
34541,2020-03-12,EVLO,LUDP,1
35145,2020-03-13,EVLO,LUDP,1
41702,2020-07-28,EVLO,LUDP,1
42935,2020-12-14,EVLO,LUDP,1
52687,2023-05-06,EVLO,LULD pause,1


**TODO:** Filter out irrelevant halt codes.

## Query WRDS database

There will be multiple steps to this. We will want 
1. The opening and closing price 15 days before and after the date of the LULD pause
2. The opening and closing price the day of the LULD pause
3. The trade price that triggered the LULD pause (last step)

We can find interday anomalies using 1 and 2. However, for intraday anomalies we will need to determine 3 and possibly some other metric for quantifying average price throughout the trade day.

Since interday anomalies will likely be easier to determine, we will develop the necessary queries for those first.

#### Preliminaries and Test Data

In [None]:
# imports
import pandas_market_calendars as mcal
import wrds
import yfinance as yf

In [None]:
import os

# The WRDS account name can be overridden with the WRDS_USERNAME environment
# variable so the notebook is not tied to one person's account; the previous
# hard-coded value remains the default.
mkt_utils = MarketUtilities(wrds_username=os.environ.get("WRDS_USERNAME", "audreymcmillion"))
db = mkt_utils.wrds_db

In [None]:
# test mkt_utils function: get_after_date
mkt_utils.get_after_date('2024-07-08', 15)

The below function fetches the industry from the ``yfinance`` API:

In [None]:
# test mkt_utils function: get_industry_data
mkt_utils.get_industry_data(["ENDP", "AAPL", "TSLA", "MSFT"])

The helper above is used later in the notebook to look up sector and industry details for tickers.

We will utilize the standard deviation of daily returns to quantify volatility. Additionally, we will add a VIX score to each row when collecting the data.

In [None]:
# query for before and after average prices
# Query template with four placeholders: {symbol_lst} (a quoted, comma-separated
# ticker list), {start_dt}, {current_dt} and {end_dt} (YYYY-MM-DD strings).
#
# For every ticker it returns the values on {current_dt} plus the averages over
# the days before it (from {start_dt}) and after it (to {end_dt}).
#
# vlty_estimate is the standard deviation of `daily_return` computed PER TICKER
# (the `volatility` CTE); it is no longer pooled across every ticker in the request.
# NOTE(review): `daily_return` is an open-minus-close price difference, not a
# percentage return — confirm that this is the intended volatility input.
#
# The second FULL OUTER JOIN keys on coalesce(c.ticker, b.ticker) so that a ticker
# with no row on {current_dt} still gets its before and after averages on one row.
before_after_query = """
    WITH price_details AS (
        SELECT  b.ticker,
                a.permno,
                a.dlycaldt,
                a.dlyopen,
                coalesce(a.dlyclose, a.dlyprc) AS dlyclose,
                a.dlyhigh,
                a.dlylow,
                a.dlynumtrd,
                a.dlyvol,
                (a.dlyopen - a.dlyclose) as daily_return
        FROM crsp.dsf_v2 a
        JOIN crsp.dsenames AS b
        ON a.permno = b.permno
        AND b.ticker IN ({symbol_lst})
        AND a.dlycaldt BETWEEN date('{start_dt}') AND date('{end_dt}')
        AND date('{current_dt}') BETWEEN b.namedt AND b.nameendt
    ),

    days_before AS (
        SELECT ticker,
               MIN(dlycaldt) AS before_date,
               AVG(dlyopen) AS before_avg_open,
               AVG(dlyclose) AS before_avg_close,
               AVG(dlyhigh) AS before_avg_dlyhigh,
               AVG(dlylow) AS before_avg_dlylow,
               AVG(dlynumtrd) AS before_avg_dlynumtrd,
               AVG(dlyvol) AS before_avg_dlyvol
        FROM price_details
        WHERE dlycaldt BETWEEN date('{start_dt}') AND (date('{current_dt}') - INTERVAL '1 day')
        GROUP BY ticker
    ),

    current_days AS (
        SELECT ticker,
               MIN(dlycaldt) AS current_date,
               AVG(dlyopen) AS open_pr,
               AVG(dlyclose) AS close_pr,
               AVG(dlyhigh) AS dlyhigh,
               AVG(dlylow) AS dlylow,
               AVG(dlynumtrd) AS dlynumtrd,
               AVG(dlyvol) AS dlyvol
        FROM price_details
        WHERE dlycaldt = date('{current_dt}')
        GROUP BY ticker
    ),

    days_after AS (
        SELECT ticker,
               MAX(dlycaldt) AS after_date,
               AVG(dlyopen) AS after_avg_open,
               AVG(dlyclose) AS after_avg_close,
               AVG(dlyhigh) AS after_avg_dlyhigh,
               AVG(dlylow) AS after_avg_dlylow,
               AVG(dlynumtrd) AS after_avg_dlynumtrd,
               AVG(dlyvol) AS after_avg_dlyvol
        FROM price_details
        WHERE dlycaldt BETWEEN (date('{current_dt}') + INTERVAL '1 day') AND date('{end_dt}')
        GROUP BY ticker
    ),

    volatility AS (
        SELECT ticker,
               stddev(daily_return) AS vlty_estimate
        FROM price_details
        GROUP BY ticker
    )

    SELECT coalesce(c.ticker, b.ticker, a.ticker) AS ticker,
           COALESCE(c.current_date, TO_DATE('{current_dt}', 'YYYY-MM-DD')) AS current_date,
           COALESCE(b.before_date, TO_DATE('{start_dt}', 'YYYY-MM-DD')) AS before_date,
           COALESCE(a.after_date, TO_DATE('{end_dt}', 'YYYY-MM-DD')) AS after_date,
           c.open_pr,
           c.close_pr,
           c.dlyhigh,
           c.dlylow,
           c.dlynumtrd,
           c.dlyvol,
           b.before_avg_open,
           b.before_avg_close,
           b.before_avg_dlyhigh,
           b.before_avg_dlylow,
           b.before_avg_dlynumtrd,
           b.before_avg_dlyvol,
           a.after_avg_open,
           a.after_avg_close,
           a.after_avg_dlyhigh,
           a.after_avg_dlylow,
           a.after_avg_dlynumtrd,
           a.after_avg_dlyvol,
           v.vlty_estimate
    FROM current_days c
    FULL OUTER JOIN days_before b
    ON (c.ticker = b.ticker)
    FULL OUTER JOIN days_after a
    ON (coalesce(c.ticker, b.ticker) = a.ticker)
    LEFT JOIN volatility v
    ON (coalesce(c.ticker, b.ticker, a.ticker) = v.ticker)
"""

In [None]:
import yfinance as yf 

# Results are only collected for windows that end before this date.
max_date = datetime.strptime('2023-12-29', '%Y-%m-%d')
# NOTE(review): the "Read in Results" cell loads "test_data/before_after_results.csv";
# confirm whether this output is meant to be written to the same location.
output_file = "before_after_results.csv"
vix_ticker = yf.Ticker("^VIX") 
luld_codes = ['LUDP', 'LULD pause']  # NASDAQ and NYSE labels for a LULD pause

i = 0  # number of result batches written so far; the first batch also writes the header
progress = tqdm(stacked_dedup.halt_date.unique())
for current_date in progress:
    # show the date being processed on the progress bar instead of printing one line per date
    progress.set_postfix_str(current_date)

    # skip halt dates on or after the cutoff
    if datetime.strptime(current_date, '%Y-%m-%d') >= max_date:
        continue

    # LULD pauses recorded for this date
    subset = stacked_dedup[(stacked_dedup.halt_date == current_date)
                           & stacked_dedup.halt_code.isin(luld_codes)]

    # missing symbols cannot be queried, so they are dropped before building the SQL list
    subset_symbols = subset.symbol.dropna().astype(str).to_list()

    if len(subset_symbols) == 0:
        continue

    # quoted, comma-separated list for the SQL IN (...) clause;
    # embedded single quotes are doubled so they cannot break the statement
    stringed_symbols = ", ".join("'" + symbol.replace("'", "''") + "'" for symbol in subset_symbols)

    # window boundaries around the halt date
    before_date = mkt_utils.get_before_date(current_date, 15)
    after_date = mkt_utils.get_after_date(current_date, 15)

    # skip windows that run up to or past the cutoff
    if datetime.strptime(after_date, '%Y-%m-%d') >= max_date:
        continue
    
    # query using extracted values
    before_after_result = db.raw_sql(before_after_query.format(start_dt = before_date,
                                     current_dt = current_date,
                                     end_dt = after_date,
                                     symbol_lst = stringed_symbols))

    # VIX close for the halt date. The helper lives on mkt_utils; calling it as a
    # bare name raised NameError, which the previous bare `except:` swallowed, so
    # vix_close was always None.
    try:
        vix_history = vix_ticker.history(start=current_date, end=mkt_utils.get_after_date(current_date, 1))
        before_after_result["vix_close"] = vix_history['Close'].iloc[-1]
    except Exception as exc:
        # best effort: keep collecting, but say why the value is missing
        tqdm.write(f"VIX close unavailable for {current_date}: {exc}")
        before_after_result["vix_close"] = None

    # first batch creates the file with a header; later batches append without one
    first_batch = (i == 0)
    before_after_result.to_csv(output_file,
                               mode='w' if first_batch else 'a',
                               index=False,
                               header=first_batch)
        
    i += 1

#### Read in Results

In [21]:
results = pd.read_csv("test_data/before_after_results.csv")

In [22]:
results.columns

Index(['ticker', 'current_date', 'before_date', 'after_date', 'open_pr',
       'close_pr', 'dlyhigh', 'dlylow', 'dlynumtrd', 'dlyvol',
       'before_avg_open', 'before_avg_close', 'before_avg_dlyhigh',
       'before_avg_dlylow', 'before_avg_dlynumtrd', 'before_avg_dlyvol',
       'after_avg_open', 'after_avg_close', 'after_avg_dlyhigh',
       'after_avg_dlylow', 'after_avg_dlynumtrd', 'after_avg_dlyvol',
       'vlty_estimate', 'gind', 'gsector', 'gsubind', 'idbflag', 'vix_close'],
      dtype='object')

In [23]:
results[results.ticker == 'EVLO']

Unnamed: 0,ticker,current_date,before_date,after_date,open_pr,close_pr,dlyhigh,dlylow,dlynumtrd,dlyvol,before_avg_open,before_avg_close,before_avg_dlyhigh,before_avg_dlylow,before_avg_dlynumtrd,before_avg_dlyvol,after_avg_open,after_avg_close,after_avg_dlyhigh,after_avg_dlylow,after_avg_dlynumtrd,after_avg_dlyvol,vlty_estimate,gind,gsector,gsubind,idbflag,vix_close
9776,EVLO,2018-10-29,2018-10-08,2018-11-19,8.67,8.98,9.385,8.15,255.0,18208.0,10.1629,9.8667,10.6433,9.6445,310.0667,31995.1333,8.8387,8.7593,9.3101,8.321,322.4667,29458.6,0.5677,,,,,24.7
10888,EVLO,2019-07-16,2019-06-24,2019-08-06,7.0101,7.65,8.145,7.0101,385.0,19960.0,8.5002,8.28,8.8947,8.015,379.2,62220.0667,7.0854,6.9653,7.4412,6.7265,368.2667,43658.8,0.3929,352010.0,35.0,35201010.0,D,12.86
11297,EVLO,2019-12-02,2019-11-08,2019-12-23,4.47,4.465,5.61,4.3,796.0,99103.0,5.8584,5.684,5.9517,5.5267,377.8,46643.7333,4.3033,4.2597,4.4871,4.1244,553.4,78670.0667,2.0706,,,,,14.91
11522,EVLO,2020-02-18,2020-01-27,2020-03-10,4.92,5.53,6.4539,4.886,1135.0,165339.0,5.5153,5.4307,5.8492,5.1305,799.0667,132481.4667,5.3353,5.308,5.6609,5.0206,684.6667,79203.4,0.5597,,,,,14.83
11715,EVLO,2020-03-09,2020-02-14,2020-03-30,4.7,4.75,4.84,3.77,665.0,93271.0,5.314,5.382,5.7838,5.0824,754.0,101482.6,4.0133,3.9527,4.3089,3.5757,612.9333,69524.8,1.2397,,,,,54.46
11931,EVLO,2020-03-12,2020-02-20,2020-04-02,3.98,4.32,4.32,3.01,523.0,100086.0,5.27,5.1647,5.5231,4.9026,645.2,77424.4667,3.8653,3.793,4.1729,3.516,585.5333,59758.9333,1.6224,,,,,75.47
12103,EVLO,2020-03-13,2020-02-21,2020-04-03,4.47,3.53,4.47,3.31,755.0,114795.0,5.0987,5.0533,5.3691,4.7093,579.2,74686.4,3.8217,3.795,4.1349,3.5153,556.3333,55155.5333,1.5842,,,,,57.83
14747,EVLO,2020-07-28,2020-07-07,2020-08-18,4.24,4.05,5.44,4.04,4964.0,964289.0,4.248,4.2233,4.4038,4.1247,636.9333,73116.1333,4.4133,4.4563,4.6246,4.2684,1007.4667,150671.4,1.0311,,,,,25.44
15276,EVLO,2020-12-14,2020-11-20,2021-01-06,7.564,9.41,11.67,7.56,28963.0,4145391.0,5.2547,5.416,5.6494,5.0593,1364.7333,202237.2667,10.9769,11.2233,11.7711,10.39,3868.4667,603263.7333,0.5621,,,,,24.72
19616,EVLO,2023-05-06,2023-04-14,2023-05-30,,,,,,,0.1529,0.1484,0.1711,0.131,13953.3125,13221303.0625,,,,,,,0.0573,,,,,16.98


In [24]:
import yfinance as yf 

# List of stock symbols
symbols = ["AAPL", "GOOGL", "TSLA", "MSFT"]

# Read each ticker's .info once and reuse it for both keys, instead of building
# a yf.Ticker and reading .info twice per symbol.
ticker_info = [yf.Ticker(symbol).info for symbol in symbols]
industries = [info.get("industryKey") for info in ticker_info]
sectors = [info.get("sectorKey") for info in ticker_info]
industries_dict = {'symbol': symbols, 'industry_key': industries, 'sector_key': sectors}

In [25]:
industries_dict

{'symbol': ['AAPL', 'GOOGL', 'TSLA', 'MSFT'],
 'industry_key': ['consumer-electronics',
  'internet-content-information',
  'auto-manufacturers',
  'software-infrastructure'],
 'sector_key': ['technology',
  'communication-services',
  'consumer-cyclical',
  'technology']}

In [26]:
pd.DataFrame.from_dict(industries_dict)

Unnamed: 0,symbol,industry_key,sector_key
0,AAPL,consumer-electronics,technology
1,GOOGL,internet-content-information,communication-services
2,TSLA,auto-manufacturers,consumer-cyclical
3,MSFT,software-infrastructure,technology


In [27]:
# Keep only rows where every identifier, every current/before/after measurement
# and the volatility estimate are present.
measure_suffixes = ['open', 'close', 'dlyhigh', 'dlylow', 'dlynumtrd', 'dlyvol']
required_cols = (
    ['ticker', 'current_date', 'before_date', 'after_date']
    + ['open_pr', 'close_pr', 'dlyhigh', 'dlylow', 'dlynumtrd', 'dlyvol']
    + [f'before_avg_{suffix}' for suffix in measure_suffixes]
    + [f'after_avg_{suffix}' for suffix in measure_suffixes]
    + ['vlty_estimate']
)
nona_results = results.dropna(subset=required_cols)
nona_results = nona_results.reset_index(drop=True)

In [28]:
# Decile bucket (0-9) of each row's VIX close and volatility estimate.
for source_col, decile_col in (("vix_close", "vix_quantile"), ("vlty_estimate", "vlty_quantile")):
    nona_results[decile_col] = pd.qcut(nona_results[source_col], 10, labels=False)

**Volatility estimates**

In [29]:
nona_results["vlty_estimate"].describe()

count   16754.0000
mean        1.8680
std         3.9878
min         0.0232
25%         0.5387
50%         0.9733
75%         1.6288
max        68.5050
Name: vlty_estimate, dtype: float64

In [30]:
nona_results["vix_close"].describe()

count   16754.0000
mean       25.4404
std        17.2686
min         9.1400
25%        14.7100
50%        19.3700
75%        27.1800
max        82.6900
Name: vix_close, dtype: float64

#### Explore Extreme Values

In [31]:
# view columns
nona_results.columns

Index(['ticker', 'current_date', 'before_date', 'after_date', 'open_pr',
       'close_pr', 'dlyhigh', 'dlylow', 'dlynumtrd', 'dlyvol',
       'before_avg_open', 'before_avg_close', 'before_avg_dlyhigh',
       'before_avg_dlylow', 'before_avg_dlynumtrd', 'before_avg_dlyvol',
       'after_avg_open', 'after_avg_close', 'after_avg_dlyhigh',
       'after_avg_dlylow', 'after_avg_dlynumtrd', 'after_avg_dlyvol',
       'vlty_estimate', 'gind', 'gsector', 'gsubind', 'idbflag', 'vix_close',
       'vix_quantile', 'vlty_quantile'],
      dtype='object')

**Get highs and lows**

In [32]:
# Both views share the identifier columns and the close/volume/volatility
# columns; they differ only in which daily extreme (high or low) they carry.
id_cols = ["ticker", "current_date", "before_date", "after_date"]
context_cols = ["close_pr", "before_avg_close", "after_avg_close",
                "dlyvol", "before_avg_dlyvol", "after_avg_dlyvol",
                "vlty_estimate", "vix_close", "vix_quantile", "vlty_quantile"]
high_cols = ["dlyhigh", "before_avg_dlyhigh", "after_avg_dlyhigh"]
low_cols = ["dlylow", "before_avg_dlylow", "after_avg_dlylow"]

daily_highs = nona_results[id_cols + high_cols + context_cols]
daily_lows = nona_results[id_cols + low_cols + context_cols]

**Evaluate daily highs**

In [33]:
# Rows whose daily high is above both the before-window and after-window average high.
above_before_avg = daily_highs.dlyhigh.gt(daily_highs.before_avg_dlyhigh)
above_after_avg = daily_highs.dlyhigh.gt(daily_highs.after_avg_dlyhigh)
d_highs_fltd = daily_highs[above_before_avg & above_after_avg].reset_index(drop=True)

In [34]:
# Percent distance of the daily high from each window average (relative to the
# daily high), and the absolute percent change between the two window averages
# (relative to the before-window average).
day_high = d_highs_fltd.dlyhigh
high_before_avg = d_highs_fltd.before_avg_dlyhigh
high_after_avg = d_highs_fltd.after_avg_dlyhigh

d_highs_fltd["before_pct_diff"] = (day_high - high_before_avg) / day_high * 100
d_highs_fltd["after_pct_diff"] = (day_high - high_after_avg) / day_high * 100
d_highs_fltd["bef_aft_diff"] = ((high_before_avg - high_after_avg) / high_before_avg * 100).abs()

In [35]:
d_highs_fltd[(d_highs_fltd.bef_aft_diff < 5) & (d_highs_fltd.before_pct_diff > 3)].sort_values(["vlty_quantile", "bef_aft_diff"], ascending=True)

Unnamed: 0,ticker,current_date,before_date,after_date,dlyhigh,before_avg_dlyhigh,after_avg_dlyhigh,close_pr,before_avg_close,after_avg_close,dlyvol,before_avg_dlyvol,after_avg_dlyvol,vlty_estimate,vix_close,vix_quantile,vlty_quantile,before_pct_diff,after_pct_diff,bef_aft_diff
72,QKLS,2013-10-18,2013-09-27,2013-11-08,5.7790,4.9043,4.9038,5.3500,4.8029,4.7450,2500.0000,6242.5333,3294.6667,0.2903,13.0400,1,0,15.1363,15.1437,0.0087
619,HTWO,2015-01-30,2015-01-08,2015-02-23,12.4700,10.4615,10.4650,10.5000,9.9333,10.3982,649300.0000,628.0000,183378.8182,0.2971,20.9700,5,0,16.1070,16.0786,0.0339
609,CORI,2015-01-26,2015-01-02,2015-02-17,7.3500,6.8278,6.8238,7.0200,6.6559,6.7250,53928.0000,14164.7333,6382.0667,0.3005,15.5200,2,0,7.1053,7.1587,0.0574
1144,AIQ,2015-12-07,2015-11-13,2015-12-29,9.4500,8.9761,8.9830,8.6300,8.6387,8.8157,16209.0000,86268.4000,26222.2667,0.2350,15.8400,3,0,5.0143,4.9418,0.0764
431,GDEF,2014-09-25,2014-09-04,2014-10-16,10.7900,10.3342,10.3245,10.5000,10.3223,10.3120,4896.0000,6489.7333,1964.8000,0.2685,15.6400,3,0,4.2246,4.3146,0.0940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3401,GWGH,2019-11-05,2019-10-15,2019-11-26,10.4900,9.9007,10.3903,10.4900,9.8210,10.2090,4487.0000,5004.5333,5566.8000,5.9930,13.1000,1,9,5.6182,0.9502,4.9458
2913,PLL,2019-01-07,2018-12-13,2019-01-29,8.2100,7.2845,7.6461,7.3864,6.9114,7.4553,1113.0000,3545.2667,4116.7333,10.5243,21.4000,5,9,11.2728,6.8685,4.9640
4443,MDJH,2020-06-17,2020-05-27,2020-07-09,3.4342,3.1729,3.0153,3.0000,2.8204,2.8460,12157.0000,31425.4000,9730.8000,3.6204,33.4700,8,9,7.6088,12.1989,4.9681
4977,LIXT,2020-12-17,2020-11-25,2021-01-11,4.7900,3.8135,3.6234,4.1500,3.5947,3.4000,1969411.0000,79643.0667,258223.1333,4.0959,21.9300,6,9,20.3857,24.3553,4.9861


**View how we might determine cutoffs based on the before/after percent differences.**

In [36]:
d_highs_fltd[(d_highs_fltd.bef_aft_diff < 10)][["vlty_quantile", 
                                               "before_pct_diff", 
                                               "after_pct_diff"]].groupby("vlty_quantile").describe()

Unnamed: 0_level_0,before_pct_diff,before_pct_diff,before_pct_diff,before_pct_diff,before_pct_diff,before_pct_diff,before_pct_diff,before_pct_diff,after_pct_diff,after_pct_diff,after_pct_diff,after_pct_diff,after_pct_diff,after_pct_diff,after_pct_diff,after_pct_diff
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
vlty_quantile,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,470.0,10.434,12.3018,0.02,2.0839,6.0359,14.2067,72.0623,470.0,10.5476,11.9419,0.0374,2.6332,6.9371,13.8704,69.443
1,411.0,11.588,13.3537,0.0211,2.1619,7.2549,15.6954,68.4064,411.0,11.5131,13.2363,0.009,2.6801,6.8056,14.7738,67.8771
2,420.0,12.3946,13.9741,0.023,2.7281,7.6672,17.6144,77.7032,420.0,12.4571,13.8212,0.0271,2.4194,8.0513,17.6224,79.3335
3,406.0,13.1822,13.889,0.045,3.1815,8.8129,17.7663,76.6053,406.0,13.3947,13.7242,0.0246,3.5893,8.7914,17.5215,76.7811
4,341.0,14.4702,13.8528,0.0627,3.9451,10.4249,20.6305,68.9182,341.0,14.6093,13.7315,0.0877,4.605,10.6865,18.9182,66.6942
5,333.0,17.1813,16.0917,0.0149,5.0482,11.4631,25.0487,77.2078,333.0,17.1968,15.9675,0.0348,4.9044,12.9912,24.4757,77.764
6,235.0,18.6804,16.2635,0.016,6.664,12.9691,29.0813,74.8667,235.0,18.6695,16.2314,0.0442,5.7058,13.1587,29.8729,73.1714
7,248.0,19.7407,15.6568,0.1649,6.5811,16.8755,28.3532,74.5055,248.0,19.8863,15.4579,0.2237,8.1492,16.51,27.8691,73.5697
8,335.0,20.4766,15.5672,0.0238,8.208,16.9756,29.6136,75.1898,335.0,20.2349,15.613,0.2753,7.8134,16.9184,30.2005,76.9497
9,281.0,19.8803,16.2593,0.0439,7.6088,15.6954,27.9658,78.7665,281.0,19.4978,16.2934,0.1593,6.8195,14.2127,27.8914,79.7649


**Visualize individual results using the below function:**

In [37]:
mkt_utils.multiday_chart('GDEF', '2014-09-25')

NameError: name 'mkt_utils' is not defined

In [None]:
mkt_utils.multiday_chart('VVPR', '2020-10-09')

**Evaluate daily lows**

In [None]:
# Rows whose daily low is below both the before-window and after-window average low.
below_before_avg = daily_lows.dlylow.lt(daily_lows.before_avg_dlylow)
below_after_avg = daily_lows.dlylow.lt(daily_lows.after_avg_dlylow)
d_lows_fltd = daily_lows[below_before_avg & below_after_avg].reset_index(drop=True)

In [None]:
d_lows_fltd

In [None]:
# Percent distance of the daily low from each window average (relative to the
# daily low), and the absolute percent change between the two window averages
# (relative to the before-window average).
day_low = d_lows_fltd.dlylow
low_before_avg = d_lows_fltd.before_avg_dlylow
low_after_avg = d_lows_fltd.after_avg_dlylow

d_lows_fltd["before_pct_diff"] = (day_low - low_before_avg) / day_low * 100
d_lows_fltd["after_pct_diff"] = (day_low - low_after_avg) / day_low * 100
d_lows_fltd["bef_aft_diff"] = ((low_before_avg - low_after_avg) / low_before_avg * 100).abs()

In [None]:
d_lows_fltd

In [None]:
d_lows_fltd[(d_lows_fltd.before_pct_diff < -40) \
             & (d_lows_fltd.after_pct_diff < -40) \
             & (d_lows_fltd.bef_aft_diff < 10)].sort_values(["bef_aft_diff"], ascending=True)

In [None]:
mkt_utils.multiday_chart('BRAG', '2022-06-08', high=False)

#### Determine Extreme Values

First, generate cutoff/threshold dictionaries for the high and low dataframes.

In [None]:
# function to get cutoff dictionaries based on volatility quantiles
def cutoffs_dict(hl_df: pd.DataFrame, pct_diff_quantile: float, bef_aft_quantile: float, max_bef_aft_diff = 10):
    """Build per-volatility-bucket thresholds from a high or low dataframe.

    Only rows with ``bef_aft_diff`` below ``max_bef_aft_diff`` contribute.
    For every value of ``vlty_quantile`` present in ``hl_df`` the result holds:

    * ``pct_diff_cutoff`` — mean of the ``pct_diff_quantile`` quantiles of
      ``|before_pct_diff|`` and ``|after_pct_diff|``;
    * ``bef_aft_cutoff`` — the ``bef_aft_quantile`` quantile of ``bef_aft_diff``.

    Returns a dict keyed by the ``vlty_quantile`` value, in ascending key order.
    """
    stable_rows = hl_df[hl_df.bef_aft_diff < max_bef_aft_diff]

    def _bucket_cutoffs(bucket: pd.DataFrame) -> dict:
        # thresholds for the rows of a single volatility bucket
        pct_levels = [
            bucket[column].abs().quantile(pct_diff_quantile)
            for column in ("before_pct_diff", "after_pct_diff")
        ]
        return {
            "pct_diff_cutoff": np.mean(pct_levels),
            "bef_aft_cutoff": bucket["bef_aft_diff"].quantile(bef_aft_quantile),
        }

    return {
        bucket_id: _bucket_cutoffs(stable_rows[stable_rows.vlty_quantile == bucket_id])
        for bucket_id in sorted(hl_df.vlty_quantile.unique())
    }

In [None]:
high_cutoff_dict = cutoffs_dict(d_highs_fltd, pct_diff_quantile = 0.75, bef_aft_quantile = 0.3)
low_cutoff_dict = cutoffs_dict(d_lows_fltd, pct_diff_quantile = 0.75, bef_aft_quantile = 0.3)

In [None]:
high_cutoff_dict

Now, use the below function to flag suspected anomalies.

In [None]:
def flag_anomaly(row, high: bool) -> bool:
    """Decide whether one row of a high/low dataframe looks like an anomaly.

    The thresholds for the row's ``vlty_quantile`` are read from the global
    ``high_cutoff_dict`` (when ``high`` is true) or ``low_cutoff_dict``.
    A row is flagged when the absolute before and after percent differences
    both reach ``pct_diff_cutoff`` and ``bef_aft_diff`` does not exceed
    ``bef_aft_cutoff``.
    """
    cutoffs = (high_cutoff_dict if high else low_cutoff_dict)[row.vlty_quantile]
    pct_floor = cutoffs["pct_diff_cutoff"]

    # the day's extreme must stand out from both surrounding windows ...
    far_from_before = np.abs(row.before_pct_diff) >= pct_floor
    far_from_after = np.abs(row.after_pct_diff) >= pct_floor
    # ... while the two windows themselves stay close to each other
    windows_agree = row.bef_aft_diff <= cutoffs["bef_aft_cutoff"]

    return bool(far_from_before and far_from_after and windows_agree)

In [None]:
# Flag each frame from its OWN rows. The low-side flags were previously computed
# by applying to d_highs_fltd, which evaluated the high rows against the low
# cutoffs and then aligned that result onto d_lows_fltd by index.
d_highs_fltd['anomaly_fl'] = d_highs_fltd.apply(lambda row: flag_anomaly(row, high=True), axis=1)
d_lows_fltd['anomaly_fl'] = d_lows_fltd.apply(lambda row: flag_anomaly(row, high=False), axis=1)

In [None]:
d_highs_fltd.to_csv("high_anomalies.csv", index=False)

In [None]:
mkt_utils.multiday_chart('PALI', '2023-09-06', high = True, diff_num = 15)

In [None]:
d_lows_fltd.to_csv("low_anomalies.csv", index=False)

## Testing Open Source Market Data

**Finance database**: https://github.com/JerBouma/FinanceDatabase
- "The FinanceDatabase serves the role of providing anyone with any type of financial product categorization entirely for free. To be able to achieve this, the FinanceDatabase relies on involvement from the community to add, edit and remove tickers over time. This is made easy enough that anyone, even with a lack of coding experience can contribute because of the usage of CSV files that can be manually edited with ease."

In [None]:
import financedatabase as fd

# Initialize the Equities database
equities = fd.Equities()

In [None]:
# Search the equities database for United States "Mega Cap" companies in the
# "Diversified Telecommunication Services" industry.
telecomunication_services = equities.search(
    industry="Diversified Telecommunication Services",
    country="United States",
    market_cap="Mega Cap",
    exclude_exchanges=True)

In [None]:
# Full equities table with the index moved into regular columns.
all_symbols = equities.select(exclude_exchanges=True).reset_index()
all_symbols['symbol'] = all_symbols['symbol'].fillna('NA')  # Replace missing symbols with the string 'NA' (presumably the ticker "NA" was read as a missing value — TODO confirm)

In [None]:
all_symbols[all_symbols.symbol == 'AAPL']

In [None]:
all_symbols.industry.unique().tolist()

In [None]:
# all_symbols.drop_duplicates().reset_index(drop=True).to_csv("symbol_details.csv", index=False)

Can we fill in the blanks for some of these using Yahoo's API?

In [None]:
# mkt_utils = MarketUtilities(wrds_username='audreymcmillion')
mkt_utils.get_industry_data(["FTEL", 'IXHL', 'AIRE'])

Get tickers with null sectors from database.

In [None]:
# Tickers in before_after_results that have no sector in symbol_details: either no
# symbol_details row matches the ticker, or the matching row has a NULL sector.
# The previous CTE selected bar.* and sorted it by date, but the outer
# SELECT DISTINCT only reads ticker and never used that order, so the query is
# flattened into one join. Ordering by ticker makes the result reproducible.
# NOTE(review): assumes before_after_results has no `sector` column of its own — confirm.
null_sector_tickers = pd.read_sql("""
select distinct bar.ticker
from before_after_results bar
left join symbol_details sd
    on bar.ticker = sd.symbol
where sd.sector is null
order by bar.ticker
""", mkt_utils.sqlite_conn)

In [None]:
# Reduce the query result to a plain list of ticker strings.
# The same name is rebound from DataFrame to list, so guard on the type: without
# the guard, running this cell a second time raises AttributeError because a
# list has no `.ticker` attribute.
if isinstance(null_sector_tickers, pd.DataFrame):
    null_sector_tickers = null_sector_tickers.ticker.to_list()

In [None]:
# Number of tickers that still need a sector lookup.
len(null_sector_tickers)

In [None]:
# Look up industry data for every ticker that has no sector in the database.
# The result is used below as a DataFrame with `symbol`, `sector` and `industry` columns.
# NOTE(review): presumably one Yahoo Finance request per ticker, so this may be slow — confirm.
luldnull_sectors = mkt_utils.get_industry_data(null_sector_tickers)

In [None]:
# Keep only the tickers for which the lookup returned an industry; renumber rows from 0.
symbol_nonnull = luldnull_sectors.loc[luldnull_sectors['industry'].notna()].reset_index(drop=True)

In [None]:
# Alphabetical list of the industry labels returned by the lookup.
sorted(symbol_nonnull['industry'].unique().tolist())

In [None]:
# Alphabetical list of the sector labels returned by the lookup.
sorted(symbol_nonnull['sector'].unique().tolist())

In [None]:
# Distinct (sector, industry_group, industry, market_cap) combinations already present
# in symbol_details, skipping rows where sector, industry_group or industry is NULL.
# Used below to list the labels the database uses, for comparison with the lookup's labels.
all_sector = pd.read_sql("""
	select distinct sd.sector, sd.industry_group, sd.industry, sd.market_cap
	from symbol_details sd  
    where sd.sector is not null
    and sd.industry_group is not null
    and sd.industry is not null
    order by sd.sector
""", mkt_utils.sqlite_conn)            

In [None]:
# Alphabetical list of the industry labels used in symbol_details.
sorted(all_sector['industry'].unique().tolist())

In [None]:
# Alphabetical list of the sector labels used in symbol_details.
sorted(all_sector['sector'].unique().tolist())

In [None]:
# Industry label returned by the lookup (key) -> industry label to store (value).
# Keys are listed alphabetically so a label is easy to find or add.
# NOTE(review): values are presumably the industry names already used in symbol_details — confirm.
industry_mapping = {
    "Advertising Agencies": "Media",
    "Aerospace & Defense": "Aerospace & Defense",
    "Asset Management": "Diversified Financial Services",
    "Auto & Truck Dealerships": "Distributors",
    "Auto Manufacturers": "Automobiles",
    "Auto Parts": "Auto Components",
    "Banks - Regional": "Banks",
    "Beverages - Non-Alcoholic": "Beverages",
    "Biotechnology": "Biotechnology",
    "Building Materials": "Building Products",
    "Capital Markets": "Capital Markets",
    "Communication Equipment": "Communications Equipment",
    "Computer Hardware": "Technology Hardware, Storage & Peripherals",
    "Conglomerates": "Industrial Conglomerates",
    "Consulting Services": "Professional Services",
    "Consumer Electronics": "Household Durables",
    "Credit Services": "Consumer Finance",
    "Diagnostics & Research": "Health Care Equipment & Supplies",
    "Drug Manufacturers - General": "Pharmaceuticals",
    "Drug Manufacturers - Specialty & Generic": "Pharmaceuticals",
    "Electrical Equipment & Parts": "Electrical Equipment",
    "Electronic Components": "Electronic Equipment, Instruments & Components",
    "Electronic Gaming & Multimedia": "Entertainment",
    "Entertainment": "Entertainment",
    "Farm Products": "Food Products",
    "Gambling": "Hotels, Restaurants & Leisure",
    "Grocery Stores": "Food & Staples Retailing",
    "Health Information Services": "Health Care Technology",
    "Information Technology Services": "IT Services",
    "Integrated Freight & Logistics": "Air Freight & Logistics",
    "Internet Content & Information": "Interactive Media & Services",
    "Internet Retail": "Internet & Direct Marketing Retail",
    "Leisure": "Hotels, Restaurants & Leisure",
    "Lodging": "Hotels, Restaurants & Leisure",
    "Lumber & Wood Production": "Paper & Forest Products",
    "Marine Shipping": "Marine",
    "Medical Care Facilities": "Health Care Providers & Services",
    "Medical Devices": "Health Care Equipment & Supplies",
    "Oil & Gas Integrated": "Oil, Gas & Consumable Fuels",
    "Other Industrial Metals & Mining": "Metals & Mining",
    "Packaged Foods": "Food Products",
    "Personal Services": "Diversified Consumer Services",
    "Railroads": "Road & Rail",
    "Real Estate - Development": "Real Estate Management & Development",
    "Real Estate Services": "Real Estate Management & Development",
    "Scientific & Technical Instruments": "Electronic Equipment, Instruments & Components",
    "Security & Protection Services": "Commercial Services & Supplies",
    # Shell companies get no industry: the value is deliberately None.
    "Shell Companies": None,
    "Software - Application": "Software",
    "Software - Infrastructure": "Software",
    "Solar": "Independent Power and Renewable Electricity Producers",
    "Specialty Business Services": "Professional Services",
    "Specialty Industrial Machinery": "Machinery",
    "Specialty Retail": "Specialty Retail",
    "Travel Services": "Hotels, Restaurants & Leisure",
    "Utilities - Regulated Water": "Water Utilities",
    "Utilities - Renewable": "Independent Power and Renewable Electricity Producers",
    "Waste Management": "Commercial Services & Supplies",
}

# Sector label returned by the lookup (key) -> sector label to store (value).
# NOTE(review): values are presumably the sector names already used in symbol_details — confirm.
sector_mapping = {
    # Sectors whose name is the same on both sides.
    **{
        sector_name: sector_name
        for sector_name in (
            "Communication Services",
            "Energy",
            "Industrials",
            "Real Estate",
            "Utilities",
        )
    },
    # Sectors that are renamed.
    "Basic Materials": "Materials",
    "Consumer Cyclical": "Consumer Discretionary",
    "Consumer Defensive": "Consumer Staples",
    "Financial Services": "Financials",
    "Healthcare": "Health Care",
    "Technology": "Information Technology",
}

In [None]:
# Translate the lookup's industry/sector labels with industry_mapping / sector_mapping.
# The frame is rebuilt from luldnull_sectors instead of being overwritten in place:
# mapping an already-mapped column turns every label that is not also a key into NaN,
# so the in-place version gave different results when this cell was run twice.
# Labels absent from a mapping (and 'Shell Companies', mapped to None) become missing values.
symbol_nonnull = (
    luldnull_sectors[luldnull_sectors['industry'].notna()]
    .reset_index(drop=True)
    .assign(
        industry=lambda frame: frame['industry'].map(industry_mapping),
        sector=lambda frame: frame['sector'].map(sector_mapping),
    )
)

In [None]:
# symbol_nonnull.to_csv("yfinance_sectors.csv", index=False)

Examine the "still nulls"...

In [None]:
# Symbols for which the lookup returned no industry.
still_nulls = luldnull_sectors.loc[luldnull_sectors['industry'].isna(), 'symbol'].unique()

In [None]:
# Display the symbols that still have no industry after the lookup.
still_nulls

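Another option: resolve the remaining tickers' sector and industry through the SEC API mapping endpoint (`sec_api.MappingApi`). **TODO**: finish this when I get home tonight.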

In [None]:
from sec_api import MappingApi
from dotenv import load_dotenv
import os

# Load the .env file so SEC_API_KEY is available in the environment
load_dotenv()

# Fail early with a clear message instead of passing api_key=None to the client
sec_api_key = os.getenv("SEC_API_KEY")
if not sec_api_key:
    raise RuntimeError("SEC_API_KEY is not set; add it to the .env file or the environment")

# Initialize the MappingApi with your API key
mapping_api = MappingApi(api_key=sec_api_key)

# List of tickers
tickers = ['BRSH', 'HRYU', 'PXDT', 'BSGA','REVE']

# Column-oriented results: one list per output column, all kept the same length
ticker_info = {'symbol':[], 'sector':[], 'industry':[]}

for ticker in tickers:
    # Resolve the ticker first and append afterwards, so that an exception raised by
    # the lookup cannot leave 'symbol' one entry longer than the other two columns.
    company_details = mapping_api.resolve('ticker', ticker)

    # Use the first matching result; an empty result yields None for both fields
    details = company_details[0] if company_details else {}

    ticker_info['symbol'].append(ticker)
    ticker_info['sector'].append(details.get('sector'))
    ticker_info['industry'].append(details.get('industry'))

In [None]:
# Show the lookup results as a table with symbol / sector / industry columns.
pd.DataFrame(ticker_info)

**NOTES** (ticker, industry, sector):  
WBUY	Internet & Direct Marketing Retail	Consumer Discretionary  
NTBL	Biotechnology	Health Care  
DMK	Pharmaceuticals	Health Care  