In [1]:
import os
import numpy as np
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from cleanco.clean import custom_basename
from cleanco.clean import prepare_default_terms
from kagglehub import dataset_download
# Load variables from a local .env file into the process environment, then
# build the OpenAI client from the OPENAI_API_KEY entry (None if it is unset).
load_dotenv()
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

  from .autonotebook import tqdm as notebook_tqdm


## Load NASDAQ traded stock symbols

In [None]:
# Symbol metadata is read from the dataset directory one level above the notebook.
# `datadir` is reused by later cells for every other file of that dataset.
fn = 'symbols_valid_meta.csv'
datadir = os.path.join('..', 'stock-market-dataset')
nasdaq = pd.read_csv(os.path.join(datadir, fn))
nasdaq

Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
0,Y,A,"Agilent Technologies, Inc. Common Stock",N,,N,100.0,N,,A,A,N
1,Y,AA,Alcoa Corporation Common Stock,N,,N,100.0,N,,AA,AA,N
2,Y,AAA,Alternative Access First Priority CLO Bond ETF,P,,Y,100.0,N,,AAA,AAA,N
3,Y,AAAU,Goldman Sachs Physical Gold ETF Shares,Z,,Y,100.0,N,,AAAU,AAAU,N
4,Y,AACG,ATA Creativity Global - American Depositary Sh...,Q,S,N,100.0,N,N,,AACG,N
...,...,...,...,...,...,...,...,...,...,...,...,...
8654,Y,ZVOL,Volatility Premium Plus ETF,Z,,Y,100.0,N,,ZVOL,ZVOL,N
8655,Y,ZVRA,"Zevra Therapeutics, Inc. - Common Stock",Q,Q,N,100.0,N,N,,ZVRA,N
8656,Y,ZWS,Zurn Elkay Water Solutions Corporation Common ...,N,,N,100.0,N,,ZWS,ZWS,N
8657,Y,ZYME,Zymeworks Inc. - Common Stock,Q,Q,N,100.0,N,N,,ZYME,N


In [3]:
# Keep only rows whose ETF flag is 'N'; copy so later cells never touch `nasdaq`.
nasdaq_stocks = nasdaq[nasdaq['ETF'].eq('N')].copy()
nasdaq_stocks

Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
0,Y,A,"Agilent Technologies, Inc. Common Stock",N,,N,100.0,N,,A,A,N
1,Y,AA,Alcoa Corporation Common Stock,N,,N,100.0,N,,AA,AA,N
4,Y,AACG,ATA Creativity Global - American Depositary Sh...,Q,S,N,100.0,N,N,,AACG,N
5,Y,AACT,Ares Acquisition Corporation II Class A Ordina...,N,,N,100.0,N,,AACT,AACT,N
7,Y,AAL,"American Airlines Group, Inc. - Common Stock",Q,Q,N,100.0,N,N,,AAL,N
...,...,...,...,...,...,...,...,...,...,...,...,...
8653,Y,ZVIA,Zevia PBC Class A Common Stock,N,,N,100.0,N,,ZVIA,ZVIA,N
8655,Y,ZVRA,"Zevra Therapeutics, Inc. - Common Stock",Q,Q,N,100.0,N,N,,ZVRA,N
8656,Y,ZWS,Zurn Elkay Water Solutions Corporation Common ...,N,,N,100.0,N,,ZWS,ZWS,N
8657,Y,ZYME,Zymeworks Inc. - Common Stock,Q,Q,N,100.0,N,N,,ZYME,N


In [4]:
# Restrict the non-ETF listings to the symbols present in the screener file
# (presumably a healthcare-sector export, judging by its file name).
screener = pd.read_csv(os.path.join(datadir, 'nasdaq_screener_healthcare.csv'))
in_screener = nasdaq_stocks['Symbol'].isin(screener['Symbol'])
nasdaq_health = nasdaq_stocks[in_screener]
nasdaq_health

Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
23,Y,ABBV,AbbVie Inc. Common Stock,N,,N,100.0,N,,ABBV,ABBV,N
25,Y,ABCL,AbCellera Biologics Inc. - Common Shares,Q,Q,N,100.0,N,N,,ABCL,N
27,Y,ABEO,Abeona Therapeutics Inc. - Common Stock,Q,S,N,100.0,N,N,,ABEO,N
39,Y,ABOS,"Acumen Pharmaceuticals, Inc. - Common Stock",Q,Q,N,100.0,N,N,,ABOS,N
41,Y,ABP,"Abpro Holdings, Inc - Common Stock",Q,G,N,100.0,N,D,,ABP,N
...,...,...,...,...,...,...,...,...,...,...,...,...
8650,Y,ZTS,Zoetis Inc. Class A Common Stock,N,,N,100.0,N,,ZTS,ZTS,N
8652,Y,ZURA,Zura Bio Limited - Class A Ordinary shares,Q,S,N,100.0,N,N,,ZURA,N
8655,Y,ZVRA,"Zevra Therapeutics, Inc. - Common Stock",Q,Q,N,100.0,N,N,,ZVRA,N
8657,Y,ZYME,Zymeworks Inc. - Common Stock,Q,Q,N,100.0,N,N,,ZYME,N


## Load FDA product recalls

In [5]:
# Name of the date column that every later cell keys on.
datecol = 'center_classification_date'

# Fetch the Kaggle dataset (dataset_download returns its local directory),
# read the recalls table and parse the classification date into datetimes.
pth = dataset_download('mexwell/fda-product-recalls')
recalls = pd.read_csv(os.path.join(pth, 'fda_product_recalls.csv'))
recalls[datecol] = pd.to_datetime(recalls[datecol])
recalls

Unnamed: 0,fei_number,recalling_firm_name,product_type,product_classification,status,distribution_pattern,recalling_firm_city,recalling_firm_state,recalling_firm_country,center_classification_date,reason_for_recall,product_description,event_id,event_classification,product_id,center,recall_details
0,3.002602e+09,Lamb Weston Sales,Food/Cosmetics,Class I,Ongoing,"Distributed in CA, IA, IL, KS, LA MO, MS, NM, ...",Kennewick,Washington,United States,2023-04-21,Undeclared Wheat in foodservice item Hashbrown...,"G5300 Lamb's Supreme Hash Brown Patties, Froze...",92014,Class I,199418,CFSAN,https://www.accessdata.fda.gov/scripts/ires/?P...
1,3.012438e+09,Fresh Express Incorpated,Food/Cosmetics,Class I,Ongoing,Product was shipped to the following states: F...,Windermere,Florida,United States,2023-04-21,The firm was notified by one of their customer...,Fresh EXPRESS Chopped Kit Caesar Romaine Lettu...,92068,Class I,199573,CFSAN,https://www.accessdata.fda.gov/scripts/ires/?P...
2,3.012438e+09,Fresh Express Incorpated,Food/Cosmetics,Class I,Ongoing,Product was shipped to the following states: F...,Windermere,Florida,United States,2023-04-21,The firm was notified by one of their customer...,Fresh Express Chopped Kit Chipotle Cheddar TOT...,92068,Class I,199574,CFSAN,https://www.accessdata.fda.gov/scripts/ires/?P...
3,3.012438e+09,Fresh Express Incorpated,Food/Cosmetics,Class I,Ongoing,Product was shipped to the following states: F...,Windermere,Florida,United States,2023-04-21,The firm was notified by one of their customer...,PREMIUM MAKOTO HONEY GINGER SALAD KIT TOTAL NE...,92068,Class I,199575,CFSAN,https://www.accessdata.fda.gov/scripts/ires/?P...
4,1.000222e+09,"Blood Bank Computer Systems, Inc",Biologics,Class II,Terminated,"GA, DE, TX, MO, PA, CA, FL, KY, IA, MI, IL, an...",Auburn,Washington,United States,2023-04-21,Blood Bank Computer Systems has discovered in ...,"ABO Wheels, Version 1.1.0",91219,Class II,197268,CBER,https://www.accessdata.fda.gov/scripts/ires/?P...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83150,3.004404e+09,Panera Bread LLC,Food/Cosmetics,Class II,Terminated,Nationwide,Saint Louis,Missouri,United States,2012-06-08,Product ingredient statement reversed for Red...,"Panera ,HAZELNUT CREAM CHEESE SPREAD Reduced F...",61831,Class II,109200,CFSAN,https://www.accessdata.fda.gov/scripts/ires/?P...
83151,3.004162e+09,"DSM Nutritional Products, Inc.",Food/Cosmetics,Class II,Terminated,"NJ, WI, IL",Parsippany,New Jersey,United States,2012-06-08,Flavor is contaminated with Salmonella,GB Select Roast Meat Type Flavor Net Wt. 55 lb...,61936,Class II,109523,CFSAN,https://www.accessdata.fda.gov/scripts/ires/?P...
83152,3.002727e+09,Best West Foods,Food/Cosmetics,Class II,Terminated,NV only.,Las Vegas,Nevada,United States,2012-06-08,Soy was not included in the ingredient stateme...,"Florentine Lasagna Rolls;\r\nPerishable, keep ...",61968,Class II,109609,CFSAN,https://www.accessdata.fda.gov/scripts/ires/?P...
83153,3.002727e+09,Best West Foods,Food/Cosmetics,Class II,Terminated,NV only.,Las Vegas,Nevada,United States,2012-06-08,Soy was not included in the ingredient stateme...,"Cheese Lasagna Rolls;\r\nPerishable, keep froz...",61968,Class II,109610,CFSAN,https://www.accessdata.fda.gov/scripts/ires/?P...


## Select class I medical recalls

In [6]:
# Two row masks: medical product types, and events classified as Class I.
msk_med = recalls['product_type'].isin(['Drugs', 'Devices', 'Biologics'])
msk_c1 = recalls['event_classification'].eq('Class I')

# Rows satisfying both; copied because the firm-name column is overwritten later.
recalls_medc1 = recalls[msk_med & msk_c1].copy()
recalls_medc1

Unnamed: 0,fei_number,recalling_firm_name,product_type,product_classification,status,distribution_pattern,recalling_firm_city,recalling_firm_state,recalling_firm_country,center_classification_date,reason_for_recall,product_description,event_id,event_classification,product_id,center,recall_details
26,2.936999e+06,Covidien,Devices,Class I,Ongoing,Worldwide - US Nationwide distribution includi...,Boulder,Colorado,United States,2023-04-19,"A manufacturing error, resulted in a less than...",Shiley Adult Flexible Tracheostomy Tube with T...,91943,Class I,199257,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
157,3.014732e+09,"Fresenius Kabi USA, LLC",Devices,Class I,Ongoing,"Domestic: CA, CO, NJ, WI, & UT. No foreign dis...",North Andover,Massachusetts,United States,2023-04-11,Fluid ingress that can cause a loss of electri...,"Ivenix Infusion System (IIS), Large Volume Pum...",91783,Class I,198841,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
185,3.002803e+09,"Abbott Diabetes Care, Inc.",Devices,Class I,Ongoing,U.S. Nationwide.,Alameda,California,United States,2023-04-06,Lithium-ion batteries in glucose monitoring sy...,"FreeStyle Libre Reader, REF: 71525-01, 71701-0...",91756,Class I,198772,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
186,3.002803e+09,"Abbott Diabetes Care, Inc.",Devices,Class I,Ongoing,U.S. Nationwide.,Alameda,California,United States,2023-04-06,Lithium-ion batteries in glucose monitoring sy...,"FreeStyle Libre Reader, REF: 71936-01, 71937-0...",91756,Class I,198773,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
187,3.002803e+09,"Abbott Diabetes Care, Inc.",Devices,Class I,Ongoing,U.S. Nationwide.,Alameda,California,United States,2023-04-06,Lithium-ion batteries in glucose monitoring sy...,"FreeStyle Libre Reader, REF: 71951-01, 71952-0...",91756,Class I,198813,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82896,1.039215e+06,Nidek Medical Products Inc,Devices,Class I,Terminated,Worldwide Distribution-USA (nationwide) and th...,Birmingham,Alabama,United States,2012-06-19,Capacitor failure may result in a fire hazard ...,"NIDEK Medical MARK5 NUVO / M5C5, 115 V ~60Hz -...",61843,Class I,109227,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
82908,1.641527e+06,"Physicians Total Care, Inc",Drugs,Class I,Terminated,FL,Tulsa,Oklahoma,United States,2012-06-18,Labeling: Label mix-up; Bottles labeled to con...,"Morphine Sulfate Extended Release tablet, 30 m...",61233,Class I,107624,CDER,https://www.accessdata.fda.gov/scripts/ires/?P...
82909,1.641527e+06,"Physicians Total Care, Inc",Drugs,Class I,Terminated,FL,Tulsa,Oklahoma,United States,2012-06-18,Labeling: Label mix-up; Bottles labeled to con...,"Morphine Sulfate Immediate Release tablet, 30 ...",61233,Class I,107625,CDER,https://www.accessdata.fda.gov/scripts/ires/?P...
82998,2.126677e+06,"GE Healthcare, LLC",Devices,Class I,Terminated,Nationwide Distribution - including the states...,Waukesha,Wisconsin,United States,2012-06-17,GE Healthcare has recently become aware of a p...,"GE Healthcare, Aestiva/5 7900 SmartVent, anest...",61639,Class I,108604,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...


## Clean recalling firms' names

In [7]:
# cleanco's term table does not depend on the name being cleaned, so build it
# once instead of rebuilding it for every row of every pass.
legal_terms = prepare_default_terms()

def basename(x):
    """Return firm name `x` with cleanco's legal-entity terms stripped (suffix and middle)."""
    return custom_basename(x, legal_terms, middle = True)

recalling_firm_name = recalls_medc1.recalling_firm_name.copy()
for _pass in range(2):
    # Strip legal terms three times in a row, then normalise the text; the whole
    # sequence runs twice so terms exposed by the normalisation are stripped too.
    for _ in range(3):
        recalling_firm_name = recalling_firm_name.map(basename)
    recalling_firm_name = (recalling_firm_name
                           .str.lower()
                           # slashes and dots become spaces
                           .str.replace(r"\/|\.", ' ', regex = True)
                           # drop everything except word characters, whitespace and hyphens
                           .str.replace(r'[^\w\s\-]', '', regex = True)
                           # collapse runs of whitespace
                           .str.replace(r'\s+', ' ', regex = True)
                           # glue adjacent single-character tokens together ("u s a" -> "usa")
                           .str.replace(r'(?<=\b\w) (?=\w\b)', '', regex = True)
                           .str.strip())

# Overwrite the raw names in place; downstream grouping keys on the cleaned form.
recalls_medc1['recalling_firm_name'] = recalling_firm_name.copy()
recalls_medc1

Unnamed: 0,fei_number,recalling_firm_name,product_type,product_classification,status,distribution_pattern,recalling_firm_city,recalling_firm_state,recalling_firm_country,center_classification_date,reason_for_recall,product_description,event_id,event_classification,product_id,center,recall_details
26,2.936999e+06,covidien,Devices,Class I,Ongoing,Worldwide - US Nationwide distribution includi...,Boulder,Colorado,United States,2023-04-19,"A manufacturing error, resulted in a less than...",Shiley Adult Flexible Tracheostomy Tube with T...,91943,Class I,199257,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
157,3.014732e+09,fresenius kabi usa,Devices,Class I,Ongoing,"Domestic: CA, CO, NJ, WI, & UT. No foreign dis...",North Andover,Massachusetts,United States,2023-04-11,Fluid ingress that can cause a loss of electri...,"Ivenix Infusion System (IIS), Large Volume Pum...",91783,Class I,198841,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
185,3.002803e+09,abbott diabetes care,Devices,Class I,Ongoing,U.S. Nationwide.,Alameda,California,United States,2023-04-06,Lithium-ion batteries in glucose monitoring sy...,"FreeStyle Libre Reader, REF: 71525-01, 71701-0...",91756,Class I,198772,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
186,3.002803e+09,abbott diabetes care,Devices,Class I,Ongoing,U.S. Nationwide.,Alameda,California,United States,2023-04-06,Lithium-ion batteries in glucose monitoring sy...,"FreeStyle Libre Reader, REF: 71936-01, 71937-0...",91756,Class I,198773,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
187,3.002803e+09,abbott diabetes care,Devices,Class I,Ongoing,U.S. Nationwide.,Alameda,California,United States,2023-04-06,Lithium-ion batteries in glucose monitoring sy...,"FreeStyle Libre Reader, REF: 71951-01, 71952-0...",91756,Class I,198813,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82896,1.039215e+06,nidek medical products,Devices,Class I,Terminated,Worldwide Distribution-USA (nationwide) and th...,Birmingham,Alabama,United States,2012-06-19,Capacitor failure may result in a fire hazard ...,"NIDEK Medical MARK5 NUVO / M5C5, 115 V ~60Hz -...",61843,Class I,109227,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...
82908,1.641527e+06,physicians total care,Drugs,Class I,Terminated,FL,Tulsa,Oklahoma,United States,2012-06-18,Labeling: Label mix-up; Bottles labeled to con...,"Morphine Sulfate Extended Release tablet, 30 m...",61233,Class I,107624,CDER,https://www.accessdata.fda.gov/scripts/ires/?P...
82909,1.641527e+06,physicians total care,Drugs,Class I,Terminated,FL,Tulsa,Oklahoma,United States,2012-06-18,Labeling: Label mix-up; Bottles labeled to con...,"Morphine Sulfate Immediate Release tablet, 30 ...",61233,Class I,107625,CDER,https://www.accessdata.fda.gov/scripts/ires/?P...
82998,2.126677e+06,ge healthcare,Devices,Class I,Terminated,Nationwide Distribution - including the states...,Waukesha,Wisconsin,United States,2012-06-17,GE Healthcare has recently become aware of a p...,"GE Healthcare, Aestiva/5 7900 SmartVent, anest...",61639,Class I,108604,CDRH,https://www.accessdata.fda.gov/scripts/ires/?P...


## Keep recalling firms with multiple recalls

In [8]:
# Deduplicate events per firm within a 1-week period: group the selected columns
# by cleaned firm name and weekly bin, keeping the last row of each group.
# NOTE(review): the weekly pd.Grouper key is returned as the bin label, so after
# this step the date column holds that label rather than the original
# classification date — confirm this is intended.
colnames = ['event_id', 'recalling_firm_name', datecol,
            'reason_for_recall', 'recall_details']
weekly = pd.Grouper(key = datecol, freq = 'W')
recalls_events = (recalls_medc1[colnames]
                  .groupby(['recalling_firm_name', weekly])
                  .last()
                  .reset_index())
recalls_events

Unnamed: 0,recalling_firm_name,center_classification_date,event_id,reason_for_recall,recall_details
0,4e brands north america,2020-10-11,86022,CGMP Deviations: recalled because it was manuf...,https://www.accessdata.fda.gov/scripts/ires/?P...
1,a-s medication solutions,2021-05-02,87436,Labeling: Label Mix-up; The bottle of over-the...,https://www.accessdata.fda.gov/scripts/ires/?P...
2,aaa cosmetica,2020-09-13,86037,CGMP Deviations,https://www.accessdata.fda.gov/scripts/ires/?P...
3,abbott,2018-05-20,79893,Reports of outflow graft twist occlusions. Pa...,https://www.accessdata.fda.gov/scripts/ires/?P...
4,abbott diabetes care,2013-05-12,64876,"AT rare, extremely high glucose levels (1024 m...",https://www.accessdata.fda.gov/scripts/ires/?P...
...,...,...,...,...,...
1101,zimmer biomet,2017-02-12,75971,Higher than anticipated rate of fracturing due...,https://www.accessdata.fda.gov/scripts/ires/?P...
1102,zimmer biomet,2018-11-04,81127,Lack of adequate validation and controls to en...,https://www.accessdata.fda.gov/scripts/ires/?P...
1103,zions rx formulations services dba rx formuati...,2014-04-13,67080,Lack of Assurance of Sterility: The firm expan...,https://www.accessdata.fda.gov/scripts/ires/?P...
1104,zydus pharmaceuticals usa,2013-08-04,65394,Failed Tablet/Capsule Specifications: A produc...,https://www.accessdata.fda.gov/scripts/ires/?P...


In [9]:
# Keep firms with multiple events: flag every row whose firm name occurs more
# than once (keep = False marks all occurrences, not just the repeats).
msk = recalls_events['recalling_firm_name'].duplicated(keep = False)
recalls_events = recalls_events[msk]
recalls_events

Unnamed: 0,recalling_firm_name,center_classification_date,event_id,reason_for_recall,recall_details
4,abbott diabetes care,2013-05-12,64876,"AT rare, extremely high glucose levels (1024 m...",https://www.accessdata.fda.gov/scripts/ires/?P...
5,abbott diabetes care,2013-12-22,66886,Certain lots of FreeStyle and FreeStyle Lite B...,https://www.accessdata.fda.gov/scripts/ires/?P...
6,abbott diabetes care,2014-03-23,67472,Abbott Diabetes Care has identified through in...,https://www.accessdata.fda.gov/scripts/ires/?P...
7,abbott diabetes care,2023-04-09,91756,Lithium-ion batteries in glucose monitoring sy...,https://www.accessdata.fda.gov/scripts/ires/?P...
9,abbott vascular,2016-03-13,73243,Abbott Vascular has recently received reports ...,https://www.accessdata.fda.gov/scripts/ires/?P...
...,...,...,...,...,...
1100,zimmer,2015-06-14,71272,Zimmer is initiating a voluntary recall of 64 ...,https://www.accessdata.fda.gov/scripts/ires/?P...
1101,zimmer biomet,2017-02-12,75971,Higher than anticipated rate of fracturing due...,https://www.accessdata.fda.gov/scripts/ires/?P...
1102,zimmer biomet,2018-11-04,81127,Lack of adequate validation and controls to en...,https://www.accessdata.fda.gov/scripts/ires/?P...
1104,zydus pharmaceuticals usa,2013-08-04,65394,Failed Tablet/Capsule Specifications: A produc...,https://www.accessdata.fda.gov/scripts/ires/?P...


In [10]:
# One entry per distinct firm name (first occurrence, original row label kept).
recalls_firms = recalls_events['recalling_firm_name'].drop_duplicates()
recalls_firms

4                   abbott diabetes care
9                        abbott vascular
19                     accord healthcare
22                acella pharmaceuticals
30                advance pharmaceutical
                      ...               
1068                      vyaire medical
1071    vyaire medical carefusion viasys
1099                              zimmer
1101                       zimmer biomet
1104           zydus pharmaceuticals usa
Name: recalling_firm_name, Length: 156, dtype: object

## Infer recalling firms' stock symbols

In [None]:
fn = 'recalling_firm_symbols.csv'

def get_symbol(name, model = 'gpt-4o', ref = nasdaq_health.Symbol.tolist()):
    """Ask an OpenAI model for a firm's ticker symbol and validate the answer.

    Parameters
    ----------
    name : str
        Firm name inserted into the prompt (before and after the instruction).
    model : str
        Model identifier passed to the Responses API.
    ref : list of str
        Accepted symbols. The default is built once, when this `def` executes,
        from `nasdaq_health.Symbol`.

    Returns
    -------
    str or None
        The stripped model answer if it is an element of `ref`, otherwise None.
        The name and the outcome are also printed as a progress log.
    """
    prompt = (f"{name}\nWITH SKEPTICISM RETURN THE CORPORATION'S STOCK TICKER SYMBOL "
              f"FOR THE BIOPHARMA OR HEALTHCARE COMPANY AND NOTHING ELSE\n{name}")
    response = client.responses.create(
        model = model,
        input = prompt,
        temperature = 0
        )
    # `output_text` aggregates the text parts of the whole response, so this does
    # not break when the first output item is not a message or carries no content
    # (which indexing `.output[0].content` assumed).
    symbol = (response.output_text or '').strip()
    # Anything outside the reference list (including an empty answer) is rejected.
    symbol = symbol if symbol in ref else None
    print(name, f'({symbol})')
    return symbol

symbols_path = os.path.join(datadir, fn)

# The lookup below calls a paid, non-deterministic API once per firm. Reuse the
# answers saved by an earlier run whenever they cover every firm in
# `recalls_firms`; delete the file to force a fresh lookup.
cache_ok = False
if os.path.exists(symbols_path):
    cached = pd.read_csv(symbols_path)
    cache_ok = bool(recalls_firms.isin(cached.recalling_firm_name).all())

if not cache_ok:
    symbols = [get_symbol(name) for name in recalls_firms]
    # Written only after every lookup has returned, so an interrupted run cannot
    # leave a partial file that would later pass for a complete cache.
    # to_csv handles quoting and writes a rejected symbol (None) as an empty field.
    (pd.DataFrame({'recalling_firm_name': recalls_firms.values, 'Symbol': symbols})
     .to_csv(symbols_path, index = False))

# Read the table back, drop firms without an accepted symbol, index by firm name.
recalls_symbols = (pd.read_csv(symbols_path)
                   .dropna(subset = ['Symbol'])
                   .set_index('recalling_firm_name'))
recalls_symbols

Unnamed: 0_level_0,Symbol
recalling_firm_name,Unnamed: 1_level_1
abbott diabetes care,ABT
abbott vascular,ABT
alcon research,ALC
avanos medical,AVNS
bard peripheral vascular,BDX
baxter englewood,BAX
baxter healthcare,BAX
becton dickinson,BDX
boston scientific,BSX
bristol-myers squibb,BMY


In [12]:
# Attach each firm's symbol by matching the firm-name column against the index of
# `recalls_symbols`; the inner join drops events whose firm has no symbol there.
# Running this cell a second time without re-running the earlier cells fails,
# because `recalls_events` would already carry a Symbol column.
recalls_events = recalls_events.join(
    recalls_symbols, on = 'recalling_firm_name', how = 'inner')
recalls_events

Unnamed: 0,recalling_firm_name,center_classification_date,event_id,reason_for_recall,recall_details,Symbol
4,abbott diabetes care,2013-05-12,64876,"AT rare, extremely high glucose levels (1024 m...",https://www.accessdata.fda.gov/scripts/ires/?P...,ABT
5,abbott diabetes care,2013-12-22,66886,Certain lots of FreeStyle and FreeStyle Lite B...,https://www.accessdata.fda.gov/scripts/ires/?P...,ABT
6,abbott diabetes care,2014-03-23,67472,Abbott Diabetes Care has identified through in...,https://www.accessdata.fda.gov/scripts/ires/?P...,ABT
7,abbott diabetes care,2023-04-09,91756,Lithium-ion batteries in glucose monitoring sy...,https://www.accessdata.fda.gov/scripts/ires/?P...,ABT
9,abbott vascular,2016-03-13,73243,Abbott Vascular has recently received reports ...,https://www.accessdata.fda.gov/scripts/ires/?P...,ABT
...,...,...,...,...,...,...
1003,teva pharmaceuticals usa,2022-06-12,90182,Failed Dissolution Specifications- Low Out-Of-...,https://www.accessdata.fda.gov/scripts/ires/?P...,TEVA
1099,zimmer,2012-12-23,63683,Zimmer Spine has received reports of the PEEK ...,https://www.accessdata.fda.gov/scripts/ires/?P...,ZBH
1100,zimmer,2015-06-14,71272,Zimmer is initiating a voluntary recall of 64 ...,https://www.accessdata.fda.gov/scripts/ires/?P...,ZBH
1101,zimmer biomet,2017-02-12,75971,Higher than anticipated rate of fracturing due...,https://www.accessdata.fda.gov/scripts/ires/?P...,ZBH


## Compile market-adjusted daily returns

In [13]:
# Price matrix X: one row per business day in the study period, one column per
# non-ETF symbol, filled from the per-symbol CSV files in <datadir>/stocks.
start_date, end_date = '2009-01-01', '2023-12-31'
ix = pd.date_range(start_date, end_date, freq = 'B')
cols = nasdaq_stocks.Symbol.copy()
X = pd.DataFrame(index = ix, columns = cols, dtype = np.float64)
datadir_stocks = os.path.join(datadir, 'stocks')
for col in X.columns:
    pth = os.path.join(datadir_stocks, col + '.csv')
    # The files carry a three-row column header; the first column holds the dates.
    df = pd.read_csv(pth, header = [0, 1, 2], index_col = 0)
    df.index = pd.to_datetime(df.index)
    close_adj = df['Adj Close']
    # Use a symbol only if every adjusted close is strictly positive (a missing
    # value fails the comparison, so such symbols are skipped as well).
    if (close_adj > 0).all().values[0]:
        X[col] = close_adj

# Drop symbols that never received any price.
X = X.dropna(axis = 1, how = 'all')

# Daily returns. Gaps are forward-filled explicitly before differencing: this is
# exactly what the deprecated implicit fill_method='pad' default of pct_change did,
# so the numbers are unchanged and the FutureWarning disappears. As before, days
# without a quote inside the business-day index therefore get a zero raw return.
R = X.ffill().pct_change(fill_method = None).dropna(how = 'all')

# Market adjustment: subtract each day's cross-sectional mean return.
R = R.sub(R.mean(1), 0)
R

  R = X.pct_change().dropna(how = 'all')


Symbol,A,AA,AACG,AACT,AAL,AAME,AAMI,AAOI,AAON,AAP,...,ZTO,ZTR,ZTS,ZUMZ,ZURA,ZVIA,ZVRA,ZWS,ZYME,ZYXI
2009-01-05,0.019879,-0.034632,-0.053988,,-0.021139,-0.013988,,,-0.011593,-0.027462,...,,0.004504,,0.032766,,,,,,-0.091910
2009-01-06,0.052366,-0.005520,-0.067125,,0.060192,-0.204658,,,-0.028876,-0.025958,...,,-0.018798,,-0.008833,,,,,,0.099317
2009-01-07,0.028395,-0.078607,-0.022576,,0.025086,0.038263,,,-0.005829,-0.016848,...,,0.000021,,0.117885,,,,,,-0.033371
2009-01-08,-0.004338,0.032239,-0.026071,,0.043045,-0.010920,,,-0.005502,-0.007524,...,,-0.007995,,0.010215,,,,,,-0.004297
2009-01-09,0.030094,-0.027585,0.031819,,-0.038731,0.308709,,,0.005642,-0.001324,...,,0.014999,,0.001222,,,,,,0.060304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2023-12-26,-0.009401,-0.008159,0.025917,-0.009203,-0.025097,-0.020599,-0.007934,0.080746,-0.001040,-0.016508,...,0.005512,-0.001963,-0.008453,-0.005023,-0.070056,0.033879,-0.015145,-0.003772,0.075350,-0.004542
2023-12-27,-0.006238,0.021444,0.243691,-0.009180,-0.014814,0.027184,0.009043,-0.019456,-0.005500,0.004196,...,-0.011267,-0.004494,0.000852,0.002277,-0.026511,0.003260,0.100761,-0.013935,0.000245,-0.000707
2023-12-28,-0.003758,-0.010870,0.082314,-0.003401,-0.004116,0.015118,0.000249,-0.030932,-0.003131,-0.004538,...,0.036460,-0.005212,-0.002080,0.021637,-0.013710,-0.017619,0.036745,-0.012088,-0.008052,0.007741


## Find FDA recalls' market impact dates

In [14]:
# Search window (days before the event date), baseline length (days before the
# window) and z-score threshold. `days_baseline` and `z_thresh` are read again
# by the spillover cell further down.
days_window, days_baseline, z_thresh = 90, 3*360, 3
recalls_events['market_impact_date'] = pd.NaT

lookback = pd.Timedelta(days = days_window)
baseline = pd.Timedelta(days = days_baseline)
event_rows = zip(recalls_events.index,
                 recalls_events.Symbol,
                 recalls_events[datecol])
for ix, symbol, win_end in event_rows:
    win_start = win_end - lookback
    bl_start = win_start - baseline
    returns = R[symbol]
    # Label slices include both end points, so win_start belongs to both ranges.
    Rwin = returns.loc[win_start : win_end]
    Rbl = returns.loc[bl_start : win_start]
    # Standardise the window's returns against the baseline's mean and spread.
    z = (Rwin - Rbl.mean()) / Rbl.std()
    # Latest window day whose z-score is below -z_thresh; NaT when there is none.
    dt_impact = Rwin.index[(z < -z_thresh)].max()
    recalls_events.loc[ix, 'market_impact_date'] = dt_impact
recalls_events

Unnamed: 0,recalling_firm_name,center_classification_date,event_id,reason_for_recall,recall_details,Symbol,market_impact_date
4,abbott diabetes care,2013-05-12,64876,"AT rare, extremely high glucose levels (1024 m...",https://www.accessdata.fda.gov/scripts/ires/?P...,ABT,NaT
5,abbott diabetes care,2013-12-22,66886,Certain lots of FreeStyle and FreeStyle Lite B...,https://www.accessdata.fda.gov/scripts/ires/?P...,ABT,NaT
6,abbott diabetes care,2014-03-23,67472,Abbott Diabetes Care has identified through in...,https://www.accessdata.fda.gov/scripts/ires/?P...,ABT,NaT
7,abbott diabetes care,2023-04-09,91756,Lithium-ion batteries in glucose monitoring sy...,https://www.accessdata.fda.gov/scripts/ires/?P...,ABT,NaT
9,abbott vascular,2016-03-13,73243,Abbott Vascular has recently received reports ...,https://www.accessdata.fda.gov/scripts/ires/?P...,ABT,NaT
...,...,...,...,...,...,...,...
1003,teva pharmaceuticals usa,2022-06-12,90182,Failed Dissolution Specifications- Low Out-Of-...,https://www.accessdata.fda.gov/scripts/ires/?P...,TEVA,NaT
1099,zimmer,2012-12-23,63683,Zimmer Spine has received reports of the PEEK ...,https://www.accessdata.fda.gov/scripts/ires/?P...,ZBH,2012-11-30
1100,zimmer,2015-06-14,71272,Zimmer is initiating a voluntary recall of 64 ...,https://www.accessdata.fda.gov/scripts/ires/?P...,ZBH,NaT
1101,zimmer biomet,2017-02-12,75971,Higher than anticipated rate of fracturing due...,https://www.accessdata.fda.gov/scripts/ires/?P...,ZBH,NaT


In [15]:
# Keep events for which an impact date was found, newest impact first.
# Sorting returns a new frame here instead of sorting the filtered slice in
# place, which is what raised pandas' SettingWithCopyWarning.
recalls_events = (recalls_events
                  .loc[recalls_events.market_impact_date.notna()]
                  .sort_values('market_impact_date', ascending = False))
recalls_events

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recalls_events.sort_values('market_impact_date', ascending = False, inplace = True)


Unnamed: 0,recalling_firm_name,center_classification_date,event_id,reason_for_recall,recall_details,Symbol,market_impact_date
152,baxter healthcare,2023-02-26,91506,There is the potential for patient desaturatio...,https://www.accessdata.fda.gov/scripts/ires/?P...,BAX,2023-02-09
825,philips respironics,2023-01-08,91293,The replacement silicone sound abatement foam ...,https://www.accessdata.fda.gov/scripts/ires/?P...,PHG,2022-10-12
424,fresenius kabi usa,2022-10-09,90845,The display screen may become frozen and unres...,https://www.accessdata.fda.gov/scripts/ires/?P...,FMS,2022-07-28
150,baxter healthcare,2022-09-11,90730,Firm noted an increase in customer reports of ...,https://www.accessdata.fda.gov/scripts/ires/?P...,BAX,2022-07-28
733,mylan pharmaceuticals,2022-05-08,89970,Labeling: Missing label on the vial,https://www.accessdata.fda.gov/scripts/ires/?P...,VTRS,2022-02-28
...,...,...,...,...,...,...,...
532,icu medical,2013-02-10,63548,Crystallization: Product is being recalled due...,https://www.accessdata.fda.gov/scripts/ires/?P...,ICUI,2012-11-30
328,depuy orthopaedics,2013-02-17,63783,DePuy Orthopaedics is initiating a voluntary r...,https://www.accessdata.fda.gov/scripts/ires/?P...,JNJ,2012-11-30
1099,zimmer,2012-12-23,63683,Zimmer Spine has received reports of the PEEK ...,https://www.accessdata.fda.gov/scripts/ires/?P...,ZBH,2012-11-30
205,bristol-myers squibb,2012-10-28,63043,Superpotent (Single Ingredient) Drug: All BiCN...,https://www.accessdata.fda.gov/scripts/ires/?P...,BMY,2012-08-02


## Keep stock symbols with multiple impacts

In [16]:
# Deduplicate events per symbol within a 1-week period: group by symbol and
# weekly bin of the impact date, keep the last row of each group, and index the
# result by event_id.
# NOTE(review): the weekly pd.Grouper key is returned as the bin label, so from
# here on market_impact_date holds that label rather than the detected trading
# day. The spillover cell uses this column as its window start — confirm that
# starting from the bin label is intended.
impact_week = pd.Grouper(key = 'market_impact_date', freq = 'W')
recalls_events = (recalls_events
                  .groupby(['Symbol', impact_week])
                  .last()
                  .reset_index()
                  .set_index('event_id'))
recalls_events

Unnamed: 0_level_0,Symbol,market_impact_date,recalling_firm_name,center_classification_date,reason_for_recall,recall_details
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
61355,BAX,2012-04-22,baxter healthcare,2012-06-17,Recall expansion; the infusion pumps have the ...,https://www.accessdata.fda.gov/scripts/ires/?P...
67884,BAX,2014-05-18,baxter englewood,2014-05-18,1. ABACUS v3.1 may calculate quantities of ele...,https://www.accessdata.fda.gov/scripts/ires/?P...
90730,BAX,2022-07-31,baxter healthcare,2022-09-11,Firm noted an increase in customer reports of ...,https://www.accessdata.fda.gov/scripts/ires/?P...
91506,BAX,2023-02-12,baxter healthcare,2023-02-26,There is the potential for patient desaturatio...,https://www.accessdata.fda.gov/scripts/ires/?P...
82273,BDX,2019-04-21,becton dickinson,2019-06-30,Leaking of the Smartsite Syringe Administratio...,https://www.accessdata.fda.gov/scripts/ires/?P...
63043,BMY,2012-08-05,bristol-myers squibb,2012-10-28,Superpotent (Single Ingredient) Drug: All BiCN...,https://www.accessdata.fda.gov/scripts/ires/?P...
69931,BSX,2014-12-28,boston scientific,2015-01-18,Lotus valve became unlocked during release fro...,https://www.accessdata.fda.gov/scripts/ires/?P...
72409,BSX,2015-09-20,boston scientific,2015-11-15,Boston Scientific is recalling its recently re...,https://www.accessdata.fda.gov/scripts/ires/?P...
84886,BSX,2020-01-19,boston scientific,2020-03-29,Potential for tip detachment of Imager II 5F A...,https://www.accessdata.fda.gov/scripts/ires/?P...
86947,BSX,2020-12-06,boston scientific,2021-01-10,Failure to execute the visual inspection corre...,https://www.accessdata.fda.gov/scripts/ires/?P...


In [17]:
# Keep symbols with multiple events: flag every row whose symbol occurs more
# than once (keep = False marks all occurrences).
msk = recalls_events['Symbol'].duplicated(keep = False)
recalls_events = recalls_events[msk]
recalls_events

Unnamed: 0_level_0,Symbol,market_impact_date,recalling_firm_name,center_classification_date,reason_for_recall,recall_details
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
61355,BAX,2012-04-22,baxter healthcare,2012-06-17,Recall expansion; the infusion pumps have the ...,https://www.accessdata.fda.gov/scripts/ires/?P...
67884,BAX,2014-05-18,baxter englewood,2014-05-18,1. ABACUS v3.1 may calculate quantities of ele...,https://www.accessdata.fda.gov/scripts/ires/?P...
90730,BAX,2022-07-31,baxter healthcare,2022-09-11,Firm noted an increase in customer reports of ...,https://www.accessdata.fda.gov/scripts/ires/?P...
91506,BAX,2023-02-12,baxter healthcare,2023-02-26,There is the potential for patient desaturatio...,https://www.accessdata.fda.gov/scripts/ires/?P...
69931,BSX,2014-12-28,boston scientific,2015-01-18,Lotus valve became unlocked during release fro...,https://www.accessdata.fda.gov/scripts/ires/?P...
72409,BSX,2015-09-20,boston scientific,2015-11-15,Boston Scientific is recalling its recently re...,https://www.accessdata.fda.gov/scripts/ires/?P...
84886,BSX,2020-01-19,boston scientific,2020-03-29,Potential for tip detachment of Imager II 5F A...,https://www.accessdata.fda.gov/scripts/ires/?P...
86947,BSX,2020-12-06,boston scientific,2021-01-10,Failure to execute the visual inspection corre...,https://www.accessdata.fda.gov/scripts/ires/?P...
81878,EW,2018-12-23,edwards lifesciences,2019-02-03,The product is being recalled as result of a n...,https://www.accessdata.fda.gov/scripts/ires/?P...
82456,EW,2019-04-21,edwards lifesciences,2019-05-26,Potential for an electrical short circuit lead...,https://www.accessdata.fda.gov/scripts/ires/?P...


## Compile ground-truth spillover network

In [18]:
# NOTE: this rebinds `days_window` (90 in the impact-date cell) to 3; re-run that
# cell from a fresh kernel rather than after this one. `days_baseline` and
# `z_thresh` are the values set in the impact-date cell.
days_window, edgelist = 3, list()

# Candidate target symbols: healthcare listings that have a returns column.
in_symbols = nasdaq_health.Symbol.copy()
in_symbols = in_symbols.loc[in_symbols.isin(R.columns)]

# Iterate over row values rather than index labels: the frame is indexed by
# event_id, and a label lookup returns several rows if an event_id repeats.
for symbol_i, win_start in zip(recalls_events.Symbol,
                               recalls_events.market_impact_date):
    win_end = win_start + pd.Timedelta(days = days_window)
    bl_start = win_start - pd.Timedelta(days = days_baseline)
    Rwin = R.loc[win_start : win_end, in_symbols]
    Rbl = R.loc[bl_start : win_start, in_symbols]
    # Per-symbol z-scores of the window's returns against the baseline.
    z = (Rwin - Rbl.mean(0)) / Rbl.std(0)
    # Targets: symbols with at least one window day beyond the threshold in
    # either direction, plus the recalling symbol itself (self-edge always present).
    symbols_j = (z.abs() > z_thresh).any(axis = 0)
    symbols_j = symbols_j.index[symbols_j].tolist() + [symbol_i]
    edgelist.extend((symbol_i, j) for j in symbols_j)

# Unique edges in sorted order, so the list is identical from run to run
# (iteration order of a set of strings changes between interpreter sessions).
edgelist = sorted(set(edgelist))

In [None]:
# Edge table (source symbol in Gene1, target symbol in Gene2), ordered by both
# columns and written without its index.
refNetwork = (pd.DataFrame(edgelist, columns = ['Gene1', 'Gene2'])
              .sort_values(['Gene1', 'Gene2']))
refNetwork.to_csv(os.path.join('..', 'DELAY-dataset', 'refNetwork.csv'), index = False)
refNetwork

Unnamed: 0,Gene1,Gene2
664,BAX,ACHV
177,BAX,ACON
45,BAX,ADMA
767,BAX,ADPT
791,BAX,ALBT
...,...,...
230,VTRS,VSTM
837,VTRS,VTRS
285,VTRS,VYGR
471,VTRS,XOMA


In [20]:
# Out-degree of every source symbol, divided by the number of distinct target
# symbols, listed from largest to smallest and followed by the mean.
n_nodes_in = len(refNetwork['Gene2'].unique())
C_out_norm = (refNetwork.groupby('Gene1').size() / n_nodes_in).sort_values(ascending = False)
print(C_out_norm, '\n', C_out_norm.mean())

Gene1
FMS     0.430108
ICUI    0.313978
MDT     0.275269
BAX     0.197849
RDY     0.161290
BSX     0.137634
VTRS    0.135484
EW      0.086022
TFX     0.086022
PHG     0.079570
dtype: float64 
 0.1903225806451613
