# Document Purpose:
* purpose of this notebook is to merge the stock price data on particular earnings announcement dates with the EPS data for those dates into one dataframe


# Imports

In [10]:
import pandas as pd
import numpy as np

# Cleaning

In [11]:
events_df = pd.read_csv("events_pct_changes.csv")
# drop unecessary columns
events_df = events_df[['date', 'symbol','security_name', 
       'before_eps_date', 'after_eps_date', 'Adj Close',
       'Close',  'Adj Close_pct_change',
       'Close_pct_change']].copy()

In [12]:
eps_data = pd.read_csv("eps_data.csv")

In [13]:
eps_data

Unnamed: 0,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
0,2024-07-05 08:00:00-04:00,,,,LEDS
1,2024-04-10 08:00:00-04:00,,,,LEDS
2,2024-01-12 06:00:00-05:00,,,,LEDS
3,2023-11-13 06:00:00-05:00,,,,LEDS
4,2023-11-10 04:00:00-05:00,,,,LEDS
...,...,...,...,...,...
22080,2020-08-04 16:00:00-04:00,0.13,0.27,1.0301,MNR
22081,2020-05-06 16:00:00-04:00,0.09,-0.77,-9.2796,MNR
22082,2020-02-06 16:00:00-05:00,0.11,0.04,-0.6190,MNR
22083,2019-11-25 16:00:00-05:00,0.10,0.24,1.5263,MNR


In [14]:
events_df[events_df["symbol"] == "SGH"]
# events_df.head()

Unnamed: 0,date,symbol,security_name,before_eps_date,after_eps_date,Adj Close,Close,Adj Close_pct_change,Close_pct_change
1,2023-01-03,SGH,"SMART Global Holdings, Inc. - Ordinary Shares",2023-01-03,2023-01-04,15.75,15.75,0.041667,0.041667
4188,2023-06-29,SGH,"SMART Global Holdings, Inc. - Ordinary Shares",2023-06-29,2023-06-30,29.01,29.01,0.089373,0.089373
6370,2023-10-12,SGH,"SMART Global Holdings, Inc. - Ordinary Shares",2023-10-12,2023-10-13,13.1,13.1,-0.444444,-0.444444


In [15]:
eps_data[eps_data["Symbol"] == "SGH"]

Unnamed: 0,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
12,2025-01-07 05:00:00-05:00,,,,SGH
13,2024-10-10 06:00:00-04:00,,,,SGH
14,2024-06-27 06:00:00-04:00,,,,SGH
15,2024-04-02 06:00:00-04:00,0.25,,,SGH
16,2024-01-09 16:00:00-05:00,0.16,0.24,0.5206,SGH
17,2023-10-12 16:00:00-04:00,0.45,0.35,-0.2285,SGH
18,2023-06-29 16:00:00-04:00,0.4,0.66,0.6384,SGH
19,2023-04-04 16:00:00-04:00,0.6,0.76,0.272,SGH
20,2023-01-03 16:00:00-05:00,0.57,0.79,0.3831,SGH
21,2022-10-04 16:00:00-04:00,0.65,0.8,0.232,SGH


In [16]:
#how many rows where reported EPS is NaN: ~= 7803 rows
eps_data[eps_data[[ "Reported EPS"]].isna().all(axis=1)]

# drop rows where reported EPS is NaN:
eps_data = eps_data.dropna(axis = 0, subset= ["Reported EPS"], ignore_index = True).copy()
eps_data

Unnamed: 0,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
0,2024-01-09 16:00:00-05:00,0.16,0.24,0.5206,SGH
1,2023-10-12 16:00:00-04:00,0.45,0.35,-0.2285,SGH
2,2023-06-29 16:00:00-04:00,0.40,0.66,0.6384,SGH
3,2023-04-04 16:00:00-04:00,0.60,0.76,0.2720,SGH
4,2023-01-03 16:00:00-05:00,0.57,0.79,0.3831,SGH
...,...,...,...,...,...
14277,2020-08-04 16:00:00-04:00,0.13,0.27,1.0301,MNR
14278,2020-05-06 16:00:00-04:00,0.09,-0.77,-9.2796,MNR
14279,2020-02-06 16:00:00-05:00,0.11,0.04,-0.6190,MNR
14280,2019-11-25 16:00:00-05:00,0.10,0.24,1.5263,MNR


In [17]:
eps_data[eps_data[['EPS Estimate', "Reported EPS", "Surprise(%)"]].isna().any(axis=1)]


Unnamed: 0,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
16,2024-01-03 16:00:00-05:00,0.10,0.10,,SLP
120,2022-11-28 20:00:00-05:00,-0.02,-0.02,,OGI
170,2023-04-27 06:00:00-04:00,0.07,0.07,,WIT
171,2023-01-13 05:00:00-05:00,0.07,0.07,,WIT
176,2021-10-13 06:00:00-04:00,0.07,0.07,,WIT
...,...,...,...,...,...
14085,2024-02-26 19:00:00-05:00,,0.02,4.0,CAVA
14094,2023-11-16 16:00:00-05:00,,-0.34,,VJET
14241,2023-11-14 07:00:00-05:00,,-0.03,,ISPR
14255,2023-08-14 16:00:00-04:00,,0.01,,POL


In [18]:
eps_data.dtypes

Earnings Date     object
EPS Estimate     float64
Reported EPS     float64
Surprise(%)      float64
Symbol            object
dtype: object

In [19]:
# Convert 'date' column to datetime type
# eps_data["Earnings Date"] = eps_data["Earnings Date"].datetime.datetime.date()

eps_data["Earnings Date"] = pd.to_datetime(eps_data["Earnings Date"], utc=True).dt.strftime('%Y-%m-%d')
print(eps_data.dtypes)
eps_data

# events_df['date'] = pd.to_datetime(events_df['date'], utc=True)


Earnings Date     object
EPS Estimate     float64
Reported EPS     float64
Surprise(%)      float64
Symbol            object
dtype: object


Unnamed: 0,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
0,2024-01-09,0.16,0.24,0.5206,SGH
1,2023-10-12,0.45,0.35,-0.2285,SGH
2,2023-06-29,0.40,0.66,0.6384,SGH
3,2023-04-04,0.60,0.76,0.2720,SGH
4,2023-01-03,0.57,0.79,0.3831,SGH
...,...,...,...,...,...
14277,2020-08-04,0.13,0.27,1.0301,MNR
14278,2020-05-06,0.09,-0.77,-9.2796,MNR
14279,2020-02-06,0.11,0.04,-0.6190,MNR
14280,2019-11-25,0.10,0.24,1.5263,MNR


In [20]:
events_df.dtypes

date                     object
symbol                   object
security_name            object
before_eps_date          object
after_eps_date           object
Adj Close               float64
Close                   float64
Adj Close_pct_change    float64
Close_pct_change        float64
dtype: object

# Merge EPS Data, Stock Data Pct Changes

Merge based on date and stock

In [21]:
events_df.sort_values(by = ["symbol", 'date'])

Unnamed: 0,date,symbol,security_name,before_eps_date,after_eps_date,Adj Close,Close,Adj Close_pct_change,Close_pct_change
1401,2023-02-28,A,"Agilent Technologies, Inc. Common Stock",2023-02-28,2023-03-01,136.518082,137.509995,-0.031415,-0.031415
3975,2023-05-23,A,"Agilent Technologies, Inc. Common Stock",2023-05-23,2023-05-24,120.312912,120.989998,-0.059468,-0.059468
6088,2023-08-15,A,"Agilent Technologies, Inc. Common Stock",2023-08-15,2023-08-16,120.911308,121.360001,-0.034142,-0.034142
8298,2023-11-20,A,"Agilent Technologies, Inc. Common Stock",2023-11-20,2023-11-21,123.710762,123.919998,0.087208,0.087208
36,2023-01-18,AA,Alcoa Corporation Common Stock,2023-01-18,2023-01-19,48.760040,49.520000,-0.073527,-0.073527
...,...,...,...,...,...,...,...,...,...
3419,2023-05-08,ZYME,Zymeworks Inc. - Common Stock,2023-05-08,2023-05-09,9.880000,9.880000,0.127854,0.127854
1821,2023-03-13,ZYXI,"Zynex, Inc. - Common Stock",2023-03-10,2023-03-13,10.940000,10.940000,0.147954,0.147954
2590,2023-04-27,ZYXI,"Zynex, Inc. - Common Stock",2023-04-26,2023-04-27,11.270000,11.270000,0.009857,0.009857
4727,2023-07-27,ZYXI,"Zynex, Inc. - Common Stock",2023-07-26,2023-07-27,9.750000,9.750000,0.018809,0.018809


In [22]:
eps_data.sort_values(by = ["Symbol", 'Earnings Date'])

Unnamed: 0,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
9010,2022-05-24,1.12,1.13,0.0087,A
9009,2022-08-16,1.20,1.34,0.1175,A
9008,2022-11-21,1.39,1.53,0.1038,A
9007,2023-02-28,1.30,1.37,0.0504,A
9006,2023-05-23,1.26,1.27,0.0062,A
...,...,...,...,...,...
11473,2023-03-13,0.19,0.20,0.0601,ZYXI
11472,2023-04-27,0.01,0.04,2.0008,ZYXI
11471,2023-07-27,0.05,0.09,1.0000,ZYXI
11470,2023-10-26,0.09,0.10,0.0715,ZYXI


#### Decision: 
for earnings events where there is no matching earnings values, these event dates will be dropped.
The ML model requires complete data. 

In [23]:
merged_df = events_df.merge(eps_data, left_on = ['date', 'symbol'], right_on = ["Earnings Date", "Symbol"], how = "inner")
merged_df

Unnamed: 0,date,symbol,security_name,before_eps_date,after_eps_date,Adj Close,Close,Adj Close_pct_change,Close_pct_change,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
0,2023-01-03,SGH,"SMART Global Holdings, Inc. - Ordinary Shares",2023-01-03,2023-01-04,15.750000,15.750000,0.041667,0.041667,2023-01-03,0.57,0.79,0.3831,SGH
1,2023-01-04,RGP,"Resources Connection, Inc. - Common Stock",2023-01-04,2023-01-05,14.715991,15.420000,-0.171858,-0.171858,2023-01-04,0.47,0.59,0.2553,RGP
2,2023-01-04,SLP,"Simulations Plus, Inc. - Common Stock",2023-01-04,2023-01-05,34.700558,34.959999,-0.070954,-0.070954,2023-01-04,0.09,0.06,-0.2941,SLP
3,2023-01-05,CAG,"ConAgra Brands, Inc. Common Stock",2023-01-04,2023-01-05,37.901043,39.970001,0.034153,0.034153,2023-01-05,0.66,0.81,0.2193,CAG
4,2023-01-05,HELE,Helen of Troy Limited - Common Stock,2023-01-04,2023-01-05,110.970001,110.970001,-0.014301,-0.014301,2023-01-05,2.61,2.75,0.0551,HELE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6153,2023-12-20,GIS,"General Mills, Inc. Common Stock",2023-12-19,2023-12-20,64.330002,64.330002,-0.035677,-0.035677,2023-12-20,1.16,1.25,0.0823,GIS
6154,2023-12-20,TTC,Toro Company (The) Common Stock,2023-12-19,2023-12-20,96.855721,97.209999,0.089432,0.089432,2023-12-20,0.56,0.71,0.2747,TTC
6155,2023-12-21,CTAS,Cintas Corporation - Common Stock,2023-12-20,2023-12-21,588.635681,589.950012,0.065546,0.065546,2023-12-21,3.49,3.61,0.0332,CTAS
6156,2023-12-21,LMNR,Limoneira Co - Common Stock,2023-12-21,2023-12-22,20.435904,20.510000,0.068786,0.068786,2023-12-21,-0.15,-0.15,0.0196,LMNR


 ### QA:
 confirming merge performed as expected by doing a full outer merge first, then reviewing results for individual stocks. 

 Result = if no earnings date is associated with the "date" field, ok to drop row, 

In [24]:
merged_df[merged_df['Earnings Date'].isnull()]

Unnamed: 0,date,symbol,security_name,before_eps_date,after_eps_date,Adj Close,Close,Adj Close_pct_change,Close_pct_change,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol


In [25]:
stock = "OGI"
single_stock_df = merged_df[(merged_df['symbol'] == stock) | (merged_df['Symbol'] == stock)]
single_stock_df

Unnamed: 0,date,symbol,security_name,before_eps_date,after_eps_date,Adj Close,Close,Adj Close_pct_change,Close_pct_change,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
1493,2023-04-11,OGI,Organigram Holdings Inc. - Common Shares,2023-04-11,2023-04-12,2.4,2.4,-0.124088,-0.124088,2023-04-11,-0.02,-0.02,-0.2,OGI
6152,2023-12-19,OGI,Organigram Holdings Inc. - Common Shares,2023-12-18,2023-12-19,1.27,1.27,-0.030534,-0.030534,2023-12-19,-0.14,-0.13,0.073,OGI


In [26]:
merged_df.sort_values(by=['symbol', 'Symbol', 'date', "Earnings Date"], na_position = 'last').head(2000)

Unnamed: 0,date,symbol,security_name,before_eps_date,after_eps_date,Adj Close,Close,Adj Close_pct_change,Close_pct_change,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Symbol
1051,2023-02-28,A,"Agilent Technologies, Inc. Common Stock",2023-02-28,2023-03-01,136.518082,137.509995,-0.031415,-0.031415,2023-02-28,1.30,1.37,0.0504,A
2871,2023-05-23,A,"Agilent Technologies, Inc. Common Stock",2023-05-23,2023-05-24,120.312912,120.989998,-0.059468,-0.059468,2023-05-23,1.26,1.27,0.0062,A
4419,2023-08-15,A,"Agilent Technologies, Inc. Common Stock",2023-08-15,2023-08-16,120.911308,121.360001,-0.034142,-0.034142,2023-08-15,1.36,1.43,0.0480,A
6062,2023-11-20,A,"Agilent Technologies, Inc. Common Stock",2023-11-20,2023-11-21,123.710762,123.919998,0.087208,0.087208,2023-11-20,1.34,1.38,0.0286,A
29,2023-01-18,AA,Alcoa Corporation Common Stock,2023-01-18,2023-01-19,48.760040,49.520000,-0.073527,-0.073527,2023-01-18,-0.92,-0.70,0.2375,AA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,2023-01-24,FFIV,"F5, Inc. - Common Stock",2023-01-23,2023-01-24,146.470001,146.470001,-0.008797,-0.008797,2023-01-24,2.33,2.47,0.0602,FFIV
1526,2023-04-19,FFIV,"F5, Inc. - Common Stock",2023-04-18,2023-04-19,137.050003,137.050003,-0.040871,-0.040871,2023-04-19,2.42,2.53,0.0460,FFIV
3126,2023-07-24,FFIV,"F5, Inc. - Common Stock",2023-07-21,2023-07-24,150.210007,150.210007,0.009340,0.009340,2023-07-24,2.86,3.21,0.1234,FFIV
4748,2023-10-24,FFIV,"F5, Inc. - Common Stock",2023-10-23,2023-10-24,148.240005,148.240005,0.012430,0.012430,2023-10-24,3.21,3.50,0.0899,FFIV


In [28]:
merged_df.columns

Index(['date', 'symbol', 'security_name', 'before_eps_date', 'after_eps_date',
       'Adj Close', 'Close', 'Adj Close_pct_change', 'Close_pct_change',
       'Earnings Date', 'EPS Estimate', 'Reported EPS', 'Surprise(%)',
       'Symbol'],
      dtype='object')

In [29]:
# drop duplicate columns
merged_df.drop(columns =[ 'Earnings Date','Symbol'], inplace = True)

In [38]:
## QA: check how surprise % is calculated. It should be (reported minus estimate) divided by estimate
merged_df['suprise_calc'] = (merged_df['Reported EPS'] - merged_df['EPS Estimate']) / merged_df['EPS Estimate']
merged_df['suprise_calc'] = merged_df['suprise_calc'].round(4).copy()

In [39]:
merged_df

Unnamed: 0,date,symbol,security_name,before_eps_date,after_eps_date,Adj Close,Close,Adj Close_pct_change,Close_pct_change,EPS Estimate,Reported EPS,Surprise(%),suprise_calc
0,2023-01-03,SGH,"SMART Global Holdings, Inc. - Ordinary Shares",2023-01-03,2023-01-04,15.750000,15.750000,0.041667,0.041667,0.57,0.79,0.3831,0.3860
1,2023-01-04,RGP,"Resources Connection, Inc. - Common Stock",2023-01-04,2023-01-05,14.715991,15.420000,-0.171858,-0.171858,0.47,0.59,0.2553,0.2553
2,2023-01-04,SLP,"Simulations Plus, Inc. - Common Stock",2023-01-04,2023-01-05,34.700558,34.959999,-0.070954,-0.070954,0.09,0.06,-0.2941,-0.3333
3,2023-01-05,CAG,"ConAgra Brands, Inc. Common Stock",2023-01-04,2023-01-05,37.901043,39.970001,0.034153,0.034153,0.66,0.81,0.2193,0.2273
4,2023-01-05,HELE,Helen of Troy Limited - Common Stock,2023-01-04,2023-01-05,110.970001,110.970001,-0.014301,-0.014301,2.61,2.75,0.0551,0.0536
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6153,2023-12-20,GIS,"General Mills, Inc. Common Stock",2023-12-19,2023-12-20,64.330002,64.330002,-0.035677,-0.035677,1.16,1.25,0.0823,0.0776
6154,2023-12-20,TTC,Toro Company (The) Common Stock,2023-12-19,2023-12-20,96.855721,97.209999,0.089432,0.089432,0.56,0.71,0.2747,0.2679
6155,2023-12-21,CTAS,Cintas Corporation - Common Stock,2023-12-20,2023-12-21,588.635681,589.950012,0.065546,0.065546,3.49,3.61,0.0332,0.0344
6156,2023-12-21,LMNR,Limoneira Co - Common Stock,2023-12-21,2023-12-22,20.435904,20.510000,0.068786,0.068786,-0.15,-0.15,0.0196,-0.0000


### Decision: 
There is slight variation between the surprise % provided and the manually calculated surprise percentage. This will be attributed to rounding and the original values provided by yfinance will be used. 

In [40]:
merged_df.drop(columns=['suprise_calc'], inplace=True)

In [41]:
merged_df

Unnamed: 0,date,symbol,security_name,before_eps_date,after_eps_date,Adj Close,Close,Adj Close_pct_change,Close_pct_change,EPS Estimate,Reported EPS,Surprise(%)
0,2023-01-03,SGH,"SMART Global Holdings, Inc. - Ordinary Shares",2023-01-03,2023-01-04,15.750000,15.750000,0.041667,0.041667,0.57,0.79,0.3831
1,2023-01-04,RGP,"Resources Connection, Inc. - Common Stock",2023-01-04,2023-01-05,14.715991,15.420000,-0.171858,-0.171858,0.47,0.59,0.2553
2,2023-01-04,SLP,"Simulations Plus, Inc. - Common Stock",2023-01-04,2023-01-05,34.700558,34.959999,-0.070954,-0.070954,0.09,0.06,-0.2941
3,2023-01-05,CAG,"ConAgra Brands, Inc. Common Stock",2023-01-04,2023-01-05,37.901043,39.970001,0.034153,0.034153,0.66,0.81,0.2193
4,2023-01-05,HELE,Helen of Troy Limited - Common Stock,2023-01-04,2023-01-05,110.970001,110.970001,-0.014301,-0.014301,2.61,2.75,0.0551
...,...,...,...,...,...,...,...,...,...,...,...,...
6153,2023-12-20,GIS,"General Mills, Inc. Common Stock",2023-12-19,2023-12-20,64.330002,64.330002,-0.035677,-0.035677,1.16,1.25,0.0823
6154,2023-12-20,TTC,Toro Company (The) Common Stock,2023-12-19,2023-12-20,96.855721,97.209999,0.089432,0.089432,0.56,0.71,0.2747
6155,2023-12-21,CTAS,Cintas Corporation - Common Stock,2023-12-20,2023-12-21,588.635681,589.950012,0.065546,0.065546,3.49,3.61,0.0332
6156,2023-12-21,LMNR,Limoneira Co - Common Stock,2023-12-21,2023-12-22,20.435904,20.510000,0.068786,0.068786,-0.15,-0.15,0.0196


In [42]:
# export merged df with complete data to csv
merged_df.to_csv("data/data_full.csv", index = False)