In [77]:
import pandas as pd
import numpy as np
import time
from datetime import timedelta, datetime
from libs.helpermodules import memory_handling as mh

In [78]:
df = pd.read_csv("US SPX 500 (Mini) 1 Minute.csv")
columns_to_keep = ['<Date>', ' <Time>', ' <Open>', ' <Close>']
df = df[columns_to_keep]
df.columns = ['date', 'time', 'open', 'close']
print(df.columns)

# combining date and time in one column (format datetime)
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%d/%m/%Y %H:%M:%S')
print(df['datetime'].dtype)
df = df.drop(columns=['time'])
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')
df = df[['datetime'] + [col for col in df.columns if col != 'datetime']]


Index(['date', 'time', 'open', 'close'], dtype='object')
datetime64[ns]


In [79]:
# downloading the pickle file 
file = "fedspeeches_preprocessed.pkl"
helper = mh.PickleHelper.pickle_load(file)
speeches= helper.obj

In [80]:
def clean_df(df):
    """
    Clean the dataframe by sorting and filtering based on date and weekdays.

    Parameters:
    df : pandas.DataFrame
        A dataframe containing at least 'date' and 'timestamp' columns.

    Returns:
    pandas.DataFrame
        A dataframe sorted by 'date' and 'timestamp', filtered to include only
        rows with dates >= '2024-01-01' and weekdays (Monday to Friday).
    """
        
    df_ordered = df.sort_values(by=['date', 'timestamp'], ascending = [True, True])
    df_final = df_ordered[df_ordered.date >= "2024-01-01"]
    df_final = df_final[df_final['date'].dt.weekday<5]
    return df_final

In [81]:
speeches=clean_df(speeches)

In [82]:
# FILTERING DF TO CONTAIN ONLY SPEECHES' DATAS 
vd = set(speeches['date'].dt.date)
filtdf = df[df['date'].dt.date.isin(vd)]

In [83]:
# ALL SPEECHES START AT 10
# FILTERING filtdf TO KEEP DATAS ONLY BETWEEN 09:45 AND 10:15
# Define the time range as time objects
start_time = pd.to_datetime("09:45:00").time()
end_time = pd.to_datetime("10:15:00").time()

# Filter rows based on the time component of 'datetime'
filtdf = filtdf[(filtdf['datetime'].dt.time >= start_time) & (filtdf['datetime'].dt.time <= end_time)]

In [84]:
def calculate_volatility(dataframe):
    """
    Calculate daily volatility for the 'close' column in the dataframe.

    Parameters:
    df : pandas.DataFrame
        A dataframe containing at least 'date' and 'close' columns.

    Returns:
    pandas.Series
        A series with dates as the index and the standard deviation
        of 'close' values (volatility) as the values.
    """

    dataframe['pct_change'] = dataframe['close'].pct_change()
    volatility_series = dataframe.groupby('date')['pct_change'].std()
    volatility_series.name = 'volatility'
    volatility_series.index = pd.to_datetime(volatility_series.index, format='%Y-%m-%d')
    return volatility_series

In [85]:
def get_best_values(volatility, dataframe, number):
    """
    Filter the dataframe for the top dates with the highest volatility.

    Parameters:
    volatility : pandas.Series
        A series with dates as the index and volatility as the values.
    dataframe : pandas.DataFrame
        The dataframe to filter, containing a 'date' column.
    number : int
        The number of top volatility dates to select.

    Returns:
    pandas.DataFrame
        A filtered dataframe containing rows with dates matching the top 
        'number' volatility values.
    """
    
    # Sort the volatility in descending order
    volatility = volatility.sort_values(ascending=False)

    # Get the top `number` dates with the highest volatility
    top_dates = volatility.head(number).index.tolist()

    # Filter the dataframe for rows where 'date' is in the top_dates
    filtered_df = dataframe[dataframe['date'].isin(top_dates)]

    return filtered_df


In [86]:
volatility = calculate_volatility(filtdf)
volatility

date
2024-01-08    0.000349
2024-01-16    0.001712
2024-01-17    0.001514
2024-02-02    0.006777
2024-02-07    0.001929
                ...   
2024-10-10    0.000627
2024-10-11    0.000299
2024-10-14    0.001050
2024-10-18    0.000322
2024-10-23    0.000687
Name: volatility, Length: 69, dtype: float64

In [87]:
newdf = get_best_values(volatility, speeches, 10)

Top 10:
- 2024-02-02, Governor Michelle W. Bowman, volatility: 0.006777226970784951 (2) ***
- 2024-03-25, Governor Lisa D. Cook, volatility: 0.002785226147242381 ***
- 2024-04-05, Governor Michelle W. Bowman, volatility: 0.002709719758927533
- 2024-04-16, Vice Chair Philip N. Jefferson, volatility: 0.004312766394127704 (3) ***
- 2024-05-03, Governor Michelle W. Bowman, volatility: 0.003364359644292597 ***
- 2024-07-09, Governor Michelle W. Bowman/Vice Chair for Supervision Michael S. Barr, volatility: 0.0031166837073481864 ***
- 2024-07-24, Governor Michelle W. Bowman, volatility: 0.0036281060467863763 (5) ***
- 2024-08-19, Governor Christopher J. Waller, volatility: 0.0029159356120044794
- 2024-09-06, Governor Christopher J. Waller, volatility: 0.0036960021216403565 (4) ***
- 2024-09-20, Governor Michelle W. Bowman, volatility: 0.007836799683358276 (1) ***


Legend:
(number) indicates the position in the ranking
*** means the speech was flagged by the study with ETF 'SPY' datas too.

Conclusions:
- Most volatile speaker: Governor Michelle Bowman
- Speeches with more volatility: 2024-09-20 (Statement by Governor Michelle W. Bowman) and 2024-02-02 (The future of Banking)

