In [61]:
import pandas as pd
from helpermodules import memory_handling as mh
import numpy as np
from datetime import timedelta, datetime
import time

In [62]:
# downloading the pickle file 
file = "fedspeeches_preprocessed.pkl"
helper = mh.PickleHelper.pickle_load(file)
df_speech = helper.obj

In [None]:
df = pd.read_csv("/Users/baudotedua/Dropbox/Mac/Documents/GitHub/cb-impact-nlps/US SPX 500 (Mini) 1 Minute (1).csv")
columns_to_keep = ['<Date>', ' <Time>', ' <Open>', ' <Close>']
df = df[columns_to_keep]
df.columns = ['date', 'time', 'open', 'close']
print(df.columns)
# combining date and time in one column (format datetime)
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%d/%m/%Y %H:%M:%S')
print(df['datetime'].dtype)
df = df.drop(columns=['time'])
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')
df = df[['datetime'] + [col for col in df.columns if col != 'datetime']]

Index(['date', 'time', 'open', 'close'], dtype='object')


In [65]:
# choosing only the assigned speakers for the analysis
speaker_list_riccardo = [ 'Vice Chair Janet L. Yellen', 'Governor Sarah Bloom Raskin', 'Governor Jeremy C. Stein', 'Governor Jerome H. Powell', 'Chairman  Ben S. Bernanke', 'Governor Lael Brainard', 'Chair Janet L. Yellen', 'Vice Chairman Stanley Fischer', 'Vice Chairman for Supervision Randal K. Quarles', 'Chairman Jerome H. Powell', 'Vice Chairman Richard H. Clarida', 'Chair Jerome H. Powell', 'Vice Chair Richard H. Clarida', 'Vice Chair for Supervision Randal K. Quarles']
speaker_list_fabio = ['Governor Michelle W. Bowman', 'Vice Chair for Supervision and Chair of the Financial Stability Board Randal K. Quarles', 'Vice Chairman for Supervision and Chair of the Financial Stability Board Randal K. Quarles', 'Governor Christopher J. Waller', 'Governor Randal K. Quarles', 'Vice Chair for Supervision Michael S. Barr', 'Governor Lisa D. Cook', 'Vice Chair Lael Brainard', 'Governor Philip N. Jefferson', 'Chair Pro Tempore Jerome H. Powell', 'Vice Chair Philip N. Jefferson', 'Governor Adriana D. Kugler']
df_speech = df_speech[df_speech['speaker'].isin(speaker_list_fabio)]

In [66]:
def data_retrieve_minute(df, ticker):
    """
    Retrieve minute-level time series data for a specified ticker using the Twelve Data API.

    Parameters:
    df : pandas.DataFrame
        A dataframe containing at least two columns:
        - 'date': Dates for which data needs to be retrieved.
        - 'timestamp': Timestamps corresponding to the data entries.
    ticker : str
        The symbol of the financial instrument to retrieve data for.

    Returns:
    dffinal : pandas.DataFrame
        A concatenated DataFrame containing minute-level data for the specified ticker
        across all the unique dates in the provided dataframe `df`. The DataFrame will have:
        - 'date' as the index
        - A 'timestamp' column representing the minute-level time series.
    """
    
    timelist = df.date.unique().tolist()
    count = 0
    dffinal = pd.DataFrame()

    for date in timelist:
        print(f"Processing date: {date}")
        start = df[df['date'] == date].timestamp.min() - pd.Timedelta(minutes=15)
        end = df[df['date'] == date].timestamp.max() + pd.Timedelta(minutes=15)

        if count % 8 == 0 and count > 0:
            time.sleep(60)

        try:
            etf = td.time_series(
                symbol=ticker,
                interval="1min",
                start_date=start,
                end_date=end,
                outputsize=5000    
            ).as_pandas()
            
            # If the API call returns data, concatenate it to the final DataFrame
            if not etf.empty:
                # Add a 'date' column to the ETF DataFrame to use as an index later
                etf['date'] = date
                dffinal = pd.concat([dffinal, etf], ignore_index=False)
            else:
                print(f"No data available for {date}")

        except Exception as e:
            print(f"Error retrieving data for {date}: {e}")

        count += 1
    
    # Set the 'date' as the index and move 'timestamp' into a separate column
    dffinal.reset_index(inplace=True)
    dffinal.rename(columns={'index': 'timestamp'}, inplace=True)
    dffinal.set_index('date', inplace=True)
    
    return dffinal


In [67]:
def volatility_calculator(df , column):
    """
    Calculate daily volatility for the 'open' column in the dataframe.

    Parameters:
    df : pandas.DataFrame
        A dataframe containing at least 'date' and 'open' columns.
    column : string 
        The name of the column we want to calculate the standard deviation on

    Returns:
    pandas.Series
        A series with dates as the index and the standard deviation
        of 'open' values (volatility) as the values.
    """

    volatility_series = df.groupby('date')[column].std()
    return volatility_series

In [68]:
def clean_df(df):
    """
    Clean the dataframe by sorting and filtering based on date and weekdays.

    Parameters:
    df : pandas.DataFrame
        A dataframe containing at least 'date' and 'timestamp' columns.

    Returns:
    pandas.DataFrame
        A dataframe sorted by 'date' and 'timestamp', filtered to include only
        rows with dates >= '2020-12-01' and weekdays (Monday to Friday).
    """
        
    df_ordered = df.sort_values(by=['date', 'timestamp'], ascending = [True, True])
    df_final = df_ordered[df_ordered.date >= "2024-01-01"]
    df_final = df_final[df_final['date'].dt.weekday<5]
    return df_final

In [113]:
def get_best_values(volatility, df, number):
    """
    Filter the dataframe for the speeches with the highest volatility,
    calculating the 'length' as the number of rows for the same speech (identified by title),
    and ensuring each title appears only once. The volatility values are included in the result.

    Parameters:
    volatility : pandas.Series
        A series with dates as the index and volatility as the values.
    df : pandas.DataFrame
        The dataframe to filter, containing 'title' and 'date' columns.
    number : int
        The number of top volatility dates to select.

    Returns:
    pandas.DataFrame
        A filtered dataframe containing the top `number` speeches with the highest volatility,
        including a 'length' column, with each title appearing only once, and volatility in the last column.
    """
    # Sort the volatility in descending order
    volatility = volatility.sort_values(ascending=False)

    # Get the top `number` dates with the highest volatility
    top_dates = volatility.head(number).index.tolist()

    # Filter the dataframe for rows where 'date' is in the top_dates
    filtered_df = df[df['date'].isin(top_dates)]

    # Group by 'title' and calculate the length
    grouped_df = filtered_df.groupby('title').agg(
        date=('date', 'first'),  # Take the first date for each title
        length=('title', 'count')  # Count the number of rows for each title
    ).reset_index()

    # Add the volatility value by joining the `volatility` series with `grouped_df`
    grouped_df['volatility'] = grouped_df['date'].map(volatility)

    # Keep only the top `number` rows sorted by the corresponding volatility
    #grouped_df = grouped_df.loc[grouped_df['date'].isin(top_dates)].head(number)

    return grouped_df


In [71]:
def filter_price_df(df_price, df_speech):
    # Ensure the `timestamp` column in df_speech is in datetime format and remove timezone info
    df_speech['timestamp'] = pd.to_datetime(df_speech['timestamp']).dt.tz_localize(None)
    
    # Ensure the `datetime` column in df_price is in datetime format and remove timezone info
    df_price['datetime'] = pd.to_datetime(df_price['datetime']).dt.tz_localize(None)
    
    # Extract unique dates from df_speech
    datelist = df_speech['timestamp'].dt.date.unique().tolist()
    
    df_return = pd.DataFrame()
    
    for date in datelist:
        # Filter rows for the specific date in df_speech
        date_speech = df_speech.loc[df_speech['timestamp'].dt.date == date]
        min_timestamp = date_speech['timestamp'].min()
        max_timestamp = date_speech['timestamp'].max()
        
        # Filter rows for the specific date in df_price within the min and max timestamp range
        date_price = df_price.loc[
            (df_price['datetime'].dt.date == date) &
            (df_price['datetime'] >= min_timestamp) &
            (df_price['datetime'] <= max_timestamp)
        ]
        
        # Append filtered rows to the result
        df_return = pd.concat([df_return, date_price], ignore_index=True)

    return df_return


In [None]:
df_speech=clean_df(df_speech)

In [72]:
df_price = filter_price_df(df,df_speech)

In [104]:
volatility = volatility_calculator(df_price , 'close')

In [114]:
finaldf = get_best_values(volatility , df_speech , 10)

In [116]:
finaldf = finaldf.sort_values(by='volatility', ascending=False)
finaldf

Unnamed: 0,title,date,length,volatility
0,"Artificial Intelligence, Big Data, and the Pat...",2024-10-01,17,8.555683
8,Supporting Entrepreneurship & Small Businesses,2024-02-07,12,7.6354
13,The Outlook for the Economy and Monetary Policy,2024-02-07,21,7.6354
2,Challenges to the Community Banking Model,2024-10-11,17,7.582832
6,Recent Views on Monetary Policy and the Econom...,2024-09-26,18,6.768646
9,Supporting Market Resilience and Financial Sta...,2024-09-26,19,6.768646
14,What Will Artificial Intelligence Mean for Ame...,2024-09-26,24,6.768646
7,Remarks on the Economic Outlook and Financial ...,2024-08-20,22,6.432751
11,The Future of Stress Testing and the Stress Ca...,2024-09-10,30,6.265992
12,The Next Steps on Capital,2024-09-10,24,6.265992


In [87]:
def calculate_ret(df):
    """
    Adds a column to the dataframe that contains the percentage change
    of successive values of a price column, with the option to group by 'date'.
    
    Args:
    df (pd.DataFrame): The input dataframe with a price column.
    
    Returns:
    pd.DataFrame: The original dataframe with an additional column for percentage change.
    """
    # Group by 'date' and calculate the percentage change within each group
    df['percentage_change'] = df.groupby('date')['close'].pct_change()



In [88]:
def calculate_cum_ret(df):
    """
    Adds a column to the dataframe that contains the cumulative returns
    of a price column, with the option to group by 'date'.
    
    Args:
    df (pd.DataFrame): The input dataframe with a price column.
    
    Returns:
    pd.DataFrame: The original dataframe with an additional column for cumulative returns.
    """
    # Group by 'date' and calculate cumulative returns within each group
    df['cumulative_returns'] = (1 + df.groupby('date')['close'].pct_change()).cumprod()

In [89]:
calculate_ret(df_price)
calculate_cum_ret(df_price)

Now I try to calculate the top variance speeches by using the percentage change instead of the closing prices

In [117]:
volatility_pctchange = volatility_calculator(df_price, 'percentage_change')
dffinal_pct = get_best_values(volatility_pctchange, df_speech, 10)
dffinal_pct = dffinal_pct.sort_values(by='volatility', ascending = False)
dffinal_pct

Unnamed: 0,title,date,length,volatility
7,Statement by Governor Michelle W. Bowman,2024-09-20,3,0.001005
0,Brief Remarks on the Economy and Monetary Policy,2024-05-03,8,0.000964
2,"Hope, Promise, and Mentors",2024-04-01,7,0.000636
5,Risks and Uncertainty in Monetary Policy: Curr...,2024-04-05,33,0.000576
4,Recent Views on Monetary Policy and the Econom...,2024-09-24,18,0.000531
1,Economic Uncertainty and the Evolution of Mone...,2024-04-16,20,0.000498
3,Lessons from the American Economic Association...,2024-06-14,10,0.000479
8,"Tailoring, Fidelity to the Rule of Law, and Un...",2024-03-05,18,0.000475
6,Some Thoughts on r*: Why Did It Fall and Will...,2024-05-24,26,0.000473
9,The Future of Stress Testing and the Stress Ca...,2024-09-10,30,0.000465


We have different results !

In [136]:
def get_best_values_cumret(df_price, df, number):
    """
    Filter the dataframe for the top dates with the highest final value for cumulated return.

    Parameters:
    df_price : pandas.DataFrame
        A dataframe containing price values, including 'datetime' and 'cumulative_returns'.
    df : pandas.DataFrame
        The dataframe to filter, containing 'title' and 'timestamp' columns.
    number : int
        The number of top cumret dates to select.

    Returns:
    pandas.DataFrame
        A filtered dataframe containing rows with the top `number` cumulative return dates.
    """
    # Get the latest timestamp for each title
    latest_timestamps = df.groupby('title')['timestamp'].max()

    # Retrieve corresponding cumulative returns from df_price
    cumret_values = []
    for timestamp in latest_timestamps:
        matching_row = df_price[df_price['datetime'] == timestamp]
        if not matching_row.empty:
            cumret_values.append(matching_row['cumulative_returns'].iloc[0])
        else:
            cumret_values.append(float('-inf'))  # Handle missing timestamps

    # Combine titles and cumulative return values into a DataFrame
    cumret_df = pd.DataFrame({
        'title': latest_timestamps.index,
        'timestamp': latest_timestamps.values,
        'cumulative_returns': cumret_values
    })

    # Add a 'date' column based on the latest timestamp (just the date part)
    cumret_df['date'] = pd.to_datetime(cumret_df['timestamp']).dt.date

    # Calculate the length of the speech (number of entries for the same title)
    cumret_df['speech_length'] = df.groupby('title').size().loc[cumret_df['title']].values

    # Sort by cumulative_returns in descending order and select the top rows
    cumret_df = cumret_df.sort_values(by='cumulative_returns', ascending=False).head(number)

    return cumret_df


In [137]:
dffinal_cumret = get_best_values_cumret(df_price, df_speech, 10)

In [132]:
dffinal_cumret

Unnamed: 0,title,timestamp,cumulative_returns,date,speech_length
10,"Brief Remarks on the Economy, Monetary Policy,...",2024-06-27 10:21:00,1.014504,2024-06-27,51
36,Moving Toward Better Balance and Implications ...,2024-06-25 10:25:00,1.01432,2024-06-25,20
56,The Consequences of Bank Capital Reform,2024-06-26 10:18:00,1.014134,2024-06-26,19
42,Promoting an Inclusive Financial System,2024-07-09 10:10:00,1.013832,2024-07-09,11
15,Common Inflation and Monetary Policy Challenge...,2024-07-10 10:20:00,1.013687,2024-07-10,21
23,"Financial Inclusion: Past, Present, and Hopes ...",2024-07-09 10:29:00,1.013433,2024-07-09,19
41,Perspectives on U.S. Monetary Policy and Bank ...,2024-06-25 10:56:00,1.013393,2024-06-25,31
75,Welcoming Remarks,2024-06-24 10:05:00,1.012855,2024-06-24,6
48,Some Thoughts on r*: Why Did It Fall and Will...,2024-05-24 10:25:00,1.012479,2024-05-24,26
1,A New Class of Trailblazers,2024-06-07 10:11:00,1.012479,2024-06-07,12
