In [None]:
import pandas as pd
from helpermodules import memory_handling as mh
import numpy as np
from datetime import timedelta, datetime
import time

In [None]:
# downloading the pickle file 
file = "fedspeeches_preprocessed.pkl"
helper = mh.PickleHelper.pickle_load(file)
df = helper.obj

In [None]:
from twelvedata import TDClient
td = TDClient(apikey="ce7c5d1412ff42d1a2ca90e47bd3105f")

In [None]:
# choosing only the assigned speakers for the analysis
speaker_list_riccardo = [ 'Vice Chair Janet L. Yellen', 'Governor Sarah Bloom Raskin', 'Governor Jeremy C. Stein', 'Governor Jerome H. Powell', 'Chairman  Ben S. Bernanke', 'Governor Lael Brainard', 'Chair Janet L. Yellen', 'Vice Chairman Stanley Fischer', 'Vice Chairman for Supervision Randal K. Quarles', 'Chairman Jerome H. Powell', 'Vice Chairman Richard H. Clarida', 'Chair Jerome H. Powell', 'Vice Chair Richard H. Clarida', 'Vice Chair for Supervision Randal K. Quarles']
speaker_list = ['Governor Michelle W. Bowman', 'Vice Chair for Supervision and Chair of the Financial Stability Board Randal K. Quarles', 'Vice Chairman for Supervision and Chair of the Financial Stability Board Randal K. Quarles', 'Governor Christopher J. Waller', 'Governor Randal K. Quarles', 'Vice Chair for Supervision Michael S. Barr', 'Governor Lisa D. Cook', 'Vice Chair Lael Brainard', 'Governor Philip N. Jefferson', 'Chair Pro Tempore Jerome H. Powell', 'Vice Chair Philip N. Jefferson', 'Governor Adriana D. Kugler']
df = df[df['speaker'].isin(speaker_list)]

In [None]:
def data_retrieve_minute(df, ticker):
    """
    Retrieve minute-level time series data for a specified ticker using the Twelve Data API.

    Parameters:
    df : pandas.DataFrame
        A dataframe containing at least two columns:
        - 'date': Dates for which data needs to be retrieved.
        - 'timestamp': Timestamps corresponding to the data entries.
    ticker : str
        The symbol of the financial instrument to retrieve data for.

    Returns:
    dffinal : pandas.DataFrame
        A concatenated DataFrame containing minute-level data for the specified ticker
        across all the unique dates in the provided dataframe `df`. The DataFrame will have:
        - 'date' as the index
        - A 'timestamp' column representing the minute-level time series.
    """
    
    timelist = df.date.unique().tolist()
    count = 0
    dffinal = pd.DataFrame()

    for date in timelist:
        print(f"Processing date: {date}")
        start = df[df['date'] == date].timestamp.min() - pd.Timedelta(minutes=15)
        end = df[df['date'] == date].timestamp.max() + pd.Timedelta(minutes=15)

        if count % 8 == 0 and count > 0:
            time.sleep(60)

        try:
            etf = td.time_series(
                symbol=ticker,
                interval="1min",
                start_date=start,
                end_date=end,
                outputsize=5000    
            ).as_pandas()
            
            # If the API call returns data, concatenate it to the final DataFrame
            if not etf.empty:
                # Add a 'date' column to the ETF DataFrame to use as an index later
                etf['date'] = date
                dffinal = pd.concat([dffinal, etf], ignore_index=False)
            else:
                print(f"No data available for {date}")

        except Exception as e:
            print(f"Error retrieving data for {date}: {e}")

        count += 1
    
    # Set the 'date' as the index and move 'timestamp' into a separate column
    dffinal.reset_index(inplace=True)
    dffinal.rename(columns={'index': 'timestamp'}, inplace=True)
    dffinal.set_index('date', inplace=True)
    
    return dffinal


In [None]:
def volatility_calculator(df):
    """
    Calculate daily volatility for the 'open' column in the dataframe.

    Parameters:
    df : pandas.DataFrame
        A dataframe containing at least 'date' and 'open' columns.

    Returns:
    pandas.Series
        A series with dates as the index and the standard deviation
        of 'open' values (volatility) as the values.
    """

    volatility_series = df.groupby('date')['open'].std()
    return volatility_series

In [None]:
def clean_df(df):
    """
    Clean the dataframe by sorting and filtering based on date and weekdays.

    Parameters:
    df : pandas.DataFrame
        A dataframe containing at least 'date' and 'timestamp' columns.

    Returns:
    pandas.DataFrame
        A dataframe sorted by 'date' and 'timestamp', filtered to include only
        rows with dates >= '2020-12-01' and weekdays (Monday to Friday).
    """
        
    df_ordered = df.sort_values(by=['date', 'timestamp'], ascending = [True, True])
    df_final = df_ordered[df_ordered.date >= "2024-01-01"]
    df_final = df_final[df_final['date'].dt.weekday<5]
    return df_final

In [None]:
def get_best_values(volatility, df, number):
    """
    Filter the dataframe for the top dates with the highest volatility.

    Parameters:
    volatility : pandas.Series
        A series with dates as the index and volatility as the values.
    df : pandas.DataFrame
        The dataframe to filter, containing a 'date' column.
    number : int
        The number of top volatility dates to select.

    Returns:
    pandas.DataFrame
        A filtered dataframe containing rows with dates matching the top 
        `number` volatility values.
    """
    
    # Sort the volatility in descending order
    volatility = volatility.sort_values(ascending=False)

    # Get the top `number` dates with the highest volatility
    top_dates = volatility.head(number).index.tolist()

    # Filter the dataframe for rows where 'date' is in the top_dates
    filtered_df = df[df['date'].isin(top_dates)]

    return filtered_df


In [None]:
df=clean_df(df)

In [None]:
# little trial 
df2 = df[:100]

In [41]:
#estimated time 30 mins
df_price=data_retrieve_minute(df2,'SPY')
#picklehelper = mh.PickleHelper(df_price)
#picklehelper.pickle_dump("fabio2020onwardpricemovements")

Processing date: 2024-01-08 00:00:00
Processing date: 2024-01-16 00:00:00
Processing date: 2024-01-17 00:00:00
Processing date: 2024-02-02 00:00:00


In [None]:
#df_price= pd.read_csv('/Users/baudotedua/Dropbox/Mac/Documents/GitHub/cb-impact-nlps/US SPX 500 (Mini) 1 Minute (1).csv')

In [None]:
'''# Step 1: Combine date and time columns
df_price['datetime'] = pd.to_datetime(df_price['<Date>'] + ' ' + df_price.iloc[:,1], format='%d/%m/%Y %H:%M:%S')

# Step 2: Set the new 'datetime' column as the index
df_price.set_index('datetime', inplace=True)

df_price = df_price.drop(df_price.columns[[0,1,2,3,4,6,7,8,9,10,11]], axis=1)'''

In [None]:
volatility=volatility_calculator(df_price)
newdf= get_best_values(volatility,df,5)

In [43]:
newdf

Unnamed: 0,date,speaker,title,link,text,timestamp,text_by_minute
24286,2024-01-08,Governor Michelle W. Bowman,New Year’s Resolutions for Bank Regulatory Pol...,/newsevents/speech/bowman20240108a.htm,It is a pleasure to join you this afternoon fo...,2024-01-08 10:00:00-05:00,It is a pleasure to join you this afternoon fo...
24287,2024-01-08,Governor Michelle W. Bowman,New Year’s Resolutions for Bank Regulatory Pol...,/newsevents/speech/bowman20240108a.htm,It is a pleasure to join you this afternoon fo...,2024-01-08 10:01:00-05:00,offer my thoughts on the economy and monetary ...
24288,2024-01-08,Governor Michelle W. Bowman,New Year’s Resolutions for Bank Regulatory Pol...,/newsevents/speech/bowman20240108a.htm,It is a pleasure to join you this afternoon fo...,2024-01-08 10:02:00-05:00,"year, which may be a sign that labor market su..."
24289,2024-01-08,Governor Michelle W. Bowman,New Year’s Resolutions for Bank Regulatory Pol...,/newsevents/speech/bowman20240108a.htm,It is a pleasure to join you this afternoon fo...,2024-01-08 10:03:00-05:00,to prevent policy from becoming overly restric...
24290,2024-01-08,Governor Michelle W. Bowman,New Year’s Resolutions for Bank Regulatory Pol...,/newsevents/speech/bowman20240108a.htm,It is a pleasure to join you this afternoon fo...,2024-01-08 10:04:00-05:00,"outlook, I will continue to watch the data clo..."
...,...,...,...,...,...,...,...
24231,2024-02-02,Governor Michelle W. Bowman,The Future of Banking,/newsevents/speech/bowman20240202a.htm,Thank you for the invitation to join you today...,2024-02-02 10:20:00-05:00,risksacan often be more efficient and effectiv...
24232,2024-02-02,Governor Michelle W. Bowman,The Future of Banking,/newsevents/speech/bowman20240202a.htm,Thank you for the invitation to join you today...,2024-02-02 10:21:00-05:00,"a new, stark line at $100 billion in assets? W..."
24233,2024-02-02,Governor Michelle W. Bowman,The Future of Banking,/newsevents/speech/bowman20240202a.htm,Thank you for the invitation to join you today...,2024-02-02 10:22:00-05:00,an integral part of the larger financial syste...
24234,2024-02-02,Governor Michelle W. Bowman,The Future of Banking,/newsevents/speech/bowman20240202a.htm,Thank you for the invitation to join you today...,2024-02-02 10:23:00-05:00,the products and services they offer.While reg...


I have to create a function to calculate the cumulated returns of a dataframe and the returns -> add them to a column

In [56]:
def calculate_ret(df):
    """
    Adds a column to the dataframe that contains the percentage change
    of successive values of a price column, with the option to group by 'date'.
    
    Args:
    df (pd.DataFrame): The input dataframe with a price column.
    
    Returns:
    pd.DataFrame: The original dataframe with an additional column for percentage change.
    """
    # Group by 'date' and calculate the percentage change within each group
    df['percentage_change'] = df.groupby('date')['close'].pct_change()



In [58]:
def calculate_cum_ret(df):
    """
    Adds a column to the dataframe that contains the cumulative returns
    of a price column, with the option to group by 'date'.
    
    Args:
    df (pd.DataFrame): The input dataframe with a price column.
    
    Returns:
    pd.DataFrame: The original dataframe with an additional column for cumulative returns.
    """
    # Group by 'date' and calculate cumulative returns within each group
    df['cumulative_returns'] = (1 + df.groupby('date')['close'].pct_change()).cumprod()

In [None]:
df_price = df_price.sort_values(by=['date', 'datetime'], ascending = [True, True])
calculate_ret(df_price)
calculate_cum_ret(df_price)

In [None]:
df_price

Unnamed: 0_level_0,datetime,open,high,low,close,volume,percentage_change,cumulative_returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-01-08,2024-01-08 09:45:00,469.17999,469.19501,468.89001,469.17999,232183,,
2024-01-08,2024-01-08 09:46:00,469.18011,469.22000,469.04999,469.12000,174765,-0.000128,0.999872
2024-01-08,2024-01-08 09:47:00,469.13000,469.31000,469.10001,469.25000,123268,0.000277,1.000149
2024-01-08,2024-01-08 09:48:00,469.17999,469.35001,469.17999,469.17999,134429,-0.000149,1.000000
2024-01-08,2024-01-08 09:49:00,469.17999,469.29001,469.10999,469.12689,126757,-0.000113,0.999887
...,...,...,...,...,...,...,...,...
2024-02-02,2024-02-02 10:31:00,490.95001,491.01749,490.54999,490.59879,199997,-0.000715,1.007961
2024-02-02,2024-02-02 10:32:00,490.57999,490.60001,490.34750,490.39999,653217,-0.000405,1.007553
2024-02-02,2024-02-02 10:33:00,490.39999,490.54999,490.34000,490.45001,126996,0.000102,1.007656
2024-02-02,2024-02-02 10:34:00,490.45499,490.89499,490.45001,490.85999,236558,0.000836,1.008498
