In [419]:
# import functions
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from http.client import HTTPSConnection
import pickle
from urllib.request import urlopen
import requests
import os
from datetime import datetime
import re


In [420]:
# Calendar page to scrape, identified by its year and month name.
# Working assumption: a month_list and a year_list holding every value we
# need will later be built outside the function, and the whole pipeline will
# become a single function taking the month and the year as arguments, which
# keeps the code clearer and easier to write.
year = 2024
month = 'january'

In [421]:
# Fixed pieces of the address, plus the "<year>-<month>" page name built from
# the `year` and `month` values chosen earlier in the notebook.
host = 'www.federalreserve.gov'
prefix = '/newsevents/'
suffix = '.htm'
mid_str = "{}-{}".format(year, month)
url = f"https://{host}{prefix}{mid_str}{suffix}"

In [None]:
def breakdown_html(url):
    """
    Download a calendar page and collect the raw "Speeches" entries.

    The page is fetched with `requests` and parsed with BeautifulSoup. For
    every `<div class="row cal-nojs__rowTitle">` section whose <h4> text is
    exactly "Speeches", the following sibling `div.row` elements are visited
    until a row containing another `<h4 class="col-md-12">` header is found.

    Args:
        url (str): Address of the calendar page to scrape.

    Returns:
        tuple: `(titles_list, dates_list, times_list)` where
            - titles_list holds the `div.col-xs-7` tags, still unparsed,
            - dates_list holds the stripped text of the `div.col-xs-3` cells,
            - times_list holds the stripped text of the `div.col-xs-2` cells.
        All three lists are empty when the server does not answer with
        HTTP 200.
    """
    # Create the result lists up front so the final `return` is valid on every
    # path; previously a non-200 response reached `return` with these names
    # unbound and raised UnboundLocalError.
    titles_list = []
    times_list = []
    dates_list = []

    headers = {'User-Agent': 'Mozilla/5.0'}
    # The timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch the page: {response.status_code}")
        return titles_list, dates_list, times_list

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all "row cal-nojs__rowTitle" sections
    sections = soup.find_all('div', class_='row cal-nojs__rowTitle')
    print('Found sections:', sections)

    for section in sections:
        # Find the <h4> header within the section
        header = section.find('h4', class_='col-md-12')
        if not header:
            continue

        header_text = header.get_text(strip=True)
        print('Header text:', header_text)

        # Extract data only for "Speeches"
        if header_text != "Speeches":
            continue
        print(f"Extracting data for {header_text}...")

        # Walk the following `div.row` siblings until the next section header
        current = section.find_next_sibling('div', class_='row')
        while current and not current.find('h4', class_='col-md-12'):
            title_cell = current.find('div', class_='col-xs-7')
            time_cell = current.find('div', class_='col-xs-2')
            date_cell = current.find('div', class_='col-xs-3')

            # Each list is filled independently, only when its cell exists
            if title_cell:
                titles_list.append(title_cell)
            if time_cell:
                times_list.append(time_cell.get_text(strip=True))
            if date_cell:
                dates_list.append(date_cell.get_text(strip=True))

            current = current.find_next_sibling('div', class_='row')

    # NOTE(review): the first collected time and date are discarded so that
    # the lists line up with the titles. Presumably the first row after the
    # header has date/time cells but no title cell -- TODO confirm against the
    # page markup. This is fragile: any other row missing one of its cells
    # would silently misalign the three lists.
    times_list = times_list[1:]
    dates_list = dates_list[1:]
    print("Titles List:", titles_list)
    print("Times List:", times_list)
    print("Dates List:", dates_list)

    return titles_list, dates_list, times_list


Found sections: [<div class="row cal-nojs__rowTitle">
<h4 class="col-md-12">Announcements </h4>
</div>, <div class="row cal-nojs__rowTitle">
<h4 class="col-md-12">Speeches </h4>
</div>, <div class="row cal-nojs__rowTitle">
<h4 class="col-md-12">FOMC Meetings </h4>
</div>, <div class="row cal-nojs__rowTitle">
<h4 class="col-md-12">Beige Book </h4>
</div>, <div class="row cal-nojs__rowTitle">
<h4 class="col-md-12">Statistical Releases </h4>
</div>, <div class="row cal-nojs__rowTitle">
<h4 class="col-md-12">Other </h4>
</div>]
Header text: Announcements
Header text: Speeches
Extracting data for Speeches...
Header text: FOMC Meetings
Header text: Beige Book
Header text: Statistical Releases
Header text: Other
Titles List: [<div class="col-xs-7">
<p>Discussion -- Vice Chair for Supervision Michael S. Barr</p>
<p><a aria-hidden="true" class="watchLive" href="https://us02web.zoom.us/j/82535112593" title="Watch Live"><span class="icon-video icon icon__sm"></span><span class="icon-label">Watch 

In [423]:
# Run the scraper in this cell so that titles_list, dates_list and times_list
# exist after a fresh "Restart Kernel -> Run All"; no other cell assigns them.
titles_list, dates_list, times_list = breakdown_html(url)
times_list

['1:00 p.m.', '9:00 a.m.', '9:00 a.m.', '11:00 a.m.']

In [424]:
def handle_titles(titles_list):
    """
    Split scraped title cells into speaker names and speech titles.

    For each tag the speaker comes from its first <p>: when the text contains
    "--", the piece right after the first "--" (up to the next "--", if any)
    is kept, stripped; otherwise the whole stripped text is used. The speech
    title is the text of the <em> inside <p class="calendar__title">.

    Args:
        titles_list (list): Tags exposing `find` (each found node exposing
            `get_text`), one per calendar entry.

    Returns:
        tuple: `(speaker_names, calendar_titles)`, two lists with one item per
        input tag; an item is None when the corresponding element is missing.
    """
    speaker_names = []
    calendar_titles = []

    for cell in titles_list:
        # Speaker: first paragraph, e.g. "Discussion -- Speaker Name"
        first_paragraph = cell.find('p')
        if not first_paragraph:
            speaker_names.append(None)
        else:
            raw_text = first_paragraph.get_text(strip=True)
            if '--' in raw_text:
                speaker_names.append(raw_text.split('--')[1].strip())
            else:
                speaker_names.append(raw_text)

        # Speech title: the <em> nested in the "calendar__title" paragraph
        title_paragraph = cell.find('p', class_='calendar__title')
        emphasis = title_paragraph.find('em') if title_paragraph else None
        calendar_titles.append(emphasis.get_text(strip=True) if emphasis else None)

    return speaker_names, calendar_titles


In [425]:
from datetime import datetime

def time_handling(times_list):
    """
    Normalise 12-hour clock strings to 24-hour 'H:M:S' strings.

    Args:
        times_list (list): Time strings such as '1:00 p.m.' or '09:00 am'.

    Returns:
        list: The same times formatted as 'HH:MM:SS', in input order.

    Raises:
        ValueError: When an entry does not match '%I:%M %p' once its periods
            are removed; a diagnostic line is printed before re-raising.
    """
    def _to_24h(raw):
        try:
            # 'p.m.' -> 'pm' so that the %p directive can match it
            without_periods = raw.replace('.', '').strip()
            return datetime.strptime(without_periods, '%I:%M %p').strftime('%H:%M:%S')
        except ValueError:
            print(f"Error parsing time string: {raw}. Ensure it follows formats like '1:00 p.m.' or '09:00 am'.")
            raise

    return [_to_24h(entry) for entry in times_list]


In [426]:
from datetime import datetime, date

def handle_dates(days_list, month, year):
    """
    Build datetime.date objects from day numbers plus a month name and a year.

    Args:
        days_list (list): Days of the month, as numbers or numeric strings.
        month (str): Full month name (e.g., 'January', 'February').
        year (int): Year as a number (e.g., 2024).

    Returns:
        list: One datetime.date per entry of `days_list`, in the same order.

    Raises:
        ValueError: For an unknown month name, for a day that cannot be
            converted with int(), or for a day/month/year combination that is
            not a real date (a diagnostic line is printed in that last case).
    """
    # Month name -> month number
    try:
        month_number = datetime.strptime(month, '%B').month
    except ValueError:
        raise ValueError(f"Invalid month name: '{month}'. Use the full month name (e.g., 'January').")

    # Every day must be convertible to an integer before any date is built
    try:
        day_numbers = list(map(int, days_list))
    except ValueError:
        raise ValueError("All elements in days_list must be integers or convertible to integers.")

    def _build(day):
        try:
            return date(year, month_number, day)
        except ValueError:
            print(f"Invalid date: Year={year}, Month={month_number}, Day={day}.")
            raise

    return [_build(day) for day in day_numbers]


In [427]:
def create_dataframe(titles_list, dates_list, times_list, month=None, year=None):
    """
    Assemble the scraped speech data into a DataFrame.

    Args:
        titles_list (list): Title tags, passed to `handle_titles`.
        dates_list (list): Day-of-month values, passed to `handle_dates`.
        times_list (list): Time strings, passed to `time_handling`.
        month (str, optional): Month name forwarded to `handle_dates`. When
            omitted, the notebook-level `month` variable is used, which is
            what this function always did before the parameter existed.
        year (int, optional): Year forwarded to `handle_dates`. When omitted,
            the notebook-level `year` variable is used.

    Returns:
        pandas.DataFrame: Columns 'date', 'speaker', 'title', 'timestamp',
        built from the values returned by the three helpers.
    """
    # Explicit arguments win; otherwise fall back to the notebook globals so
    # that existing three-argument calls keep working unchanged.
    if month is None:
        month = globals()['month']
    if year is None:
        year = globals()['year']

    speaker_names, speech_titles = handle_titles(titles_list)
    times = time_handling(times_list)
    dates = handle_dates(dates_list, month, year)

    return pd.DataFrame(
        {'date': dates, 'speaker': speaker_names, 'title': speech_titles, 'timestamp': times}
    )

    
    

In [428]:
# Build the speeches table from the three scraped lists.
final_df = create_dataframe(
    titles_list=titles_list,
    dates_list=dates_list,
    times_list=times_list,
)

In [429]:
# Show the table returned by create_dataframe (last expression of the cell).
final_df

Unnamed: 0,date,speaker,title,timestamp
0,2024-01-19,Vice Chair for Supervision Michael S. Barr,Bank Regulation,13:00:00
1,2024-01-17,Governor Michelle W. Bowman,The Path Forward for Bank Capital Reform,09:00:00
2,2024-01-17,Vice Chair for Supervision Michael S. Barr,Cyber Risk,09:00:00
3,2024-01-16,Governor Christopher J. Waller,Economic Outlook,11:00:00


In [430]:
import pytz
import pandas as pd
from datetime import datetime

def add_timezone(df):
    """
    Rewrite the 'timestamp' column as US/Eastern times carrying a UTC offset.

    Each row's 'date' and 'timestamp' values are combined into one datetime,
    localized to US/Eastern with pytz (so the offset depends on the date),
    and stored back as an 'HH:MM:SS+HH:MM' / 'HH:MM:SS-HH:MM' string.

    Args:
        df (pandas.DataFrame): Frame with a 'date' column holding anything
            `pd.to_datetime` can parse without a format hint, and a
            'timestamp' column of 'HH:MM:SS' strings.

    Returns:
        pandas.DataFrame: The same frame object; its 'timestamp' column is
        overwritten in place.

    Raises:
        ValueError: When a 'timestamp' value does not match '%H:%M:%S' (for
            instance when the frame has already been processed once).
        Exception: Whatever `localize(..., is_dst=None)` raises for a wall
            time it cannot resolve unambiguously.
    """
    # Define the Eastern Time zone
    eastern = pytz.timezone('US/Eastern')

    def _localized_time_string(day_value, time_value):
        # No format hint for the date: the former '%y:%m:%d' pattern does not
        # describe ISO dates and made string dates such as '2024-01-19' fail.
        day = pd.to_datetime(day_value).date()
        clock = pd.to_datetime(time_value, format='%H:%M:%S').time()

        # Both parts are naive, so the combined datetime is naive as well
        naive = datetime.combine(day, clock)

        # is_dst=None asks pytz to refuse guessing around DST transitions
        localized = eastern.localize(naive, is_dst=None)

        # Keep only the time and offset. isoformat() writes the offset with a
        # colon on every Python 3 version, whereas the '%:z' strftime
        # directive is only available from Python 3.12.
        return localized.isoformat(timespec='seconds').split('T', 1)[1]

    # A plain list (rather than df.apply(axis=1)) also works for an empty frame
    df['timestamp'] = [
        _localized_time_string(day_value, time_value)
        for day_value, time_value in zip(df['date'], df['timestamp'])
    ]

    return df

In [431]:
def remove_time_from_datetime(df):
    """
    Reduce the 'date' column to midnight-normalised datetimes.

    The column is parsed with `pd.to_datetime(errors='coerce')`, so values
    that cannot be parsed become NaT, and the time-of-day is then reset with
    `.dt.normalize()`. The frame is modified in place and also returned.
    """
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    df['date'] = parsed_dates.dt.normalize()
    return df

In [432]:
# Work on a copy: remove_time_from_datetime modifies its argument in place,
# so passing final_df itself would also alter the table shown earlier and
# make ultimate_df a second name for the very same object.
ultimate_df = remove_time_from_datetime(final_df.copy())

In [433]:
# Rebuild from final_df rather than re-feeding ultimate_df: add_timezone
# overwrites 'timestamp' in place and cannot parse its own output, so
# `ultimate_df = add_timezone(ultimate_df)` fails when the cell is run twice.
ultimate_df = add_timezone(remove_time_from_datetime(final_df.copy()))


In [434]:
# Show the table after date normalisation and time-zone tagging.
ultimate_df

Unnamed: 0,date,speaker,title,timestamp
0,2024-01-19,Vice Chair for Supervision Michael S. Barr,Bank Regulation,13:00:00-05:00
1,2024-01-17,Governor Michelle W. Bowman,The Path Forward for Bank Capital Reform,09:00:00-05:00
2,2024-01-17,Vice Chair for Supervision Michael S. Barr,Cyber Risk,09:00:00-05:00
3,2024-01-16,Governor Christopher J. Waller,Economic Outlook,11:00:00-05:00


------

In [None]:
for year_month_str in year_month_str_list:

    # Go through every element that was found
    for div in date_elements:
        # The day is the text of the element's <p>, when there is one
        paragraph = div.find('p')
        day_str = paragraph.text.strip() if paragraph else None

        # Nothing to do when no day could be read
        if not day_str:
            continue

        try:
            # Join the day with the "<year>-<month>" string
            full_date_str = f"{year_month_str}-{day_str}"

            # Parse it (full month name expected), then reformat as YYYY-MM-DD
            full_date_obj = datetime.strptime(full_date_str, "%Y-%B-%d")
            formatted_date = full_date_obj.strftime("%Y-%m-%d")
            print(f"Data Valida: {formatted_date}")
        except ValueError:
            # Entries that do not form a valid date are skipped on purpose
            continue

