# Summary:

Goal: to make a corpus form the FOMC statements that are realsed after every meeting.

Steps:
1. Make a df of all the possible links.
2. Find the working links.
3. Filter to the statement links only by extracting the titles. ( There are 8 meeting a year, and I went back to 2010)
4. Find the text form the links and add them as df[Text]
5. Save the outcome.

( the reason I had step 1-3 is since I was dealing with unknown links and over 5k options I needed a method that wont take to long on my laptop)


Note: Total word count in my corpus's text column is 54,808

# Making a df of all links possible: 

In [2]:
# I had a base link and I added dates on the end to get all the possible links.

def generate_links(start_year, end_year):
    base_url = "https://www.federalreserve.gov/newsevents/pressreleases/monetary{}a.htm"
    links = []

    current_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)

    while current_date <= end_date:
        formatted_date = current_date.strftime("%Y%m%d")
        link = base_url.format(formatted_date)
        links.append(link)
        current_date += timedelta(days=1)

    return links

# Example: Generate links for the year range 2010 to 2024
start_year = 2010
end_year = 2023
links = generate_links(start_year, end_year)

# Create a DataFrame to store the links
df = pd.DataFrame({'Links': links})

pd.set_option('display.max_colwidth', None)
print(df)

                                                                              Links
0     https://www.federalreserve.gov/newsevents/pressreleases/monetary20100101a.htm
1     https://www.federalreserve.gov/newsevents/pressreleases/monetary20100102a.htm
2     https://www.federalreserve.gov/newsevents/pressreleases/monetary20100103a.htm
3     https://www.federalreserve.gov/newsevents/pressreleases/monetary20100104a.htm
4     https://www.federalreserve.gov/newsevents/pressreleases/monetary20100105a.htm
...                                                                             ...
5108  https://www.federalreserve.gov/newsevents/pressreleases/monetary20231227a.htm
5109  https://www.federalreserve.gov/newsevents/pressreleases/monetary20231228a.htm
5110  https://www.federalreserve.gov/newsevents/pressreleases/monetary20231229a.htm
5111  https://www.federalreserve.gov/newsevents/pressreleases/monetary20231230a.htm
5112  https://www.federalreserve.gov/newsevents/pressreleases/monetary202312

# Finding the working ones:

In [4]:
# Function to check if the link works, meaning the FOMC issued some sort of statement that day.

def check_link(link):
    try:
        response = requests.get(link)
        return response.status_code == 200
    except:
        return False

# Function to apply multithreading for URL checking
def batch_test_links(links):
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(check_link, links))
    return results

def save_dataframe_to_unique_csv(df, filename):
    base_filename, extension = os.path.splitext(filename)
    counter = 1

    while os.path.exists(filename):
        filename = f"{base_filename}_{counter}{extension}"
        counter += 1

    df.to_csv(filename, index=False)

# Batch test the links using multithreading and create a new column 'Link Works'
df['Link Works'] = batch_test_links(df['Links'])


In [10]:
# see how many results we got: >8per year, means we will need to filter some out.

df_with_status = df
count_true = df_with_status['Link Works'].sum()
print('Number of links that work:', count_true)
working_links_df = df_with_status[df_with_status['Link Works']==True]
df = working_links_df
df

Number of links that work: 553


Unnamed: 0,Links,Link Works
5,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100106a.htm,True
10,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100111a.htm,True
11,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100112a.htm,True
26,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100127a.htm,True
38,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100208a.htm,True
...,...,...
5037,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231017a.htm,True
5052,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231101a.htm,True
5072,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231121a.htm,True
5079,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231128a.htm,True


# add titles from websites:

In [11]:

# Function to get title from a given URL
def get_title(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.title.text
    except Exception as e:
        return f"Error: {e}"

# Number of threads to use (adjust as needed)
num_threads = 8

# Use ThreadPoolExecutor to parallelize the process
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Map each URL to its corresponding title asynchronously
    titles = list(executor.map(get_title, df['Links']))

# Add the titles to the DataFrame
df['Title'] = titles

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = titles


Unnamed: 0,Links,Link Works,Title
5,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100106a.htm,True,"Federal Reserve Board - Minutes of Federal Open Market Committee, December 15-16, 2009"
10,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100111a.htm,True,Federal Reserve Board - Federal Reserve offers $75 billion in 28-day credit through its Term Auction Facility
11,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100112a.htm,True,"Federal Reserve Board - Federal Reserve announces results of auction of $75 billion in 28-day credit held on January 11, 2010"
26,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100127a.htm,True,Federal Reserve Board - FOMC statement
38,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100208a.htm,True,Federal Reserve Board - Federal Reserve offers $50 billion in 28-day credit through its Term Auction Facility
...,...,...,...
5037,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231017a.htm,True,"Federal Reserve Board - Minutes of the Board's discount rate meetings from August 21 and September 20, 2023"
5052,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231101a.htm,True,Federal Reserve Board - Federal Reserve issues FOMC statement
5072,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231121a.htm,True,"Federal Reserve Board - Minutes of the Federal Open Market Committee, October 31-November 1, 2023"
5079,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231128a.htm,True,"Federal Reserve Board - Minutes of the Board's discount rate meetings from October 23 through November 1, 2023"


In [12]:
# filter to select links:

filtered_df = df[(df['Title'] == 'Federal Reserve Board - Federal Reserve issues FOMC statement') |
                  (df['Title'] == 'Federal Reserve Board - FOMC statement')]

print('Total:', filtered_df['Links'].count())
print('expected_total:', 8*14) # 8 a year....  I have 14years.


Total: 111
expected_total: 112


In [13]:
# Extract date from the Links column
filtered_df['Date'] = filtered_df['Links'].str.extract(r'monetary(\d{8})a.htm')

# Use loc to avoid SettingWithCopyWarning
filtered_df.loc[:, 'Date'] = pd.to_datetime(filtered_df['Date'], format='%Y%m%d').dt.strftime('%m/%d/%Y')

# Print the updated DataFrame
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Date'] = filtered_df['Links'].str.extract(r'monetary(\d{8})a.htm')


Unnamed: 0,Links,Link Works,Title,Date
26,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100127a.htm,True,Federal Reserve Board - FOMC statement,01/27/2010
74,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100316a.htm,True,Federal Reserve Board - FOMC statement,03/16/2010
117,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100428a.htm,True,Federal Reserve Board - FOMC statement,04/28/2010
173,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100623a.htm,True,Federal Reserve Board - FOMC statement,06/23/2010
221,https://www.federalreserve.gov/newsevents/pressreleases/monetary20100810a.htm,True,Federal Reserve Board - FOMC statement,08/10/2010
...,...,...,...,...
4912,https://www.federalreserve.gov/newsevents/pressreleases/monetary20230614a.htm,True,Federal Reserve Board - Federal Reserve issues FOMC statement,06/14/2023
4954,https://www.federalreserve.gov/newsevents/pressreleases/monetary20230726a.htm,True,Federal Reserve Board - Federal Reserve issues FOMC statement,07/26/2023
5010,https://www.federalreserve.gov/newsevents/pressreleases/monetary20230920a.htm,True,Federal Reserve Board - Federal Reserve issues FOMC statement,09/20/2023
5052,https://www.federalreserve.gov/newsevents/pressreleases/monetary20231101a.htm,True,Federal Reserve Board - Federal Reserve issues FOMC statement,11/01/2023


In [17]:
# Reorder the columns as per your requirement
desired_column_order = ['Date','Title','Links', 'Link Works']

# Use loc to reorder the columns
filtered_df = filtered_df.loc[:, desired_column_order]


# Making a corpus: 

In [18]:
# renaming the df
df = filtered_df

def get_description_by_class(url, target_class):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract content within the specified class
        description = soup.find('div', class_=target_class).get_text(strip=True)

        return description
    except Exception as e:
        return f"Error: {e}"

def process_row(row):
    index, data = row
    url = data['Links']  # Access the 'Links' column using the correct indexing
    target_class = 'col-xs-12 col-sm-8 col-md-8'
    description = get_description_by_class(url, target_class)
    return description

# Create a new column 'Text' using multithreading
with ThreadPoolExecutor() as executor:
    df['Text'] = list(executor.map(process_row, df.iterrows()))


In [24]:
df.columns

Index(['Date', 'Title', 'Links', 'Link Works', 'Text'], dtype='object')

In [25]:
#drop columns that arent needed:
columns_to_drop = ['Link Works','Title']
df = df.drop(columns=columns_to_drop)


In [29]:
pd.reset_option('display.max_colwidth')
df

Unnamed: 0,Date,Links,Text
26,01/27/2010,https://www.federalreserve.gov/newsevents/pres...,Information received since the Federal Open Ma...
74,03/16/2010,https://www.federalreserve.gov/newsevents/pres...,Information received since the Federal Open Ma...
117,04/28/2010,https://www.federalreserve.gov/newsevents/pres...,Information received since the Federal Open Ma...
173,06/23/2010,https://www.federalreserve.gov/newsevents/pres...,Information received since the Federal Open Ma...
221,08/10/2010,https://www.federalreserve.gov/newsevents/pres...,Information received since the Federal Open Ma...
...,...,...,...
4912,06/14/2023,https://www.federalreserve.gov/newsevents/pres...,Recent indicators suggest that economic activi...
4954,07/26/2023,https://www.federalreserve.gov/newsevents/pres...,Recent indicators suggest that economic activi...
5010,09/20/2023,https://www.federalreserve.gov/newsevents/pres...,Recent indicators suggest that economic activi...
5052,11/01/2023,https://www.federalreserve.gov/newsevents/pres...,Recent indicators suggest that economic activi...


In [31]:
# saving the df:
def save_dataframe_to_unique_csv(df, filename):
    base_filename, extension = os.path.splitext(filename)
    counter = 1

    while os.path.exists(filename):
        filename = f"{base_filename}_{counter}{extension}"
        counter += 1

    df.to_csv(filename, index=False)


filename = 'Corpus.csv'
save_dataframe_to_unique_csv(df, filename)


In [32]:
# Getting the amount of words in my corpus:
total_word_count = len(' '.join(df['Text']).split())

print(f'Total word count in the "text" column: {total_word_count}')


Total word count in the "text" column: 54808
