# Text Extraction
- Reads the PDFs associated with FOMC, ECB, and BoE statements.
- Outputs an Excel file with three sheets: 'fomc', 'ecb', 'boe'.
- Each sheet contains FK score, DC score, word count, text, lemmatized text.

In [42]:
import numpy as np
import pandas as pd
#import textract
#import pdfkit
import os
import PyPDF2
import re
import textstat
import spacy
from datetime import datetime
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import openpyxl
import statistics
from IPython.core.display import display, HTML
import random
pd.set_option('display.max_rows', 10)

  from IPython.core.display import display, HTML


In [43]:
# Load the spaCy 'en_core_web_sm' pipeline once; lemmatize_text() uses it for every sample
nlp = spacy.load('en_core_web_sm')

In [44]:
# Flesch-Kincaid readability
def calc_fk(text):
    """Return textstat's Flesch-Kincaid grade for `text`."""
    grade = textstat.flesch_kincaid_grade(text)
    return grade

# Dale-Chall readability
def calc_dc(text):
    """Return textstat's Dale-Chall readability score for `text`."""
    score = textstat.dale_chall_readability_score(text)
    return score

# Extract `num_samples` random word windows of `sample_size` words from longer statements/minutes
def get_random_samples(text, num_samples=3, sample_size=200):
    """Draw random contiguous word windows from `text`.

    Parameters
    ----------
    text : str
        Text to sample from; it is split on whitespace to obtain words.
    num_samples : int, default 3
        Number of samples to return.
    sample_size : int, default 200
        Number of words in each sample.

    Returns
    -------
    list of str
        `num_samples` strings. When `text` has fewer than `sample_size` words,
        every entry is the unmodified `text`; otherwise each entry is
        `sample_size` consecutive words joined by single spaces. Windows are
        drawn independently, so they may overlap or repeat.
    """
    words = text.split()
    total_words = len(words)

    # Too short to sample from: hand back the whole text once per requested
    # sample, so callers always receive `num_samples` entries.
    if total_words < sample_size:
        return [text] * num_samples

    samples = []
    for _ in range(num_samples):
        # randint is inclusive on both ends, so the last valid start is allowed
        start_idx = random.randint(0, total_words - sample_size)
        samples.append(" ".join(words[start_idx:start_idx + sample_size]))

    return samples

# Lemmatize text with spaCy so the Dale-Chall score is computed on base word forms
def lemmatize_text(text):
    """Return `text` with every spaCy token replaced by its lemma.

    A single space is emitted after a token whenever the token was followed
    by whitespace in the input, so punctuation stays attached as in the original.
    """
    pieces = (
        token.lemma_ + (" " if token.whitespace_ else "")
        for token in nlp(text)
    )
    return "".join(pieces)

In [45]:
# process pdfs, calculate their scores, given a text processing function - names are monetary, boe, ecb
def process_pdfs(folder_path, text_function, name, start, end):
    """Score every matching PDF in a folder and return one record per file.

    Parameters
    ----------
    folder_path : str
        Directory that is listed for input files.
    text_function : callable
        Called with the PDF path; must return the cleaned text as a string.
    name : str
        Only files ending in '.pdf' whose filename contains this substring are processed.
    start, end : int
        Slice bounds of the 'YYYYMMDD' date inside the filename (filename[start:end]).

    Returns
    -------
    list of dict
        One dict per successfully processed file with keys 'Date', 'fk_score',
        'dc_score', 'wc', 'text', 'sample1'..'sample3', 'lemma1'..'lemma3'.
        'fk_score' and 'dc_score' are the means over the samples returned by
        get_random_samples(); 'dc_score' is computed on the lemmatized samples.

    Any exception raised while handling a file (including a filename whose
    date slice does not parse) is printed and that file is skipped.
    """
    results = []
    for filename in os.listdir(folder_path):
        if not (filename.endswith('.pdf') and name in filename):
            continue

        try:
            # Parsed inside the try so that a single badly named file is
            # reported and skipped instead of aborting the whole batch.
            date = datetime.strptime(filename[start:end], '%Y%m%d')

            text = text_function(os.path.join(folder_path, filename))
            wc = len(text.split())

            # For longer PDFs, score random samples of the text rather than the whole text
            samples = get_random_samples(text)
            fk_scores = [calc_fk(sample) for sample in samples]
            lemmas = [lemmatize_text(sample) for sample in samples]
            dc_scores = [calc_dc(lemma) for lemma in lemmas]

            results.append({
                'Date': date,
                'fk_score': statistics.mean(fk_scores),
                'dc_score': statistics.mean(dc_scores),
                'wc': wc,
                'text': text,
                'sample1': samples[0],
                'sample2': samples[1],
                'sample3': samples[2],
                'lemma1': lemmas[0],
                'lemma2': lemmas[1],
                'lemma3': lemmas[2],
            })

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    return results

## FOMC Statements Text Extraction
- We read the PDFs using the PyPDF2 package
- Replace newlines with spaces, strip out duplicate spaces. Add spaces between pages
- Remove the paragraph about voting decision, as we don't want names to interfere with the score
- Remove starting text (date, title, etc). We search a list of known starting phrases in priority order and keep the text from the first occurrence of the first phrase that is found. (The starting phrase has changed over time)
- Save results in Excel file for later use

In [46]:
# FOMC statement text extraction
def fomc_pdf_to_text(pdf_path):
    """Extract the cleaned policy text from one FOMC statement PDF.

    Steps:
      1. Concatenate the text of every page that yields text, adding a space after each page.
      2. Replace newlines with spaces and collapse whitespace runs to single spaces.
      3. Drop everything from "Voting for" onwards, then everything from
         "In taking the discount rate action" onwards, so that names in the
         voting paragraph do not affect the scores.
      4. Drop the header: the known starting phrases are tried in list order and
         the text is cut at the first occurrence of the first phrase that is present.

    Returns the cleaned string. If none of the starting phrases occurs, the
    text from step 3 is returned as is.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]

    # Pages without extractable text are skipped
    raw = "".join(page_text + " " for page_text in page_texts if page_text)
    cleaned = raw.replace('\n', ' ').strip()
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    #text = re.sub(r'(\S) +(\S)', r'\1\2', text)

    # Remove voting decision details
    body = cleaned
    for marker in ("Voting for", "In taking the discount rate action"):
        if marker in body:
            body = body.split(marker)[0].strip()

    starting_phrases = ["Information received", "Recent indicators", "Indicators of economic activity", "The Federal Open Market Committee", "The Federal Reserve", "Economic activity", "Although", "Indicators", "Overall", "received"]

    for phrase in starting_phrases:
        _, found, remainder = body.partition(phrase)
        if found:
            # Re-attach the phrase itself in front of what follows it
            return phrase + " " + remainder.strip()

    # No starting phrase found: keep the truncated text unchanged
    return body

In [47]:
# Score every PDF in 'FOMCstatements' whose filename contains 'monetary';
# the YYYYMMDD date is read from filename[8:16].
fomc_statements = pd.DataFrame(process_pdfs('FOMCstatements', fomc_pdf_to_text, 'monetary', 8, 16))

In [48]:
# Convert 'Date' to datetime, order the statements chronologically and renumber the rows
fomc_statements = (
    fomc_statements
    .assign(Date=pd.to_datetime(fomc_statements['Date']))
    .sort_values(by='Date')
    .reset_index(drop=True)
)
fomc_statements

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2001-01-03,15.000000,10.220000,192,The Federal Open Market Committee decided toda...,The Federal Open Market Committee decided toda...,The Federal Open Market Committee decided toda...,The Federal Open Market Committee decided toda...,the Federal Open Market Committee decide today...,the Federal Open Market Committee decide today...,the Federal Open Market Committee decide today...
1,2001-03-20,15.200000,11.160000,215,The Federal Open Market Committee at its meeti...,for the federal funds rate by 50 basis points ...,its tar get for the federal funds rate by 50 b...,to lower its tar get for the federal funds rat...,for the federal fund rate by 50 basis point to...,its tar get for the federal fund rate by 50 ba...,to lower its tar get for the federal fund rate...
2,2001-04-18,14.866667,10.663333,251,The Federal Open Market Committee decided toda...,4 percent. The FOMC has reviewed prospects for...,for the economy in light of the information th...,"action, the Board of Governors approved a 50 b...",4 percent. the FOMC have review prospect for t...,for the economy in light of the information th...,"action, the Board of Governors approve a 50 ba..."
3,2001-05-15,13.433333,10.723333,229,The Federal Open Market Committee at its meeti...,"percent. In a related action, the Board of Gov...",The Federal Open Market Committee at its meeti...,"related action, the Board of Governors approve...","percent. in a related action, the Board of Gov...",the Federal Open Market Committee at its meeti...,"relate action, the Board of Governors approve ..."
4,2001-06-27,15.400000,10.380000,170,The Federal Open Market Committee at its meeti...,The Federal Open Market Committee at its meeti...,The Federal Open Market Committee at its meeti...,The Federal Open Market Committee at its meeti...,the Federal Open Market Committee at its meeti...,the Federal Open Market Committee at its meeti...,the Federal Open Market Committee at its meeti...
...,...,...,...,...,...,...,...,...,...,...,...
186,2024-06-12,12.500000,9.416667,307,Recent indicators suggest that economic activi...,and inflation goals have moved toward better b...,the risks to achieving its employment and infl...,the Committee’s 2 percent inflation objective ...,and inflation goal have move toward well balan...,the risk to achieve its employment and inflati...,the Committee’s 2 percent inflation objective ...
187,2024-07-31,11.866667,9.063333,311,Recent indicators suggest that economic activi...,economic activity has continued to expand at a...,"economic outlook is uncertain, and the Committ...",maximum employment and inflation at the rate o...,economic activity have continue to expand at a...,"economic outlook be uncertain, and the Committ...",maximum employment and inflation at the rate o...
188,2024-09-18,13.466667,10.100000,293,Recent indicators suggest that economic activi...,employment and inflation goals are roughly in ...,made further progress toward the Committee’s 2...,greater confidence that inflation is moving su...,employment and inflation goal be roughly in ba...,make further progress toward the Committee’s 2...,great confidence that inflation be move sustai...
189,2024-11-07,13.166667,9.693333,282,Recent indicators suggest that economic activi...,and inflation goals are roughly in balance. Th...,"Since earlier in the year, labor market condit...",The Committee seeks to achieve maximum employm...,and inflation goal be roughly in balance. the ...,"since early in the year, labor market conditio...",the Committee seek to achieve maximum employme...


## UK Text Extraction

In [49]:
def boe_pdf_to_text(pdf_path):
    """Extract the monetary policy summary text from one BoE PDF.

    Page texts are concatenated (a space is added after each page) and newlines
    are replaced by spaces. The title page is removed by keeping only the text
    after the first "Published on" (up to any later "Published on"), and
    everything from "Minutes of the Monetary Policy Committee meeting ending on"
    onwards is dropped. Finally the text is cut so that it begins at
    "The Bank of England" or, failing that, at "MPC".

    Raises IndexError when the PDF text does not contain "Published on".
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]

    # Skip pages with no extractable text; a space separates consecutive pages
    raw = "".join(page_text + " " for page_text in page_texts if page_text)
    flat = raw.replace('\n', ' ').strip()  # newlines become spaces

    # Remove the title page and the minutes that follow the summary
    after_title = flat.split("Published on")[1].strip()
    summary = after_title.split("Minutes of the Monetary Policy Committee meeting ending on")[0].strip()

    for phrase in ("The Bank of England", "MPC"):
        _, found, remainder = summary.partition(phrase)
        if found:
            return phrase + " " + remainder.strip()

    # Neither starting phrase present: return the summary unchanged
    return summary

In [50]:
# Score every PDF in 'BoEsummaries' whose filename contains 'boe';
# the YYYYMMDD date is read from filename[3:11].
boe_statements = pd.DataFrame(process_pdfs('BoEsummaries', boe_pdf_to_text, 'boe', 3, 11))

In [51]:
# Display the BoE results as returned by process_pdfs (rows are not yet sorted by date)
boe_statements

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2023-05-11,12.033333,8.880000,908,The Bank of England ’ s Monetary Policy Commit...,than in the February Report in the near term. ...,"its meeting ending on 10 May 2023, the MPC vot...","sharply from April, in part as large rises in ...",than in the February Report in the near term. ...,"its meeting end on 10 May 2023, the MPC vote b...","sharply from April, in part as large rise in t..."
1,2018-03-22,11.533333,9.660000,684,The Bank of England ’ s Monetary Policy Commit...,"revision, and other indicators of exports and ...",projected to emerge by early 2020 and build th...,costs will pick up to target- consistent rates...,"revision, and other indicator of export and in...",project to emerge by early 2020 and build ther...,cost will pick up to target- consistent rate. ...
2,2024-08-01,10.800000,9.680000,523,MPC ) sets monetary policy to meet the 2% infl...,"to meet the 2% inflation target, and in a way ...",set of projections for activity and inflation ...,", but underlying momentum appears weaker . The...","to meet the 2% inflation target, and in a way ...",set of projection for activity and inflation i...,", but underlying momentum appear weak . the Co..."
3,2023-09-21,11.666667,9.476667,1050,The Bank of England ’ s Monetary Policy Commit...,"near term, with some potential month-to-month ...",declined by 0.5% in July and the S&P Global/CI...,over the summer holiday period. Excluding thes...,"near term, with some potential month-to-month ...",decline by 0.5% in July and the S&P Global/CIP...,over the summer holiday period. exclude these ...
4,2017-06-15,13.833333,9.810000,750,The Bank of England ’ s Monetary Policy Commit...,and the support that monetary policy provides ...,Consumer confidence has remained relatively re...,"quarter , in part reflecting weaker household ...",and the support that monetary policy provide t...,consumer confidence have remain relatively res...,"quarter , in part reflect weak household spend..."
...,...,...,...,...,...,...,...,...,...,...,...
75,2021-06-24,11.566667,9.686667,1042,The Bank of England ’ s Monetary Policy Commit...,"recruitment dif ficulties for some businesses,...",purchases at £875 billion and so the total tar...,target for the stock of these government bond ...,"recruitment dif ficultie for some business, an...",purchase at £875 billion and so the total targ...,target for the stock of these government bond ...
76,2017-08-03,12.666667,9.626667,914,The Bank of England ’ s Monetary Policy Commit...,"in October , as the past depreciation of sterl...",the next few years. Attempting to of fset full...,financed by the issuance of central bank reser...,"in October , as the past depreciation of sterl...",the next few year. attempt to of fset fully th...,finance by the issuance of central bank reserv...
77,2017-11-02,13.133333,10.126667,866,The Bank of England ’ s Monetary Policy Commit...,and consumer confidence has remained resilient...,The ef fects of rising import prices on inflat...,a 42-year low and the MPC judges that the leve...,and consumer confidence have remain resilient....,the ef fect of rise import price on inflation ...,a 42-year low and the MPC judge that the level...
78,2020-11-05,10.500000,9.373333,900,The Bank of England ’ s Monetary Policy Commit...,"expected to pick up in 2021 Q1, as restriction...",rate rose to 4.5% in the three months to Augus...,"adjustment, over the first half of next year ,...","expect to pick up in 2021 q1, as restriction l...",rate rise to 4.5% in the three month to August...,"adjustment, over the first half of next year ,..."


In [52]:
# Convert 'Date' to datetime, order the summaries chronologically and renumber the rows
boe_statements = (
    boe_statements
    .assign(Date=pd.to_datetime(boe_statements['Date']))
    .sort_values(by='Date')
    .reset_index(drop=True)
)
boe_statements

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2015-08-06,12.000000,9.336667,1075,The Bank of England ’ s Monetary Policy Commit...,suggest that medium-term inflation expectation...,"tightening of the labour market is expected, s...",market. Business investment has made a substan...,suggest that medium-term inflation expectation...,"tightening of the labour market be expect, sup...",market. business investment have make a substa...
1,2015-09-10,13.000000,9.440000,783,The Bank of England ’ s Monetary Policy Commit...,"energy , food and import prices waned. In the ...",labour costs in particular . Although pay grow...,best collective judgement is that there remain...,"energy , food and import price wane. in the th...",labour cost in particular . although pay growt...,good collective judgement be that there remain...
2,2015-10-08,14.066667,10.466667,836,The Bank of England ’ s Monetary Policy Commit...,at the beginning of 2014. The sharp declines i...,activity and global growth has continued at be...,"Growth in the euro area, the United Kingdom’ s...",at the beginning of 2014. the sharp decline in...,activity and global growth have continue at be...,"growth in the euro area, the United Kingdom' s..."
3,2015-11-05,10.700000,9.420000,954,The Bank of England ’ s Monetary Policy Commit...,drag from commodity and other imported goods p...,growth. The MPC’ s objective is to return infl...,is evident in subdued measures of core inflati...,drag from commodity and other import good pric...,growth. the MPC' s objective be to return infl...,be evident in subdue measure of core inflation...
4,2015-12-10,12.200000,9.240000,740,The Bank of England ’ s Monetary Policy Commit...,issuance of central bank reserves at £375 bill...,of further shocks. The MPC set out its most re...,Twelve-month CPI inflation remained at -0.1% i...,issuance of central bank reserve at £375 billi...,of further shock. the MPC set out its most rec...,twelve-month CPI inflation remain at -0.1% in ...
...,...,...,...,...,...,...,...,...,...,...,...
75,2024-06-20,10.633333,9.300000,587,The Bank of England ’ s Monetary Policy Commit...,tight by historical standards. The collective ...,derived from the ONS Labour Force Survey means...,Report. This strength in part reflected prices...,tight by historical standard. the collective s...,derive from the ONS Labour Force Survey mean t...,Report. this strength in part reflect price th...
76,2024-08-01,10.800000,9.680000,523,MPC ) sets monetary policy to meet the 2% infl...,"to meet the 2% inflation target, and in a way ...",set of projections for activity and inflation ...,", but underlying momentum appears weaker . The...","to meet the 2% inflation target, and in a way ...",set of projection for activity and inflation i...,", but underlying momentum appear weak . the Co..."
77,2024-09-19,10.366667,9.736667,624,MPC ) sets monetary policy to meet the 2% infl...,member preferred to reduce Bank Rate by 0.25 p...,"Since the MPC’ s previous meeting, global acti...","continued at a steady pace, although some data...",member prefer to reduce Bank rate by 0.25 perc...,"since the MPC' s previous meeting, global acti...","continue at a steady pace, although some datum..."
78,2024-11-07,12.500000,9.156667,663,MPC ) sets monetary policy to meet the 2% infl...,withdrawn. The MPC’ s latest projections for a...,by historical standards. Monetary policy has b...,"Policy Report. In the first case, most of the ...",withdraw. the MPC' s late projection for activ...,by historical standard. monetary policy have b...,"Policy Report. in the first case, most of the ..."


## ECB Text Extraction

In [53]:
def ecb_pdf_to_text(pdf_path):
    """Extract the introductory statement from one ECB press conference PDF.

    Steps:
      1. Concatenate the text of every page that yields text, adding a space after each page.
      2. Replace newlines with spaces and collapse whitespace runs to single spaces.
      3. Drop the header: the starting phrases are tried in list order and the
         text is cut at the first occurrence of the first phrase that is present.
      4. Drop the Q&A: the ending phrases are tried in list order and the text is
         cut just before the first occurrence of the first phrase that is present.

    Returns the statement as a string. If no ending phrase is found, the text
    from step 3 (header already removed) is returned, so the header trimming is
    kept even for transcripts without a recognisable Q&A marker.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]

    # Pages without extractable text are skipped
    text = "".join(page_text + " " for page_text in page_texts if page_text)
    text = text.replace('\n', ' ').strip()  # Replace newlines with space
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove everything before the opening of the introductory statement
    starting_phrases = ["Ladies and gentlemen", "Good afternoon", "Let me just say", "The Vice-President and I"]

    statement = text  # Default if no starting phrase is found
    for phrase in starting_phrases:
        _, found, remainder = statement.partition(phrase)
        if found:
            statement = phrase + " " + remainder.strip()
            break

    # Remove the Q&A section that follows the statement
    ending_phrases = ["disposal for questions", "Question:", "*"]

    for phrase in ending_phrases:
        split_index = statement.find(phrase)
        if split_index != -1:
            return statement[:split_index].strip()  # everything before the phrase

    return statement
#, qna

In [54]:
# Score every PDF in 'ECBPress' whose filename contains 'ecb';
# the YYYYMMDD date is read from filename[8:16].
ecb_statements = pd.DataFrame(process_pdfs('ECBPress', ecb_pdf_to_text, 'ecb', 8, 16))

In [55]:
# Display the ECB results as returned by process_pdfs (rows are not yet sorted by date)
ecb_statements

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2011-01-13,11.900000,8.980000,1712,"Ladies and gentlemen , the Vice-President and ...","Eesti Pank, became a member of the Governing C...","monetary analysis, the annual growth rate of M...","in oil and other commodity prices, protectioni...","Eesti Pank, become a member of the Governing C...","monetary analysis, the annual growth rate of M...","in oil and other commodity price, protectionis..."
1,2007-02-08,12.033333,9.526667,1611,"Ladies and gentlemen , let me welcome you to o...",currently expected wage developments pose subs...,"futures and previous oil price developments, s...",employment growth – continue to improve. Risks...,currently expect wage development pose substan...,"future and previous oil price development, sig...",employment growth – continue to improve. risk ...
2,2014-10-02,13.366667,10.270000,1241,"Ladies and gentlemen , the Vice-President and ...",information on the modalities of our new purch...,asset-backed securities (ABSs) in the fourth q...,"time, the recovery is likely to continue to be...",information on the modality of our new purchas...,asset-back security (ABSs) in the fourth quart...,"time, the recovery be likely to continue to be..."
3,2008-01-10,13.033333,9.910000,1665,"Ladies and gentlemen , the Vice-President and ...","and, consequently, medium and long-term inflat...",2% in the coming months and is likely to moder...,"Council, this is absolutely essential in order...","and, consequently, medium and long-term inflat...",2% in the come month and be likely to moderate...,"Council, this be absolutely essential in order..."
4,2008-03-06,11.033333,9.520000,1725,"Ladies and gentlemen , the Vice-President and ...",anchoring of medium to longer-term inflation e...,monitoring wage negotiations in the euro area ...,in 2008 both domestic and foreign demand are e...,anchor of medium to long-term inflation expect...,monitor wage negotiation in the euro area with...,in 2008 both domestic and foreign demand be ex...
...,...,...,...,...,...,...,...,...,...,...,...
237,2019-01-24,16.800000,10.186667,986,"Ladies and gentlemen , the Vice-President and ...","summer of 2019, and in any case for as long as...",ensure the continued sustained convergence of ...,"euro area. To sum up, a cross-check of the out...","summer of 2019, and in any case for as long as...",ensure the continue sustained convergence of i...,"euro area. to sum up, a cross-check of the out..."
238,2010-10-07,11.266667,9.153333,1394,"Ladies and gentlemen , the Vice-President and ...",that the underlying pace of monetary expansion...,the business cycle that was also observed in p...,"in September, according to Eurostat’s flash es...",that the underlie pace of monetary expansion b...,the business cycle that be also observe in pas...,"in September, accord to Eurostat’s flash estim..."
239,2023-05-04,9.066667,8.930000,1483,"Good afternoon , the Vice-President and I welc...",fiscal policy support for firms and households...,"supply capacity, especially in the energy sect...",under the APP as of July 2023. The decisions t...,fiscal policy support for firm and household h...,"supply capacity, especially in the energy sect...",under the APP as of July 2023. the decision ta...
240,2004-07-01,14.633333,10.883333,1905,"Ladies and gentlemen , the Vice-President and ...",implementation of this year’s budget and a com...,year’s budget and a comprehensive reform strat...,"the last months, goes in the direction of a co...",implementation of this year’s budget and a com...,year’s budget and a comprehensive reform strat...,"the last month, go in the direction of a confi..."


In [56]:
# Convert 'Date' to datetime, order the statements chronologically and renumber the rows
ecb_statements = (
    ecb_statements
    .assign(Date=pd.to_datetime(ecb_statements['Date']))
    .sort_values(by='Date')
    .reset_index(drop=True)
)
ecb_statements

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2001-02-01,10.633333,9.330000,1344,"Ladies and gentlemen , let me welcome you to o...","direct taxes in 2001. Hence, while downside ri...","outlook for economic activity. Moreover, infla...","downside risks to real GDP growth exist, growt...","direct taxis in 2001. hence, while downside ri...","outlook for economic activity. moreover, infla...","downside risk to real GDP growth exist, growth..."
1,2001-03-01,12.333333,9.503333,658,"Ladies and gentlemen , the Vice-President and ...",the second half of last year than in the first...,continuing employment growth and declines in u...,of the Governing Council of the ECB. The Gover...,the second half of last year than in the first...,continue employment growth and decline in unem...,of the Governing Council of the ECB. the Gover...
2,2001-04-11,10.966667,8.820000,1071,"Ladies and gentlemen , the Vice-President and ...",applies to the extent and duration of the slow...,area. It decided to keep the key ECB interest ...,the rest of the world. While this indicates th...,apply to the extent and duration of the slowdo...,area. it decide to keep the key ECB interest r...,the rest of the world. while this indicate tha...
3,2001-05-10,12.833333,9.516667,1345,"Ladies and gentlemen , the Vice-President and ...","4.8% in the period from January to March 2001,...",decentralised production scheme with pooling. ...,the oil price shock. This has been a very posi...,"4.8% in the period from January to March 2001,...",decentralise production scheme with pooling. t...,the oil price shock. this have be a very posit...
4,2001-06-07,12.800000,9.456667,1238,"Ladies and gentlemen , the Vice-President and ...",might entail a resurfacing of upward risks to ...,by domestic demand and will remain broadly in ...,over 80% of the launch stock. In order to mark...,might entail a resurfacing of upward risk to p...,by domestic demand and will remain broadly in ...,over 80% of the launch stock. in order to mark...
...,...,...,...,...,...,...,...,...,...,...,...
237,2024-04-11,8.466667,8.426667,1372,"Good afternoon , the Vice-President and I welc...",would help mobilise the massive private invest...,with most standing around 2 per cent. Risk ass...,"in February to 2.4 per cent in March, accordin...",would help mobilise the massive private invest...,with most stand around 2 per cent. risk assess...,"in February to 2.4 per cent in March, accord t..."
238,2024-06-06,10.633333,9.100000,1595,"Good afternoon , the Vice-President and I welc...",year. Goods price inflation continued to decre...,ensure that inflation returns to our two per c...,"and duration of restriction. In particular, ou...",year. good price inflation continue to decreas...,ensure that inflation return to our two per ce...,"and duration of restriction. in particular, ou..."
239,2024-07-18,9.133333,8.750000,1420,"Good afternoon , the Vice-President and I welc...",and without delay will help governments bring ...,will keep policy rates sufficiently restrictiv...,"interest rates, while broader financial condit...",and without delay will help government bring d...,will keep policy rate sufficiently restrictive...,"interest rate, while broad financial condition..."
240,2024-09-12,10.166667,9.276667,1663,"Good afternoon , the Vice-President and I welc...",that level since our July meeting. Risk assess...,"will grow by 0.8 per cent in 2024, rising to 1...",in the first. Recent survey indicators point t...,that level since our July meeting. risk assess...,"will grow by 0.8 per cent in 2024, rise to 1.3...",in the first. recent survey indicator point to...


In [57]:
# edge case: ECB 10-06-2011
# The statement for this date is read from 'ecb1062011.txt' and replaces the
# PDF-derived text, word count and scores of the matching row.
with open('ecb1062011.txt', 'r') as file:
    statement = file.read()

#with open('ecbqna1062011.txt', 'r') as file:
    #qna = file.read()

# NOTE(review): the sample1-3 / lemma1-3 columns of this row are not refreshed, and
# the scores below are computed on the full statement rather than averaged over
# random samples as in process_pdfs - confirm this is intended.
is_edge_date = ecb_statements['Date'] == '2011-10-06'
overrides = {
    'text': statement,
    'wc': len(statement.split()),
    'fk_score': calc_fk(statement),
    'dc_score': calc_dc(lemmatize_text(statement)),
}
for column, value in overrides.items():
    ecb_statements.loc[is_edge_date, column] = value


In [58]:
# Limit DataFrame displays to 10 rows, then show the ECB table after the edge-case override
pd.set_option('display.max_rows', 10)
ecb_statements

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2001-02-01,10.633333,9.330000,1344,"Ladies and gentlemen , let me welcome you to o...","direct taxes in 2001. Hence, while downside ri...","outlook for economic activity. Moreover, infla...","downside risks to real GDP growth exist, growt...","direct taxis in 2001. hence, while downside ri...","outlook for economic activity. moreover, infla...","downside risk to real GDP growth exist, growth..."
1,2001-03-01,12.333333,9.503333,658,"Ladies and gentlemen , the Vice-President and ...",the second half of last year than in the first...,continuing employment growth and declines in u...,of the Governing Council of the ECB. The Gover...,the second half of last year than in the first...,continue employment growth and decline in unem...,of the Governing Council of the ECB. the Gover...
2,2001-04-11,10.966667,8.820000,1071,"Ladies and gentlemen , the Vice-President and ...",applies to the extent and duration of the slow...,area. It decided to keep the key ECB interest ...,the rest of the world. While this indicates th...,apply to the extent and duration of the slowdo...,area. it decide to keep the key ECB interest r...,the rest of the world. while this indicate tha...
3,2001-05-10,12.833333,9.516667,1345,"Ladies and gentlemen , the Vice-President and ...","4.8% in the period from January to March 2001,...",decentralised production scheme with pooling. ...,the oil price shock. This has been a very posi...,"4.8% in the period from January to March 2001,...",decentralise production scheme with pooling. t...,the oil price shock. this have be a very posit...
4,2001-06-07,12.800000,9.456667,1238,"Ladies and gentlemen , the Vice-President and ...",might entail a resurfacing of upward risks to ...,by domestic demand and will remain broadly in ...,over 80% of the launch stock. In order to mark...,might entail a resurfacing of upward risk to p...,by domestic demand and will remain broadly in ...,over 80% of the launch stock. in order to mark...
...,...,...,...,...,...,...,...,...,...,...,...
237,2024-04-11,8.466667,8.426667,1372,"Good afternoon , the Vice-President and I welc...",would help mobilise the massive private invest...,with most standing around 2 per cent. Risk ass...,"in February to 2.4 per cent in March, accordin...",would help mobilise the massive private invest...,with most stand around 2 per cent. risk assess...,"in February to 2.4 per cent in March, accord t..."
238,2024-06-06,10.633333,9.100000,1595,"Good afternoon , the Vice-President and I welc...",year. Goods price inflation continued to decre...,ensure that inflation returns to our two per c...,"and duration of restriction. In particular, ou...",year. good price inflation continue to decreas...,ensure that inflation return to our two per ce...,"and duration of restriction. in particular, ou..."
239,2024-07-18,9.133333,8.750000,1420,"Good afternoon , the Vice-President and I welc...",and without delay will help governments bring ...,will keep policy rates sufficiently restrictiv...,"interest rates, while broader financial condit...",and without delay will help government bring d...,will keep policy rate sufficiently restrictive...,"interest rate, while broad financial condition..."
240,2024-09-12,10.166667,9.276667,1663,"Good afternoon , the Vice-President and I welc...",that level since our July meeting. Risk assess...,"will grow by 0.8 per cent in 2024, rising to 1...",in the first. Recent survey indicators point t...,that level since our July meeting. risk assess...,"will grow by 0.8 per cent in 2024, rise to 1.3...",in the first. recent survey indicator point to...


## Process Minutes

In [59]:
# process pdfs, calculate their FK scores
def process_min_pdfs(folder_path, text_function, name, start, end):
    """Compute the FK score and word count of every matching minutes PDF in a folder.

    Parameters
    ----------
    folder_path : str
        Directory that is listed for input files.
    text_function : callable
        Called with the PDF path; must return the cleaned text as a string.
    name : str
        Only files ending in '.pdf' whose filename contains this substring are processed.
    start, end : int
        Slice bounds of the 'MMDDYYYY' date inside the filename (filename[start:end]).

    Returns
    -------
    list of dict
        One dict per successfully processed file with keys 'Date', 'fk_score',
        'wc' and 'text'. The FK score is computed on the full text.

    Any exception raised while handling a file (including a filename whose
    date slice does not parse) is printed and that file is skipped.
    """
    results = []
    for filename in os.listdir(folder_path):
        if not (filename.endswith('.pdf') and name in filename):
            continue

        try:
            # Parsed inside the try so that a single badly named file is
            # reported and skipped instead of aborting the whole batch.
            date = datetime.strptime(filename[start:end], '%m%d%Y')

            # Extract text from the PDF
            text = text_function(os.path.join(folder_path, filename))

            # Calculate key statistics
            fk_score = calc_fk(text)
            wc = len(text.split())
            results.append({'Date': date, 'fk_score': fk_score, 'wc': wc, 'text': text})

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    return results

### FOMC Minutes

In [60]:
def fomcmin_pdf_to_text(pdf_path):
    """Return the text of an FOMC minutes PDF that follows the first "vote".

    The text of every page is concatenated (each page followed by a space),
    all whitespace runs are collapsed to single spaces, and the result is
    trimmed. Everything up to and including the first case-insensitive
    match of "vote" is then discarded. When "vote" never occurs, the whole
    normalised text is returned.
    """
    with open(pdf_path, 'rb') as handle:
        extracted = [page.extract_text() for page in PyPDF2.PdfReader(handle).pages]

    # Pages that produced no text are skipped entirely.
    raw = "".join(chunk + " " for chunk in extracted if chunk)
    normalised = re.sub(r'\s+', ' ', raw).strip()

    # The pattern is a plain substring match, so it also hits longer words
    # that contain "vote" (e.g. "voted").
    pieces = re.split(r"vote", normalised, maxsplit=1, flags=re.IGNORECASE)
    return pieces[1] if len(pieces) > 1 else normalised

In [61]:
# Build the FOMC minutes table with the shared process_pdfs pipeline (not
# process_min_pdfs), using files in 'FOMCminutes' whose names contain 'min'.
# start/end = 3/11 presumably delimit the date inside each filename - confirm
# against process_pdfs.
fomc_min = pd.DataFrame(
    process_pdfs(
        folder_path='FOMCminutes',
        text_function=fomcmin_pdf_to_text,
        name='min',
        start=3,
        end=11,
    )
)

In [62]:
# Convert Date to datetimes, order the rows chronologically and renumber the
# index from 0; the trailing bare name displays the frame in the notebook.
fomc_min = (
    fomc_min
    .assign(Date=pd.to_datetime(fomc_min['Date']))
    .sort_values(by='Date')
    .reset_index(drop=True)
)
fomc_min

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2001-01-01,16.066667,10.373333,8677,", the following ofﬁcers of the Federal Open Ma...",form of cable transfers through spot or forwar...,"pool or the permanent portfolio, with the safe...",continued to read as follows: GUIDELINES FOR T...,form of cable transfer through spot or forward...,"pool or the permanent portfolio, with the safe...",continue to read as follow: GUIDELINES for the...
1,2001-03-01,14.266667,9.680000,4650,", the minutes of the meeting of the Federal Op...","lower interest rates, and relatively robust ex...",apparently was not as far along. In the absenc...,"Confer ences On April 1 1, 2001, the Committee...","low interest rate, and relatively robust expan...",apparently be not as far along. in the absence...,"confer ence on April 1 1, 2001, the Committee ..."
2,2001-05-01,15.166667,10.033333,4781,", the minutes of the meeting of the Federal Op...",before the ef fects of countervailing actions ...,Bank of New York. The information reviewed at ...,"an end, adding to the upward momentum of econo...",before the ef fect of countervail action by th...,Bank of New York. the information review at th...,"an end, add to the upward momentum of economic..."
3,2001-06-01,16.000000,10.673333,4699,", the minutes of the meeting of the Federal Op...",the members believed that the risks to the exp...,"in May , and there were signs that weakness in...","the second quarter , reﬂecting the ef fects of...",the member believe that the risk to the expans...,"in May , and there be sign that weakness in em...","the second quarter , reﬂecte the ef fect of lo..."
4,2001-08-01,14.366667,10.646667,4600,", the minutes of the meeting of the Federal Op...",reserve markets consistent with a decrease of ...,the outlook for reopening the stock exchanges....,"objectives, the Committee in the immediate fut...",reserve market consistent with a decrease of 2...,the outlook for reopen the stock exchange. whi...,"objective, the Committee in the immediate futu..."
...,...,...,...,...,...,...,...,...,...,...,...
186,2024-06-01,14.066667,9.663333,5190,", the Committee ratified the Desk’s domestic t...","asset prices. Many participants observed that,...",rose in April and May to the highest levels si...,stayed slightly above pre- pandemic levels. In...,"asset price. many participant observe that, in...",rise in April and May to the high level since ...,stay slightly above pre- pandemic level. in th...
187,2024-07-01,16.866667,10.560000,6456,", the Committee ratified the Desk ’s domestic ...",judged that it was appropriate to continue the...,their Di strict contacts reported that they we...,"ucture, and, potentially, the overall economy....",judge that it be appropriate to continue the p...,their Di strict contact report that they be ac...,"ucture, and, potentially, the overall economy...."
188,2024-09-01,13.966667,9.770000,6083,", the Committee ratified the Desk’s domestic t...",5 ¼ percent in light of core inflation remaini...,"this FOMC meeting, participants submitted thei...","to housing services prices, some participants ...",5 ¼ percent in light of core inflation remain ...,"this fomc meeting, participant submit their pr...","to housing service price, some participant sug..."
189,2024-11-01,12.000000,9.513333,6497,", the Committee ratified the Desk ’s domestic ...","Survey on Bank Lending Practices (SLOOS), bank...",domestic product (GDP) had expanded solidly so...,of geopolitical tensions or a sizable as set p...,"survey on Bank Lending Practices (SLOOS), bank...",domestic product (GDP) have expand solidly so ...,of geopolitical tension or a sizable as set pr...


### BoE Minutes

In [63]:
def boemin_pdf_to_text(pdf_path):
    """Return one segment of a BoE minutes PDF, split on a title phrase.

    NOTE(review): this definition is replaced by the `boemin_pdf_to_text`
    defined in the next cell, so only that later version is in effect when
    the notebook is run top to bottom.

    The page texts are concatenated (each followed by a space), whitespace
    runs are collapsed to single spaces, and the text is split on every
    case-insensitive occurrence of "minutes of the monetary". Then:

    * if "Minutes of Monetary" (any case) occurs and there are at least two
      split points, the segment between the second and third split points
      (or up to the end of the text) is returned;
    * otherwise, if there are at least four split points, the segment
      between the third and fourth split points is returned;
    * otherwise the whole normalised text is returned.
    """
    with open(pdf_path, 'rb') as handle:
        extracted = [page.extract_text() for page in PyPDF2.PdfReader(handle).pages]

    # Pages that produced no text are skipped entirely.
    raw = "".join(chunk + " " for chunk in extracted if chunk)
    text = re.sub(r'\s+', ' ', raw).strip()

    # Note the two phrases differ: the check has no "the", the split has one.
    mentions_short_title = re.search(r"Minutes of Monetary", text, re.IGNORECASE) is not None
    segments = re.split(r"minutes of the monetary", text, flags=re.IGNORECASE)

    if mentions_short_title and len(segments) > 2:
        return segments[2]

    # NOTE(review): index 3 only needs len(segments) > 3; the stricter > 4
    # is kept as found - confirm whether it is intentional.
    if len(segments) > 4:
        return segments[3]

    return text

In [64]:
def boemin_pdf_to_text(pdf_path):
    """Return a BoE minutes PDF's text starting at "Before turning to".

    The page texts are concatenated (each followed by a space), newlines are
    replaced by spaces and the result is trimmed; other whitespace is left
    as extracted. If the exact, case-sensitive phrase "Before turning to"
    occurs, the result is that phrase, one space, and the trimmed text that
    follows its first occurrence. Otherwise the whole text is returned.
    """
    with open(pdf_path, 'rb') as handle:
        extracted = [page.extract_text() for page in PyPDF2.PdfReader(handle).pages]

    # Pages that produced no text are skipped entirely.
    flat = "".join(chunk + " " for chunk in extracted if chunk)
    flat = flat.replace('\n', ' ').strip()

    marker = "Before turning to"
    _, found, remainder = flat.partition(marker)
    if not found:
        return flat
    return marker + " " + remainder.strip()

In [65]:
# Build the BoE minutes table with the shared process_pdfs pipeline, using
# files in 'BOEminutes' whose names contain 'mpc'. start/end = 3/11
# presumably delimit the date inside each filename - confirm against
# process_pdfs.
boe_min = pd.DataFrame(
    process_pdfs(
        folder_path='BOEminutes',
        text_function=boemin_pdf_to_text,
        name='mpc',
        start=3,
        end=11,
    )
)

In [66]:
# Convert Date to datetimes, order the rows chronologically and renumber the
# index from 0; the trailing bare name displays the frame in the notebook.
boe_min = (
    boe_min
    .assign(Date=pd.to_datetime(boe_min['Date']))
    .sort_values(by='Date')
    .reset_index(drop=True)
)
boe_min

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2001-01-10,8.333333,8.650000,8093,Before turning to its immediate policy decisio...,from 3.6% in October to 3.7% in November. The ...,target since April 1999 and recent development...,percentage points). A26 Export growth in Q3 ha...,from 3.6% in October to 3.7% in November. the ...,target since April 1999 and recent development...,percentage point). a26 export growth in Q3 hav...
1,2001-02-08,6.900000,8.866667,7932,Before turning to its immediate policy decisio...,The Bank’s twelve-month AEI-weighted mean meas...,made by UK firms in January had been higher th...,"(PNFCs’) M4 and M4 lending growth in December,...",the Bank’s twelve-month AEI-weighted mean meas...,make by UK firm in January have be high than i...,"(pnfc') M4 and m4 lend growth in December, dow..."
2,2001-03-07,12.366667,8.783333,8571,Before turning to its immediate policy decisio...,"offered outside the M4 sector, substantial hou...",be reflected in lower profitability and the va...,"to the Pre-Budget Report (PBR) forecast, extra...","offer outside the M4 sector, substantial house...",be reflect in low profitability and the value ...,"to the Pre-Budget Report (PBR) forecast, extra..."
3,2001-04-04,11.900000,9.153333,9230,Before turning to its immediate policy decisio...,"credit aggregates continued to grow robustly, ...",implied by short sterling futures contracts ha...,resilience of the sterling exchange rate. For ...,"credit aggregate continue to grow robustly, su...",imply by short sterling future contract have a...,resilience of the sterling exchange rate. for ...
4,2001-05-09,10.833333,9.343333,9076,Before turning to its immediate policy decisio...,notes and coin growth in recent months. A12 Th...,was too soon to form a view on the overall eco...,disease. It was likely that strong consumption...,note and coin growth in recent month. A12 the ...,be too soon to form a view on the overall econ...,disease. it be likely that strong consumption ...
...,...,...,...,...,...,...,...,...,...,...,...
250,2024-06-20,11.533333,9.426667,5016,Before turning to its immediate policy decisio...,expected CPI inflation to increase slightly in...,"volatile components, rents and foreign holiday...",reported a pickup in consumer demand in 2024 Q...,expect CPI inflation to increase slightly in t...,"volatile component, rent and foreign holiday, ...",report a pickup in consumer demand in 2024 q2 ...
251,2024-08-01,10.900000,9.476667,2213,Before turning to its immediate policy decisio...,"import prices and inflation, all else equal , ...",than earlier in the year . 20. Following weakn...,"in 2024 Q1, with that strength appearing to ha...","import price and inflation, all else equal , b...",than early in the year . 20. follow weakness i...,"in 2024 Q1, with that strength appear to have ..."
252,2024-09-19,10.966667,8.956667,4218,Before turning to its immediate policy decisio...,set an amount for the reduction in the stock o...,"meeting ending on 18 September, the FOMC was e...",intelligence indicated that pay settlements ov...,set an amount for the reduction in the stock o...,"meeting end on 18 September, the FOMC be expec...",intelligence indicate that pay settlement over...
253,2024-11-07,10.000000,9.286667,1750,Before turning to its immediate policy decisio...,"back significantly since mid -2023, but it had...","as expected, but material downside news to wag...",was unlikely to intensify over the coming year...,"back significantly since mid -2023, but it hav...","as expect, but material downside news to wage ...",be unlikely to intensify over the come year . ...


### ECB Minutes

In [67]:
def ecbmin_pdf_to_text(pdf_path):
    """Return the text of an ECB account PDF after its first section cue.

    The page texts are concatenated (each followed by a space), whitespace
    runs are collapsed to single spaces, and the result is trimmed.
    Everything up to and including the first case-insensitive occurrence of
    "financial market developments" is then discarded. When the phrase never
    occurs, the whole normalised text is returned.

    A further cut at "press conference" exists in the original only as
    disabled (commented-out) code and is not applied.
    """
    with open(pdf_path, 'rb') as handle:
        extracted = [page.extract_text() for page in PyPDF2.PdfReader(handle).pages]

    # Pages that produced no text are skipped entirely.
    raw = "".join(chunk + " " for chunk in extracted if chunk)
    normalised = re.sub(r'\s+', ' ', raw).strip()

    pieces = re.split(
        r"financial market developments",
        normalised,
        maxsplit=1,
        flags=re.IGNORECASE,
    )
    return pieces[1] if len(pieces) > 1 else normalised

In [68]:
# Build the ECB minutes table with the shared process_pdfs pipeline, using
# files in 'ECBminutes' whose names contain 'ecb'. start/end = 3/11
# presumably delimit the date inside each filename - confirm against
# process_pdfs.
ecb_min = pd.DataFrame(
    process_pdfs(
        folder_path='ECBminutes',
        text_function=ecbmin_pdf_to_text,
        name='ecb',
        start=3,
        end=11,
    )
)

In [69]:
# Convert Date to datetimes, order the rows chronologically and renumber the
# index from 0; the trailing bare name displays the frame in the notebook.
ecb_min = (
    ecb_min
    .assign(Date=pd.to_datetime(ecb_min['Date']))
    .sort_values(by='Date')
    .reset_index(drop=True)
)
ecb_min

Unnamed: 0,Date,fk_score,dc_score,wc,text,sample1,sample2,sample3,lemma1,lemma2,lemma3
0,2015-01-22,12.500000,9.106667,9603,Mr Cœuré reviewed recent financial market dev...,"indicators currently stood at, or close to, th...",evidence that a turning point had occurred in ...,"the euro area, together with the likelihood of...","indicator currently stand at, or close to, the...",evidence that a turning point have occur in th...,"the euro area, together with the likelihood of..."
1,2015-04-02,12.966667,9.796667,7375,Mr Cœuré reviewed recent financial market dev...,on an annual basis. The unemployment rate had ...,"of the upward revisions. In this respect, the ...","in the view of market participants, the euro a...",on an annual basis. the unemployment rate have...,"of the upward revision. in this respect, the i...","in the view of market participant, the euro ar..."
2,2015-05-21,12.733333,9.606667,6834,Mr Cœuré reviewed recent financial market dev...,"line with its usual seasonal pattern and, base...",area Mr Praet reviewed the global environment ...,were expected to dampen its pace. Several comm...,"line with its usual seasonal pattern and, base...",area Mr Praet review the global environment an...,be expect to dampen its pace. several comment ...
3,2015-07-02,13.933333,10.350000,6534,Mr Cœuré reviewed recent financial market dev...,significantly exacerbating the rise in bond yi...,within MFI liabilities towards short-term inst...,Mr Jazbec Mr Knot Ms Lautenschläger Mr Liikane...,significantly exacerbate the rise in bond yiel...,within MFI liability towards short-term instru...,Mr Jazbec Mr Knot Ms Lautenschläger Mr Liikane...
4,2015-08-13,11.033333,10.190000,7350,Mr Cœuré reviewed recent financial market dev...,developments in the euro area. Following a slo...,"facility would remain unchanged at 0.05%, 0.30...","at around €95 billion at the end of June, with...",development in the euro area. follow a slowdow...,"facility would remain unchanged at 0.05%, 0.30...","at around €95 billion at the end of June, with..."
...,...,...,...,...,...,...,...,...,...,...,...
77,2024-07-04,13.700000,9.700000,10044,Ms Schnabel noted that since the Governing Co...,as a whole was likely to remain in the first q...,of each year. The fourth quarter was less affe...,while the evolution of cost dynamics (includin...,as a whole be likely to remain in the first qu...,of each year. the fourth quarter be less affec...,while the evolution of cost dynamic (include t...
78,2024-08-22,12.766667,8.796667,7221,Ms Schnabel noted that since the Governing Co...,"outlook, caution was expressed that this decli...",around the target in 2026. It was also importa...,they had been fluctuating to some extent since...,"outlook, caution be express that this decline ...",around the target in 2026. it be also importan...,they have be fluctuate to some extent since th...
79,2024-10-10,12.266667,9.223333,9853,Ms Schnabel noted that since the Governing Co...,the main refinancing operations. As announced ...,"had supported the baseline scenario, there wer...",productivity growth remained low and profits a...,the main refinance operation. as announce in M...,"have support the baseline scenario, there be u...",productivity growth remain low and profit abso...
80,2024-11-14,14.400000,9.800000,8524,Ms Schnabel noted that since the Governing Co...,PMI for services activity declining to 51.4 in...,"an annual rate of 0.8% in August, on the back ...","adjustments. At the same time, labour cost pre...",pmi for service activity decline to 51.4 in Se...,"an annual rate of 0.8% in August, on the back ...","adjustment. at the same time, labour cost pres..."


In [70]:
# Export all six tables (three statement sets, three minutes sets) to
# text_scores.xlsx, one sheet per table and without the row index.
# names and dataframes are parallel lists: the k-th name labels the k-th frame.
names = ["fomc", "boe", "ecb", "fomcmin", "boemin", "ecbmin"]
dataframes = [fomc_statements, boe_statements, ecb_statements, fomc_min, boe_min, ecb_min]

with pd.ExcelWriter('text_scores.xlsx', engine='openpyxl') as writer:
    for sheet, frame in zip(names, dataframes):
        frame.to_excel(writer, sheet_name=sheet, index=False)