In [5]:
import pandas as pd
# Load the two Excel workbooks from the local data/ directory:
#   labelled_sentences - one row per sentence, with a meeting date and a score
#   statements         - one row per meeting date, with the statement text
labelled_sentences = pd.read_excel('data/FOMC Labelled Sentences.xlsx')
statements = pd.read_excel('data/FOMC Statements 1997-2023.xlsx')

In [6]:
# Preview the first rows. The output shows what look like leftover index
# columns ("Unnamed: 0.1", "Unnamed: 0") and `_x000D_` / newline artifacts
# inside "Sentence", so the text likely needs cleaning before it is used.
labelled_sentences.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Meeting Date,Score
0,1,"In addition, the Committee intends to purchase...",2010-12-14,-1
1,2,_x000D_\n_x000D_\nThe Committee perceives that...,2003-09-16,0
2,3,"_x000D_\n_x000D_\nIn these circumstances, alth...",2002-05-07,0
3,4,This assessment will take into account a wide ...,2015-01-28,0
4,5,\n_x000D_\n Information received since t...,2010-11-03,-1


In [7]:
# Preview the first rows. The statement text sits in an unnamed column
# ("Unnamed: 1") and starts with `_x000D_` / newline artifacts.
statements.head()

Unnamed: 0,Meeting Date,Unnamed: 1
0,1997-03-25,_x000D_\n_x000D_\n\tThe Federal Open Market Co...
1,1998-09-29,_x000D_\nThe Federal Open Market Committee dec...
2,1998-10-15,_x000D_\nThe Federal Reserve today announced t...
3,1998-11-17,_x000D_\nThe Federal Reserve today announced t...
4,1999-05-18,_x000D_\nThe Federal Reserve released the foll...


In [8]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from gensim.models import KeyedVectors

In [9]:
# Use the first labelled sentence (row label 0) as the running example
# for the augmenter demonstrations in this notebook.
labelled_sentences_sample = labelled_sentences.loc[0, 'Sentence']
labelled_sentences_sample

'In addition, the Committee intends to purchase $600 billion of longer-term Treasury securities by the end of the second quarter of 2011, a pace of about $75 billion per month.'

In [17]:
# Word-embedding augmenter: substitutes words with similar words according to
# the pretrained word2vec vectors loaded from `model_path`.
aug = naw.WordEmbsAug(
    model_type='word2vec',
    model_path='data/GoogleNews-vectors-negative300.bin',
    action="substitute",
)
augmented_text = aug.augment(labelled_sentences_sample)

# Show the source sentence followed by the augmented result, one per line.
print(
    "Original:",
    labelled_sentences_sample,
    "Augmented Text:",
    augmented_text,
    sep="\n",
)

Original:
In addition, the Committee intends to purchase $600 billion of longer-term Treasury securities by the end of the second quarter of 2011, a pace of about $75 billion per month.
Augmented Text:
['In addition, same Committee aspires to borrow $ 600 #.##billion of indefinitely - term Treasury Collateralized_Debt_Obligations_CDOs by Rink_Rap end of the fourthplace quarter of 2011, a pace of Debunking_myths $ 75 ##.#bn per month.']


In [12]:
# Synonym augmenter: substitutes words with synonyms taken from WordNet.
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(labelled_sentences_sample)

# Show the source sentence followed by the augmented result, one per line.
print(
    "Original:",
    labelled_sentences_sample,
    "Augmented Text:",
    augmented_text,
    sep="\n",
)

Original:
In addition, the Committee intends to purchase $600 billion of longer-term Treasury securities by the end of the second quarter of 2011, a pace of about $75 billion per month.
Augmented Text:
['In accession, the Committee intends to purchase $ 600 jillion of longer - condition Treasury securities by the close of the second quarter of 2011, a pace of astir $ 75 billion per month.']


In [15]:
# OCR augmenter: substitutes characters with typical OCR misreadings
# (e.g. 'l' vs 'i'). n=2 requests two augmented variants of the sentence.
aug = nac.OcrAug()
augmented_text = aug.augment(labelled_sentences_sample, n=2)

# Show the source sentence followed by the augmented results, one per line.
print(
    "Original:",
    labelled_sentences_sample,
    "Augmented Text:",
    augmented_text,
    sep="\n",
)


Original:
In addition, the Committee intends to purchase $600 billion of longer-term Treasury securities by the end of the second quarter of 2011, a pace of about $75 billion per month.
Augmented Text:
['1n addition, the Committee intends to pokcha8e $ 600 billion uf longer - tekm Treasury securities by the end of the second qoaktek of 2011, a pace of about $ 75 billion pek month.', 'In addition, the Committee intends to pokcha8e $ 600 6il1i0n uf l0n9er - term Trea8oky securities by the end uf the second quarter 0f 2011, a pace of a6oot $ 75 billion per month.']


In [18]:
# Random word augmenter: swaps the positions of randomly chosen words
# in the sentence.
aug = naw.RandomWordAug(action="swap")
augmented_text = aug.augment(labelled_sentences_sample)

# Show the source sentence followed by the augmented result, one per line.
print(
    "Original:",
    labelled_sentences_sample,
    "Augmented Text:",
    augmented_text,
    sep="\n",
)

Original:
In addition, the Committee intends to purchase $600 billion of longer-term Treasury securities by the end of the second quarter of 2011, a pace of about $75 billion per month.
Augmented Text:
['In addition, the Committee to purchase intends $ 600 of billion longer - term Treasury securities the by end of the second quarter of 2011 a, of pace $ about per 75 month billion.']


In [19]:
# Function for augmenting the whole dataset by providing the augmenter

def augment_dataset(data, augmenter):
    """Apply ``augmenter`` to every sentence in ``data``.

    Args:
        data: Iterable of sentences (for example a pandas Series of strings).
        augmenter: Object exposing an ``augment(sentence)`` method, such as
            an nlpaug augmenter.

    Returns:
        A list with one entry per input sentence, in input order. Each entry
        is exactly what ``augmenter.augment`` returned for that sentence, so
        an augmenter that returns a list produces a list of lists.
    """
    return [augmenter.augment(sentence) for sentence in data]

In [23]:
# Augment every labelled sentence with random word swaps. `test` holds one
# entry per input sentence, in the same order as the "Sentence" column.
aug = naw.RandomWordAug(action="swap")
test = augment_dataset(labelled_sentences['Sentence'], aug)

# Preview only the first few results: displaying the whole list floods the
# notebook output with one entry for every sentence in the dataset.
test[:5]

[['Addition in, the intends Committee to purchase 600 $ billion longer of - term Treasury securities by end the second of the quarter of 2011 a, pace of $ about 75 billion month per.'],
 ['_x000D_ _x000D_ perceives The Committee that the upside downside and risks to the of attainment sustainable growth for the next few quarters are roughly. equal'],
 ['_x000D_ In _x000D_ these circumstances, although the stance of monetary policy currently is accommodative, the believes Committee that, the for foreseeable future, against the background of its long run goals of price stability and economic sustainable growth and of the information currently available, the risks are balanced respect with to the prospects for both goals. _x000D_ _x000D_ Voting for the FOMC monetary policy were action: Alan Greenspan, Chairman; William J. McDonough, Vice Chairman; Susan S. Bies; Roger W. Ferguson, Jr. ; Edward M. Gramlich; Jerry L Jordan. ; Robert D. McTeer, Jr. Mark; W. Olson; Anthony M. Santomero, and Ga