In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

import torch

# Location of the fine-tuned checkpoint; both the tokenizer and the model are restored from it.
model_path = "C:/Users/MCOB PHD 14/Desktop/bbFinal/Notebooks/best_model_epoch_8"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore the tokenizer and the sequence classifier from the checkpoint directory.
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

# The notebook only runs inference, so put the model in evaluation mode.
model.eval()

# Move the weights to the selected device. This stays the last expression of the
# cell so the notebook still displays the module summary.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [5]:
from tqdm import tqdm
import os

# Folder that holds the Beige Book chunk .txt files
chunks_dir = "C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/chunks"

# Full path of every .txt entry in that folder, in the order os.listdir returns them
chunk_files = [
    os.path.join(chunks_dir, name)
    for name in os.listdir(chunks_dir)
    if name.endswith(".txt")
]

# Parallel lists: chunk_texts[i] is the content of the file named chunk_filenames[i]
chunk_texts, chunk_filenames = [], []

print("Reading chunk files...")
for chunk_path in tqdm(chunk_files, desc="Processing Files"):
    with open(chunk_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    chunk_texts.append(contents)
    chunk_filenames.append(os.path.basename(chunk_path))


Reading chunk files...


Processing Files: 100%|██████████| 29521/29521 [03:33<00:00, 138.18it/s] 


In [6]:
# Tokenize the chunks
def tokenize_chunks(texts, tokenizer, max_length=512):
    """Encode ``texts`` with ``tokenizer`` and return whatever the tokenizer returns.

    Args:
        texts: The texts to encode; passed to the tokenizer as its first argument.
        tokenizer: A callable tokenizer accepting the keyword options set below.
        max_length: Value forwarded as the tokenizer's ``max_length`` option.

    Returns:
        The tokenizer's result, requested with padding and truncation enabled
        and ``return_tensors="pt"``.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Tokenize the chunk texts
# Every chunk is encoded in this single call, using the default max_length of 512.
# The prediction loop later reads the "input_ids" and "attention_mask" entries
# of the result.
chunk_encodings = tokenize_chunks(chunk_texts, tokenizer)


In [8]:
from tqdm import tqdm
import torch
import numpy as np

# One 0/1 label vector per chunk, accumulated batch by batch
chunk_predictions = []

# Number of chunks sent through the model per forward pass
batch_size = 16  # Adjust based on your system's memory capacity
num_chunks = len(chunk_encodings["input_ids"])

print("Predicting labels for chunks...")
for start in tqdm(range(0, num_chunks, batch_size), desc="Processing Predictions"):
    # Slice this batch out of the encodings and move it to the model's device
    batch_rows = slice(start, start + batch_size)
    batch_input_ids = chunk_encodings["input_ids"][batch_rows].to(device)
    batch_attention_mask = chunk_encodings["attention_mask"][batch_rows].to(device)

    # Forward pass with gradient tracking switched off
    with torch.no_grad():
        logits = model(batch_input_ids, attention_mask=batch_attention_mask).logits
        # Sigmoid is applied to each logit separately, so every label gets its own probability
        probabilities = torch.sigmoid(logits).cpu().numpy()
        # A label is switched on when its probability exceeds 0.5
        batch_labels = (probabilities > 0.5).astype(int)
        chunk_predictions.extend(batch_labels)


Predicting labels for chunks...


Processing Predictions: 100%|██████████| 1846/1846 [2:09:43<00:00,  4.22s/it]  


In [None]:
import pandas as pd

# Topic names used as column labels for the prediction vectors.
# NOTE(review): assumes this order matches the order of the model's output labels - confirm against training.
topics = [
    "Employment",
    "Prices",
    "Consumption",
    "Manufacturing",
    "Construction",
    "Lending",
    "Uncertainty",
    "Resources",
    "Transportation",
    "Outlook",
    "Labor",
    "Capital",
    "Trade",
]

# One row per chunk: the 0/1 topic columns, with the source filename and the raw
# text placed in front of them for readability
results_df = pd.DataFrame(chunk_predictions, columns=topics)
results_df.insert(0, "Filename", chunk_filenames)
results_df.insert(1, "Text", chunk_texts)

# Write the table to disk without the index column
results_df.to_csv("chunk_predictions.csv", index=False, encoding="utf-8")
print("Predictions saved to chunk_predictions.csv")


Unnamed: 0,Filename,Text,Employment,Prices,Consumption,Manufacturing,Construction,Lending,Uncertainty,Resources,Transportation,Outlook,Labor,Capital,Trade
0,1970_at (1)_chunk_1.txt,"December 9 , 1970 The economy of the Southeast...",1,0,0,0,0,0,0,0,0,0,0,0,0
1,1970_at (1)_chunk_2.txt,be dropping 300 employees by the end of the ye...,1,1,1,0,0,0,0,0,0,0,0,0,0
2,1970_at (1)_chunk_3.txt,percent above the year-ago level . This prompt...,0,0,0,0,0,0,0,1,0,0,0,0,0
3,1970_at (1)_chunk_4.txt,"lowering of interest rates '' , '' improving c...",0,1,0,0,0,0,0,0,0,0,0,0,0
4,1970_at (2)_chunk_1.txt,"November 11 , 1970 Our directors report a weak...",1,0,1,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29516,2024_sl (8)_chunk_1.txt,"Beige Book Report : St Louis January 17 , 2024...",0,0,0,0,0,1,0,0,0,0,0,0,0
29517,2024_sl (8)_chunk_2.txt,modestly since our previous report . Multiple ...,0,1,1,0,0,0,0,0,0,0,0,0,0
29518,2024_sl (8)_chunk_3.txt,numbers in early 2024 . Manufacturing Manufact...,0,0,0,0,0,0,0,0,0,0,1,0,0
29519,2024_sl (8)_chunk_4.txt,"Rock metros . In the Louisville metro area , y...",0,0,0,0,0,1,0,1,0,0,0,0,0


In [14]:
# Load the GDP series and parse its DATE column as timestamps.
#   Series: real GDP, Percent Change from Preceding Period, Seasonally Adjusted Annual Rate
#   Source: https://fred.stlouisfed.org/series/A191RL1Q225SBEA
GDP = pd.read_csv("real_GDP_change.csv").assign(
    DATE=lambda frame: pd.to_datetime(frame["DATE"])
)

In [37]:
import pandas as pd

# Step 1: Extract Year and Report Number
# The year is the first run of four digits in the filename; the report number is
# the integer inside the parentheses (e.g. "1970_at (1)_chunk_1.txt" -> 1970, 1).
filenames = results_df["Filename"]
results_df["Year"] = filenames.str.extract(r"(\d{4})", expand=False).astype(int)
results_df["Report_Number"] = filenames.str.extract(r"\((\d+)\)", expand=False).astype(int)

# Step 2: Map Report Numbers to Quarters
def get_quarter(year, report_number):
    """Map a report's (year, report number) pair to a calendar-quarter label.

    Report numbers run backwards through the year: report 1 is the latest
    report of the year and the highest number is the earliest. The 1970-1980
    tables below follow that rule, and the notebook's sample output agrees
    ("1970_at (1)" is dated December 9, 1970; "2024_sl (8)" is dated
    January 17, 2024).

    The 1981-and-later table previously assigned reports 3-4 to Q1 and 7-8 to
    Q3, which contradicts that descending order. It now assigns 7-8 to Q1 and
    3-4 to Q3.

    Args:
        year: Calendar year of the report.
        report_number: Number taken from the parentheses in the filename.

    Returns:
        "Q1", "Q2", "Q3" or "Q4", or "Unknown" when the year is before 1970 or
        the report number is not listed for that year.
    """
    # Each era has its own number of reports per year, hence its own table.
    if 1970 <= year <= 1978:
        schedule = {
            "Q4": (1, 2, 3),
            "Q3": (4, 5, 6),
            "Q2": (7, 8, 9),
            "Q1": (10, 11, 12),
        }
    elif year == 1979:
        schedule = {"Q4": (1, 2), "Q3": (3, 4), "Q2": (5, 6, 7), "Q1": (8, 9)}
    elif year == 1980:
        schedule = {"Q4": (1, 2, 3), "Q3": (4, 5), "Q2": (6, 7, 8), "Q1": (9, 10)}
    elif year >= 1981:
        # Descending order, like every other era: 1-2 are the last reports of
        # the year and 7-8 are the first.
        schedule = {"Q4": (1, 2), "Q3": (3, 4), "Q2": (5, 6), "Q1": (7, 8)}
    else:
        schedule = {}

    for quarter, report_numbers in schedule.items():
        if report_number in report_numbers:
            return quarter
    return "Unknown"

def _quarter_for_row(row):
    """Return the quarter label for one row from its Year and Report_Number."""
    return get_quarter(row["Year"], row["Report_Number"])


# Label every chunk with the quarter of the report it came from
results_df["Quarter"] = results_df.apply(_quarter_for_row, axis=1)

# Step 3: Convert Quarters to Date-Time Format
def quarter_to_date(year, quarter):
    """Return the first day of a quarter as a "<year>-MM-01" string.

    Args:
        year: Year to place at the front of the date string.
        quarter: One of "Q1", "Q2", "Q3", "Q4".

    Returns:
        The date string for the first day of the quarter's first month,
        e.g. (1970, "Q2") -> "1970-04-01".

    Raises:
        KeyError: If ``quarter`` is not one of the four labels above.
    """
    first_day_by_quarter = {
        "Q1": "01-01",
        "Q2": "04-01",
        "Q3": "07-01",
        "Q4": "10-01",
    }
    return "{}-{}".format(year, first_day_by_quarter[quarter])

# Build each row's quarter-start date string, then parse the column as timestamps
quarter_start_strings = results_df.apply(
    lambda row: quarter_to_date(row["Year"], row["Quarter"]), axis=1
)
results_df["DATE"] = pd.to_datetime(quarter_start_strings)

# Step 4: Aggregate Data by Year and Quarter
# Sum the 0/1 topic flags over all chunks that share a DATE
topic_columns = [
    "Employment",
    "Prices",
    "Consumption",
    "Manufacturing",
    "Construction",
    "Lending",
    "Uncertainty",
    "Resources",
    "Transportation",
    "Outlook",
    "Labor",
    "Capital",
    "Trade",
]

topic_totals = results_df.groupby("DATE")[topic_columns].sum()
# Turn DATE back into an ordinary column so it can serve as a merge key
aggregated_df = topic_totals.reset_index()


In [None]:
# Attach the GDP series to the quarterly topic counts. The left join on DATE keeps
# every aggregated quarter, including any that have no matching GDP row.
merged_df = pd.merge(aggregated_df, GDP, on="DATE", how="left")

# Save the merged data to a CSV file
#merged_df.to_csv("merged_data.csv", index=False)

Unnamed: 0,DATE,Employment,Prices,Consumption,Manufacturing,Construction,Lending,Uncertainty,Resources,Transportation,Outlook,Labor,Capital,Trade,real_GDP_change
0,1970-04-01,24,21,30,7,7,18,9,3,6,13,28,14,1,0.6
1,1970-07-01,43,25,29,10,7,24,3,6,3,10,26,23,3,3.7
2,1970-10-01,33,24,40,12,8,29,4,7,3,5,42,12,0,-4.2
3,1971-01-01,21,29,39,8,5,30,6,8,1,10,22,17,1,11.3
4,1971-04-01,26,22,29,6,4,28,7,8,1,7,20,14,1,2.2


In [43]:
# Display summary statistics for the merged topic-count / GDP frame
merged_df.describe()

Unnamed: 0,DATE,Employment,Prices,Consumption,Manufacturing,Construction,Lending,Uncertainty,Resources,Transportation,Outlook,Labor,Capital,Trade,real_GDP_change
count,217,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0
mean,1997-04-01 23:40:05.529953920,12.239631,16.175115,26.617512,8.917051,20.894009,32.658986,9.880184,13.516129,7.082949,4.359447,16.447005,6.069124,5.797235,2.86129
min,1970-04-01 00:00:00,0.0,0.0,1.0,0.0,3.0,11.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-28.1
25%,1983-10-01 00:00:00,6.0,8.0,18.0,3.0,13.0,26.0,4.0,9.0,1.0,1.0,6.0,2.0,2.0,1.4
50%,1997-04-01 00:00:00,10.0,13.0,26.0,7.0,18.0,31.0,7.0,13.0,4.0,3.0,11.0,4.0,3.0,3.0
75%,2010-10-01 00:00:00,16.0,23.0,34.0,12.0,26.0,38.0,13.0,16.0,9.0,6.0,23.0,8.0,6.0,4.4
max,2024-07-01 00:00:00,56.0,69.0,67.0,42.0,57.0,79.0,56.0,58.0,68.0,30.0,77.0,24.0,59.0,35.2
std,,9.207369,12.543507,11.883161,8.40924,11.858846,10.097847,9.419465,7.063926,9.97323,5.026763,14.310971,5.433079,8.967812,4.409184
