# beigeBERT

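This notebook loads the fine-tuned beigeBERT model (a RoBERTa sequence classifier with three sentiment classes: Negative, Mixed, and Positive), runs it on a held-out test split of the manually scored Beige Book texts, and reports classification metrics.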

### Load the labelled data and create the test split

In [2]:
# Import necessary libraries
import os
import shutil

import pandas as pd
import torch
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Read the manually assigned sentiment scores. Later code in this cell relies on
# the `human_sentiment` and `file_names` columns of this table.
# NOTE: despite its name, `excel_path` points at a CSV file.
excel_path = r"C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/manual_sentiment.csv"
sentiment_data = pd.read_csv(filepath_or_buffer=excel_path)

# Define the label function
def label_sentiment(score, negative_max=-0.3, mixed_max=0.2):
    """Map a numeric sentiment score to a three-class integer label.

    Args:
        score: Numeric sentiment score.
        negative_max: Largest score (inclusive) that is labelled Negative.
        mixed_max: Largest score (inclusive) that is labelled Mixed; any
            score above it is labelled Positive.

    Returns:
        int: 0 (Negative), 1 (Mixed) or 2 (Positive).

    Raises:
        ValueError: If ``score`` is NaN. Without this check a NaN fails both
            threshold comparisons and would silently be labelled Positive.
    """
    # NaN is the only value that is not equal to itself
    if score != score:
        raise ValueError("label_sentiment received a NaN sentiment score")
    if score <= negative_max:
        return 0  # Negative
    if score <= mixed_max:
        return 1  # Mixed
    return 2  # Positive

# Attach the three-class integer label derived from each manual score
sentiment_data['label'] = sentiment_data['human_sentiment'].apply(label_sentiment)

# Folder that holds the text files
text_files_dir = r"C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/selected_chunks2"

# Read every .txt file in that folder into a {file name: contents} mapping
text_data = {}
for filename in (entry for entry in os.listdir(text_files_dir) if entry.endswith('.txt')):
    with open(os.path.join(text_files_dir, filename), 'r', encoding='utf-8') as file:
        text_data[filename] = file.read()

# Turn the mapping into a two-column frame and join it to the scores on file name
text_df = pd.DataFrame(
    {'file_names': list(text_data.keys()), 'text': list(text_data.values())}
)
combined_data = sentiment_data.merge(text_df, on='file_names')

# Stratified 80/20 train/test split with a fixed seed
train_data, test_data = train_test_split(
    combined_data,
    test_size=0.2,
    random_state=42,
    stratify=combined_data['label'],
)

  from .autonotebook import tqdm as notebook_tqdm


### Load beigeBERT

In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

# Location of the saved model and tokenizer
saved_model_path = 'C:/Users/MCOB PHD 14/Desktop/bbFinal/Notebooks/RoBERTa_three_validated'

# Restore the classifier and its matching tokenizer from that location
model = RobertaForSequenceClassification.from_pretrained(saved_model_path)
tokenizer = RobertaTokenizer.from_pretrained(saved_model_path)

# Put the model in evaluation mode for inference; as the last expression of
# the cell this also displays the model architecture
model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [4]:
# Define a function to tokenize and predict sentiment
def predict_sentiment_with_status(texts):
    """Predict a sentiment class id for each text, one text at a time.

    Uses the notebook-level ``tokenizer`` and ``model`` objects. Progress is
    reported with a tqdm bar plus a status line after every 10 texts and
    after the final text.

    Args:
        texts: Sequence of strings to classify (must support ``len``).

    Returns:
        list[int]: For each text, in input order, the index of the largest
        logit produced by the model.
    """
    predictions = []
    total_texts = len(texts)
    # Run on whatever device the model lives on so CPU and GPU both work
    device = model.device

    for idx, text in enumerate(tqdm(texts, desc="Predicting Sentiment", ncols=100), start=1):
        # Each text is a batch of one, so padding it out to 512 tokens would
        # only add masked positions and wasted compute. Truncation still caps
        # long texts at the 512-token limit.
        inputs = tokenizer(
            text,
            return_tensors="pt",       # Return as PyTorch tensors
            truncation=True,           # Truncate longer sequences
            max_length=512             # Set maximum length
        ).to(device)

        # Inference only: no gradients needed
        with torch.no_grad():
            logits = model(**inputs).logits

        # The predicted label is the class with the highest logit
        predictions.append(torch.argmax(logits, dim=1).item())

        # Status update for every 10 texts; tqdm.write prints the line
        # without breaking the active progress bar
        if idx % 10 == 0 or idx == total_texts:
            tqdm.write(f"Processed {idx}/{total_texts} texts")

    return predictions

In [9]:
from tqdm import tqdm

# Predict a class id for every text in the test split
test_texts = test_data['text'].tolist()  # Convert text column to list

# Map the numerical labels back to the class names
label_map = {0: "Negative", 1: "Mixed", 2: "Positive"}

# Rebind with assign() rather than writing columns into the frame returned by
# train_test_split: that frame is a selection of combined_data, and assigning
# columns onto it can raise pandas' SettingWithCopyWarning.
test_data = test_data.assign(predicted_label=predict_sentiment_with_status(test_texts))
test_data = test_data.assign(predicted_class=test_data['predicted_label'].map(label_map))

# Display the DataFrame with predictions
print(test_data[['text', 'predicted_label', 'predicted_class']])

Predicting Sentiment:   5%|██                                      | 10/200 [00:04<01:33,  2.03it/s]

Processed 10/200 texts


Predicting Sentiment:  10%|████                                    | 20/200 [00:09<01:23,  2.15it/s]

Processed 20/200 texts


Predicting Sentiment:  15%|██████                                  | 30/200 [00:14<01:18,  2.15it/s]

Processed 30/200 texts


Predicting Sentiment:  20%|████████                                | 40/200 [00:18<01:13,  2.18it/s]

Processed 40/200 texts


Predicting Sentiment:  25%|██████████                              | 50/200 [00:23<01:10,  2.14it/s]

Processed 50/200 texts


Predicting Sentiment:  30%|████████████                            | 60/200 [00:28<01:06,  2.10it/s]

Processed 60/200 texts


Predicting Sentiment:  35%|██████████████                          | 70/200 [00:32<01:00,  2.14it/s]

Processed 70/200 texts


Predicting Sentiment:  40%|████████████████                        | 80/200 [00:37<00:54,  2.19it/s]

Processed 80/200 texts


Predicting Sentiment:  45%|██████████████████                      | 90/200 [00:42<00:51,  2.15it/s]

Processed 90/200 texts


Predicting Sentiment:  50%|███████████████████▌                   | 100/200 [00:46<00:47,  2.12it/s]

Processed 100/200 texts


Predicting Sentiment:  55%|█████████████████████▍                 | 110/200 [00:51<00:41,  2.15it/s]

Processed 110/200 texts


Predicting Sentiment:  60%|███████████████████████▍               | 120/200 [00:56<00:35,  2.23it/s]

Processed 120/200 texts


Predicting Sentiment:  65%|█████████████████████████▎             | 130/200 [01:00<00:32,  2.16it/s]

Processed 130/200 texts


Predicting Sentiment:  70%|███████████████████████████▎           | 140/200 [01:05<00:27,  2.19it/s]

Processed 140/200 texts


Predicting Sentiment:  75%|█████████████████████████████▎         | 150/200 [01:09<00:22,  2.19it/s]

Processed 150/200 texts


Predicting Sentiment:  80%|███████████████████████████████▏       | 160/200 [01:14<00:17,  2.25it/s]

Processed 160/200 texts


Predicting Sentiment:  85%|█████████████████████████████████▏     | 170/200 [01:18<00:13,  2.19it/s]

Processed 170/200 texts


Predicting Sentiment:  90%|███████████████████████████████████    | 180/200 [01:23<00:09,  2.19it/s]

Processed 180/200 texts


Predicting Sentiment:  95%|█████████████████████████████████████  | 190/200 [01:28<00:04,  2.19it/s]

Processed 190/200 texts


Predicting Sentiment: 100%|███████████████████████████████████████| 200/200 [01:32<00:00,  2.16it/s]

Processed 200/200 texts
                                                  text  predicted_label  \
528  reshaping their job mix . Longer-term , many r...                2   
491  December 8 , 1999 The Fifth District economy c...                2   
888  over the year in Massachusetts , Boston , and ...                1   
899  April 17 , 2019 Summary of Economic Activity S...                2   
960  economic conditions visit : https : //www.atla...                1   
..                                                 ...              ...   
672  , you do n't get the money . '' Looking ahead ...                0   
340  split between those expecting the trade balanc...                0   
244  activity has been `` unexpectedly quiet '' sin...                1   
471  outlook for 1998 , Third District bankers see ...                2   
550  for capital investment in the industry , dampe...                1   

    predicted_class  
528        Positive  
491        Positive  
888      




In [11]:
# Convert the integer gold labels to class names so they are comparable with
# predicted_class. The lookup falls back to the existing value, so re-running
# this cell leaves already-converted labels untouched instead of turning them
# into NaN.
label_names = {0: "Negative", 1: "Mixed", 2: "Positive"}
test_data = test_data.assign(
    label=test_data['label'].map(lambda value: label_names.get(value, value))
)

# Display the classification report
print(classification_report(test_data['label'], test_data['predicted_class']))

              precision    recall  f1-score   support

       Mixed       0.64      0.65      0.65        83
    Negative       0.63      0.79      0.70        39
    Positive       0.79      0.68      0.73        78

    accuracy                           0.69       200
   macro avg       0.69      0.71      0.69       200
weighted avg       0.70      0.69      0.69       200



In [12]:
# Rename predicted_class to Sentiment_RoBERTa (the loaded model is a RoBERTa
# classifier). Rebinding avoids mutating the frame in place.
test_data = test_data.rename(columns={'predicted_class': 'Sentiment_RoBERTa'})

# Save the test_data DataFrame to a CSV file
test_data.to_csv('RoBERTa_predictions.csv', index=False)