#  Text Preprocessing for Financial Sentiment Analysis


## 1. Setup and Library Imports <a class="anchor" id="setup"></a>



In [21]:
import re
import pandas as pd
import numpy as np
import nltk
import spacy
import torch
from nltk.corpus import stopwords
from transformers import AutoTokenizer
import logging

# Configure logging so every pipeline step reports progress with a timestamp.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Fetch the NLTK stopword corpus. nltk.download() reports failures by
# returning False rather than raising, so the return value is checked too;
# otherwise a failed download would still be logged as a success.
try:
    if nltk.download('stopwords', quiet=True):
        logging.info("NLTK stopwords downloaded.")
    else:
        logging.error("Error downloading NLTK stopwords: download reported failure")
except Exception as e:
    logging.error(f"Error downloading NLTK stopwords: {e}")

# Load the spaCy English pipeline once. `nlp` is left as None when the model
# is not installed, so later cells can test for it.
try:
    nlp = spacy.load('en_core_web_sm')
    logging.info("SpaCy 'en_core_web_sm' loaded.")
except Exception as e:
    logging.error(f"Error loading SpaCy model: {e}. Please run 'python -m spacy download en_core_web_sm'")
    nlp = None

# English stopwords as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))

# NOTE(review): the dataset preview in this notebook shows "profit rose"
# sentences labelled 2, which this map calls 'negative'. Confirm the encoding
# of the 'label' column before using this map to convert or decode labels.
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}

# Load the FinBERT tokenizer. `tokenizer` is left as None on failure so the
# tokenization step can be skipped instead of crashing.
try:
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    logging.info("FinBERT tokenizer loaded.")
except Exception as e:
    logging.error(f"Error loading FinBERT tokenizer: {e}")
    tokenizer = None


2025-07-09 23:40:59,010 - INFO - NLTK stopwords downloaded.
2025-07-09 23:40:59,014 - ERROR - Error loading SpaCy model: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.. Please run 'python -m spacy download en_core_web_sm'
2025-07-09 23:41:00,071 - INFO - FinBERT tokenizer loaded.


## 2. Load Dataset <a class="anchor" id="load_dataset"></a>



In [22]:
file_path = r"C:\Users\teamp\Desktop\Financial Sentiment Analysis\data\raw\financial_phrasebank.csv"

# Only the read itself is guarded. A failure while logging or previewing the
# frame must not be reported as a load error, nor replace an already-loaded
# dataset with an empty one.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    logging.error(f"Error: The file {file_path} was not found. Please upload it or check the path.")
    df = pd.DataFrame()  # empty frame lets later steps detect the failure via df.empty
except Exception as e:
    logging.error(f"An error occurred while loading the dataset: {e}")
    df = pd.DataFrame()
else:
    logging.info(f"Dataset loaded successfully from {file_path}. Shape: {df.shape}")
    display(df.head())

2025-07-09 23:41:00,097 - INFO - Dataset loaded successfully from C:\Users\teamp\Desktop\Financial Sentiment Analysis\data\raw\financial_phrasebank.csv. Shape: (2264, 2)


Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,"For the last quarter of 2010 , Componenta 's n...",2
2,"In the third quarter of 2010 , net sales incre...",2
3,Operating profit rose to EUR 13.1 mn from EUR ...,2
4,"Operating profit totalled EUR 21.1 mn , up fro...",2


## 3. Text Cleaning <a class="anchor" id="text_cleaning"></a>



In [23]:
# Compiled once at import time; clean_text is applied to every row of the dataset.
_UNWANTED_CHARS_RE = re.compile(r"[^a-zA-Z0-9$€%.,!? ]+")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """
    Clean a text string by removing unwanted characters and normalizing whitespace.

    Every run of characters outside the allow-list (ASCII letters, digits,
    ``$ € % . , ! ?`` and space) is replaced by a single space, then runs of
    whitespace are collapsed and the result is stripped.

    NOTE(review): the allow-list also drops apostrophes, hyphens, plus/minus
    signs and non-ASCII letters (e.g. "Componenta 's" becomes "Componenta s").
    Confirm that this is intended for the downstream model.

    Args:
        text (str): The input text. Missing values (``None`` or float NaN)
            yield an empty string instead of the literal text "None"/"nan";
            any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text string.
    """
    # `text != text` is true only for NaN, which is how pandas marks missing cells.
    if text is None or (isinstance(text, float) and text != text):
        logging.warning("Missing value detected for cleaning. Returning empty string.")
        return ""

    if not isinstance(text, str):
        logging.warning(f"Non-string input detected for cleaning: {type(text)}. Converting to string.")
        text = str(text)

    # Keep important symbols, replace everything else with a space.
    text = _UNWANTED_CHARS_RE.sub(" ", text)
    # Collapse the spaces introduced above together with any original runs.
    return _WHITESPACE_RE.sub(" ", text).strip()

# Clean the 'sentence' column in place. An empty frame (the fallback used when
# loading fails) has nothing to clean, so the step is skipped with a warning.
if df.empty:
    logging.warning("DataFrame is empty. Skipping text cleaning.")
else:
    logging.info("Applying text cleaning to the 'sentence' column...")
    # Element-wise application of clean_text to every sentence.
    df['sentence'] = df['sentence'].map(clean_text)
    display(df.head())


2025-07-09 23:41:00,137 - INFO - Applying text cleaning to the 'sentence' column...


Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,"For the last quarter of 2010 , Componenta s ne...",2
2,"In the third quarter of 2010 , net sales incre...",2
3,Operating profit rose to EUR 13.1 mn from EUR ...,2
4,"Operating profit totalled EUR 21.1 mn , up fro...",2


## 4. Tokenization <a class="anchor" id="tokenization"></a>



In [24]:
# Length every sentence is padded or truncated to during tokenization.
MAX_SEQ_LENGTH = 64

# Tokenize all sentences in one batch. `tokens` ends up as None whenever the
# step is skipped or fails, so later cells can test for it.
# The tokenizer is compared against its None fallback explicitly rather than
# relying on the truthiness of the tokenizer object.
if tokenizer is not None and not df.empty:
    logging.info("Tokenizing the dataset...")
    try:
        tokens = tokenizer(
            df['sentence'].tolist(),
            padding='max_length',
            truncation=True,
            max_length=MAX_SEQ_LENGTH,
            return_tensors='pt',
        )
        logging.info(f"Tokens shape: {tokens['input_ids'].shape}")
    except Exception as e:
        logging.error(f"Error during tokenization: {e}")
        tokens = None
else:
    logging.warning("Tokenizer not loaded or DataFrame is empty. Skipping tokenization.")
    tokens = None


2025-07-09 23:41:00,225 - INFO - Tokenizing the dataset...
2025-07-09 23:41:00,409 - INFO - Tokens shape: torch.Size([2264, 64])


## 5. Prepare Labels Tensor <a class="anchor" id="prepare_labels"></a>



In [25]:
# Build the target tensor from the 'label' column. `labels` is None whenever
# the step is skipped or the conversion fails.
if df.empty:
    logging.warning("DataFrame is empty. Skipping label preparation.")
    labels = None
else:
    logging.info("Preparing labels tensor...")
    try:
        # The column is used as-is: no label_map conversion happens here, so
        # it must already hold integer class ids (as the preview shows).
        labels = torch.tensor(df['label'].values, dtype=torch.long)
        logging.info(f"Labels tensor shape: {labels.shape}")
    except Exception as e:
        logging.error(f"Error preparing labels tensor: {e}")
        labels = None


2025-07-09 23:41:00,432 - INFO - Preparing labels tensor...
2025-07-09 23:41:00,433 - INFO - Labels tensor shape: torch.Size([2264])


In [26]:
output_path = r"C:\Users\teamp\Desktop\Financial Sentiment Analysis\data\processed\cleaned_data.csv"

import os

# Like the other steps, do nothing when the frame is empty: writing it would
# overwrite any previously processed file with an empty CSV and still report
# success.
if df.empty:
    logging.warning("DataFrame is empty. Skipping save.")
else:
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save DataFrame to CSV without the index
    df.to_csv(output_path, index=False)

    print(f"Data saved successfully at: {output_path}")


Data saved successfully at: C:\Users\teamp\Desktop\Financial Sentiment Analysis\data\processed\cleaned_data.csv


In [27]:
import re

def check_nlp_data_cleanliness(df, text_col):
    """Print a short cleanliness report for one text column of a DataFrame.

    The report lists the number of null entries, the number of entries that
    are empty or whitespace-only once converted to ``str``, the number of
    string entries containing at least one character that is not an ASCII
    letter, digit or whitespace (so ordinary punctuation such as ``.`` or
    ``%`` counts as "special"), and the first five values of the column.

    Args:
        df: DataFrame to inspect.
        text_col: Name of the column holding the text.

    Returns:
        None. The report is written to stdout. If ``text_col`` is not a
        column of ``df``, an error message and the available column names
        are printed instead.
    """
    if text_col not in df.columns:
        print(f"Error: Column '{text_col}' not found in DataFrame.")
        print(f"Available columns: {df.columns.tolist()}")
        return

    column = df[text_col]
    non_plain = re.compile(r'[^a-zA-Z0-9\s]')

    # 1. Null or missing values
    missing = column.isnull().sum()

    # 2. Empty or whitespace-only strings (after str conversion)
    blank = column.astype(str).str.strip().eq('').sum()

    # 3. String entries with at least one special character; non-strings never count
    flagged = sum(
        1
        for value in column
        if isinstance(value, str) and non_plain.search(value)
    )

    # 4. Assemble the report, ending with a few texts for manual inspection
    report = [
        f"Null or missing texts: {missing}",
        f"Empty or whitespace-only texts: {blank}",
        f"Texts containing special characters: {flagged}",
        "Sample texts:",
    ]
    report.extend(
        f"{position}: {sample}"
        for position, sample in enumerate(column.head(5).tolist(), 1)
    )
    for line in report:
        print(line)

# Run the cleanliness report on the text column that was cleaned above.
text_column = 'sentence'  # name of the DataFrame column that holds the text
check_nlp_data_cleanliness(df, text_column)


Null or missing texts: 0
Empty or whitespace-only texts: 0
Texts containing special characters: 2259
Sample texts:
1: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
2: For the last quarter of 2010 , Componenta s net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre tax profit from a pre tax loss of EUR7m .
3: In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .
4: Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales .
5: Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales .
