Bert model without baseline model approach

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import torch
# retrieve text from PDF
from tqdm.notebook import tqdm
from PyPDF2 import PdfReader
from tqdm.notebook import tqdm
from textblob import TextBlob
import numpy as np
import spacy  
import re
from collections import Counter
from nltk.corpus import wordnet
import nltk
nltk.download('omw-1.4')
import pandas as pd
import numpy as np
from transformers import pipeline
import pdfplumber
import os
from difflib import SequenceMatcher
import pickle
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.nn.functional import softmax


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def tokenize_reports(pdf_texts):
    
    # Initialize a dictionary to store the joined sentences for each report.
    joined_sentences = {}

    # Iterate over each report in pdf_texts.
    for report_name, report_text in tqdm(pdf_texts.items()): # 18 seconds

        # Split the report text into sentences.
        sentences = nlp(report_text).sents

        # Initialize a list to hold the tokenized sentences for this report.
        tokenized_report_sentences = []

        # Iterate over each sentence.
        for sentence in sentences:
            # Tokenize, lemmatize, and remove stop words and punctuation.
            tokenized = [token.lemma_ for token in sentence if not token.is_stop and not token.is_punct]
            # Add the tokenized sentence to the list.
            tokenized_report_sentences.append(tokenized)

        # Join each tokenized sentence into a single string, and store them in a list.
        joined_report_sentences = [' '.join(sentence) for sentence in tokenized_report_sentences]

        # Add the joined sentences for this report to joined_sentences.
        joined_sentences[report_name] = joined_report_sentences 

    return joined_sentences

In [11]:
# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

def analyze_sentiment_bert(joined_sentences, keywords):
      
    # Initialize a dictionary to store the sentences and their sentiment scores for each report
    sentiment_results_dict = {}

    # Iterate over each report
    for report_name, sentences in joined_sentences.items():
        # Initialize a dictionary to store the sentiment analysis results for the current report
        report_sentiment_results = {keyword: [] for keyword in keywords}

        # Create a list to hold sentence chunks
        sentence_chunks = []

        for sentence in sentences:
            # If a sentence exceeds 512 tokens, break it into chunks
            if len(sentence) >= 512:
                chunked_sentences = [sentence[i:i + 512] for i in range(0, len(sentence), 512)]
                sentence_chunks.extend(chunked_sentences)
            else:
                sentence_chunks.append(sentence)

        # Analyze the sentiment for each sentence chunk using BERT model
        for chunk in sentence_chunks:
            chunk_lower = chunk.lower()
            for keyword in keywords:
                if keyword in chunk_lower:
                    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
                    outputs = model(**inputs)
                    probs = softmax(outputs.logits, dim=1)
                    sentiment_result = {
                        "label": "positive" if probs[0][1] > probs[0][0] else "negative",
                        "score": probs[0][1].item()
                    }
                    report_sentiment_results[keyword].append(sentiment_result)

        # Add the results to the dictionary
        sentiment_results_dict[report_name] = report_sentiment_results

    return sentiment_results_dict

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
if __name__ == "__main__":

    # Load nlp model
    nlp = spacy.load("en_core_web_sm")

    # Load pdf texts from the pickle file
    pdf_texts = pickle.load(open("pdf_texts.pkl", "rb"))

    # Tokenize the reports
    joined_sentences = tokenize_reports(pdf_texts)
    # Define keywords
    keywords = ['revenue', 'forecast', 'profit']
    # Analyze the sentiment of the sentences containing the keywords
    baseline_keyword_polarity_dict = analyze_sentiment_bert(joined_sentences, keywords)

    # Initialize a dictionary to hold total scores for each keyword in each report
    total_scores = {report: {keyword: 0 for keyword in keywords} for report in baseline_keyword_polarity_dict.keys()}

    # Calculate total scores for each keyword in each report
    for report_name, keywords_dict in baseline_keyword_polarity_dict.items():
        for keyword, sentiments in keywords_dict.items():
            for sentiment in sentiments:
                # If the sentiment is POSITIVE, add the score
                # If the sentiment is NEGATIVE, subtract the score
                if sentiment['label'] == 'POSITIVE':
                    total_scores[report_name][keyword] += sentiment['score']
                else:
                    total_scores[report_name][keyword] -= sentiment['score']

    # Convert the total_scores to a DataFrame
    baseline_keyword_polarity_df = pd.DataFrame(total_scores).T
    baseline_keyword_polarity_df.reset_index(inplace=True)
    baseline_keyword_polarity_df.columns = ['pdf_name', 'revenue_score', 'forecast_score', 'profit_score']
    baseline_keyword_polarity_df['polarity'] = baseline_keyword_polarity_df['revenue_score'] + baseline_keyword_polarity_df['forecast_score'] + baseline_keyword_polarity_df['profit_score']
    
    print(baseline_keyword_polarity_df.head())
    
    # export dataframe to csv
    baseline_keyword_polarity_df.to_csv('baseline_keyword_polarity.csv', index=False)

  0%|          | 0/50 [00:00<?, ?it/s]

                                  pdf_name  revenue_score  forecast_score  \
0  COMBINED-Q4-17-Shareholder-Letter-FINAL      -3.411952       -2.958613   
1           FINAL-Q1-18-Shareholder-Letter      -2.613802       -2.476859   
2           FINAL-Q1-19-Shareholder-Letter      -2.996670       -1.924049   
3           FINAL-Q1-20-Shareholder-Letter      -4.502948       -2.797396   
4           FINAL-Q1-21-Shareholder-Letter      -3.343452       -3.391523   

   profit_score  polarity  
0     -3.099139 -9.469704  
1     -0.896289 -5.986951  
2     -0.818484 -5.739203  
3     -1.002262 -8.302606  
4     -0.373031 -7.108006  


BERT Model using baseline model approach

In [None]:
#loading spacy
nlp = spacy.load("en_core_web_sm")

# tokenizing the reports
tokenized_reports = {}

# Loop through each report in pdf_texts
for report_name, report_text in pdf_texts.items():
    # Use the nlp.pipe method to tokenize the report_text
    tokenized_report = [doc for doc in nlp.pipe([report_text])]
    # Add tokenized_report to tokenized_reports
    tokenized_reports[report_name] = tokenized_report