In [7]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.nn.functional import softmax
import torch
# retrieve text from PDF
from tqdm.notebook import tqdm
from PyPDF2 import PdfReader
from tqdm.notebook import tqdm
from textblob import TextBlob
import numpy as np
import spacy  
import re
from collections import Counter
from nltk.corpus import wordnet
import nltk
nltk.download('omw-1.4')
import pandas as pd
import numpy as np
from transformers import pipeline
import pdfplumber
import os
from difflib import SequenceMatcher
import pickle

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
pdf_texts = pickle.load(open("pdf_texts.pkl", "rb"))

In [3]:
#loading spacy
nlp = spacy.load("en_core_web_sm")

# tokenizing the reports
tokenized_reports = {}

# Loop through each report in pdf_texts
for report_name, report_text in pdf_texts.items():
    # Use the nlp.pipe method to tokenize the report_text
    tokenized_report = [doc for doc in nlp.pipe([report_text])]
    # Add tokenized_report to tokenized_reports
    tokenized_reports[report_name] = tokenized_report

In [4]:
def find_related_financial_keywords(tokenized_reports):
    # financial_terms.txt is a list of financial keywords from Tilburg University
    with open('financial_terms.txt', 'r') as f:
        financial_keywords = [line.strip() for line in f]

    # Find all synsets related to financial keywords
    financial_synsets = [wordnet.synsets(keyword) for keyword in financial_keywords]

    # Flatten the list of synsets
    financial_synsets = [synset for sublist in financial_synsets for synset in sublist]

    # Find all lemmas for these synsets
    financial_lemmas = [lemma.name() for synset in financial_synsets for lemma in synset.lemmas()]

    # Initialize a dictionary to store the most frequent financial keywords for each document
    financial_keywords_counts_dict = {}

    # Iterate over each document in tokenized_reports
    for report_name, tokenized_report in tokenized_reports.items():

        # For each tokenized report, get a list of all tokens
        all_tokens = [token.text for doc in tokenized_report for token in doc]

        # Filter tokens to only include those in the financial lemmas list
        financial_tokens = [token for token in all_tokens if token in financial_lemmas]

        # Use Counter to count each financial token's occurrences
        financial_token_counts = Counter(financial_tokens)

        # Only keep the top 10 most frequent financial words
        top_10_financial_words = financial_token_counts.most_common(10)

        # Add the result to the dictionary
        financial_keywords_counts_dict[report_name] = top_10_financial_words

    return financial_keywords_counts_dict

# Call the function
financial_keywords_counts = find_related_financial_keywords(tokenized_reports)

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

def find_top10_frequent_words(tokenized_reports):
    # Select the two reports
    reports = ['FINAL-Q1-23-Shareholder-Letter', 'FINAL-Q2-23-Shareholder-Letter']

    # Initialize a dictionary to store the most frequent words for each selected report
    frequent_words_counts_dict = {}

    # Iterate over each selected report
    for report_name in reports:
        if report_name in tokenized_reports:
            tokenized_report = tokenized_reports[report_name]

            # For each tokenized report, get a list of all tokens
            all_tokens = [token.text.lower() for doc in tokenized_report for token in doc]

            # Filter out punctuation, stop words, and other non-alphabetic tokens
            all_tokens = [token for token in all_tokens if token.isalpha() and not nlp.vocab[token].is_stop]

            # Use Counter to count each token's occurrences
            token_counts = Counter(all_tokens)

            # Only keep the top 10 most frequent words
            top_10_words = token_counts.most_common(10)

            # Add the result to the dictionary
            frequent_words_counts_dict[report_name] = top_10_words

    return frequent_words_counts_dict

# Call the function
frequent_words_counts = find_top10_frequent_words(tokenized_reports)

# # Print the results
# for report_name, frequent_words_counts in frequent_words_counts.items():
#     print(f"Report name: {report_name}")
#     for word, count in frequent_words_counts:
#         print(f"Word: {word}, Count: {count}")
#     print("\n")

In [6]:
# Initialize a dictionary to store the joined sentences for each report.
joined_sentences = {}

# Iterate over each report in pdf_texts.
for report_name, report_text in pdf_texts.items():

    # Split the report text into sentences.
    sentences = nlp(report_text).sents

    # Initialize a list to hold the tokenized sentences for this report.
    tokenized_report_sentences = []

    # Iterate over each sentence.
    for sentence in sentences:
        # Tokenize, lemmatize, and remove stop words and punctuation.
        tokenized = [token.lemma_ for token in sentence if not token.is_stop and not token.is_punct]
        # Add the tokenized sentence to the list.
        tokenized_report_sentences.append(tokenized)

    # Join each tokenized sentence into a single string, and store them in a list.
    joined_report_sentences = [' '.join(sentence) for sentence in tokenized_report_sentences]

    # Add the joined sentences for this report to joined_sentences.
    joined_sentences[report_name] = joined_report_sentences

# # Print the joined sentences for each report.
# for report_name, joined_report_sentences in joined_sentences.items():
#     print(f"Report name: {report_name}")
#     print(joined_report_sentences)
#     print("\n")

In [8]:
# Load the pre-trained BERT model and tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def analyze_sentiment_of_sentences_with_keywords(joined_sentences, keywords):
      
    # Initialize a dictionary to store the sentences and their sentiment scores for each report
    sentiment_results_dict = {}

    # Iterate over each report
    for report_name, sentences in joined_sentences.items():
        # Initialize a dictionary to store the sentiment analysis results for the current report
        report_sentiment_results = {keyword: [] for keyword in keywords}

        # Create a list to hold sentence chunks
        sentence_chunks = []

        for sentence in sentences:
            # If a sentence exceeds 512 characters, break it into chunks
            if len(sentence) >= 512:
                chunked_sentences = [sentence[i:i + 512] for i in range(0, len(sentence), 512)]
                sentence_chunks.extend(chunked_sentences)
            else:
                sentence_chunks.append(sentence)

        # Analyze the sentiment for each sentence chunk using BERT model
        for chunk in sentence_chunks:
            chunk_lower = chunk.lower()
            for keyword in keywords:
                if keyword in chunk_lower:
                    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
                    outputs = model(**inputs)
                    probs = softmax(outputs.logits, dim=1)
                    sentiment_result = {
                        "label": "positive" if probs[0][1] > probs[0][0] else "negative",
                        "score": probs[0][1].item()
                    }
                    
                    #
                    
                    
                    report_sentiment_results[keyword].append(sentiment_result)

        # Add the results to the dictionary
        sentiment_results_dict[report_name] = report_sentiment_results

    return sentiment_results_dict

keywords = ['revenue', 'forecast', 'profit']

# Call the function
sentences_with_keywords_and_sentiment = analyze_sentiment_of_sentences_with_keywords(joined_sentences, keywords)
print(sentences_with_keywords_and_sentiment)

{'COMBINED-Q4-17-Shareholder-Letter-FINAL': {'revenue': [{'label': 'negative', 'score': 0.4383440613746643}, {'label': 'negative', 'score': 0.4415714740753174}, {'label': 'negative', 'score': 0.4395172894001007}, {'label': 'negative', 'score': 0.44059914350509644}, {'label': 'negative', 'score': 0.44020137190818787}, {'label': 'negative', 'score': 0.443690687417984}, {'label': 'negative', 'score': 0.4391561448574066}, {'label': 'negative', 'score': 0.44409722089767456}, {'label': 'negative', 'score': 0.44124263525009155}, {'label': 'negative', 'score': 0.44046199321746826}, {'label': 'negative', 'score': 0.44320544600486755}], 'forecast': [{'label': 'negative', 'score': 0.4414482116699219}, {'label': 'negative', 'score': 0.43984851241111755}, {'label': 'negative', 'score': 0.44267868995666504}, {'label': 'negative', 'score': 0.4384211301803589}, {'label': 'negative', 'score': 0.4409177601337433}, {'label': 'negative', 'score': 0.43904659152030945}, {'label': 'negative', 'score': 0.4398

In [10]:
# Initialize a dictionary to hold total scores for each keyword in each report
total_scores = {report: {keyword: 0 for keyword in keywords} for report in sentences_with_keywords_and_sentiment.keys()}

# Calculate total scores for each keyword in each report
for report_name, keywords_dict in sentences_with_keywords_and_sentiment.items():
    for keyword, sentiments in keywords_dict.items():
        for sentiment in sentiments:
            # If the sentiment is POSITIVE, add the score
            # If the sentiment is NEGATIVE, subtract the score
            if sentiment['label'] == 'POSITIVE':
                total_scores[report_name][keyword] += sentiment['score']
            else:
                total_scores[report_name][keyword] -= sentiment['score']

# Convert the total_scores to a DataFrame
df = pd.DataFrame(total_scores).T
df.reset_index(inplace=True)
df.columns = ['report_name', 'revenue_score', 'forecast_score', 'profit_score']
# Add a new column 'total_score'
df['total_score'] = df['revenue_score'] + df['forecast_score'] + df['profit_score']

In [11]:
df

Unnamed: 0,report_name,revenue_score,forecast_score,profit_score,total_score
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,-4.852087,-3.956564,-4.410806,-13.219457
1,FINAL-Q1-18-Shareholder-Letter,-3.531702,-3.512161,-1.326354,-8.370217
2,FINAL-Q1-19-Shareholder-Letter,-4.401268,-2.6383,-1.323687,-8.363255
3,FINAL-Q1-20-Shareholder-Letter,-5.733111,-3.519706,-1.317164,-10.569981
4,FINAL-Q1-21-Shareholder-Letter,-4.397273,-4.402816,-0.439084,-9.239173
5,FINAL-Q1-22-Shareholder-Letter,-9.687912,-4.399493,-1.761608,-15.849013
6,Final-Q1-23-Shareholder-Letter,-11.44348,-3.968753,-2.639089,-18.051322
7,FINAL-Q2-18-Shareholder-Letter,-5.295144,-4.835024,-2.210923,-12.341092
8,FINAL-Q2-20-Shareholder-Letter-V3-with-Tables,-5.280199,-3.956858,0.0,-9.237057
9,FINAL-Q2-21-Shareholder-Letter,-4.397508,-4.395126,-1.32706,-10.119695
