In [1]:
# retrieve text from PDF
from tqdm.notebook import tqdm
from PyPDF2 import PdfReader
from tqdm.notebook import tqdm
from textblob import TextBlob
import numpy as np
import spacy  
import re
from collections import Counter
from nltk.corpus import wordnet
import nltk
nltk.download('omw-1.4')
import pandas as pd
import numpy as np
from transformers import pipeline
import pdfplumber
import os
from difflib import SequenceMatcher

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Я\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#### Scrapping last 10 quarterly reports

In [2]:
# Define similarity function
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()
# Define headings in the document using word size function
def word_ratio_func(word):
    try:
        word_length = len(word["text"])
        word_bottom = float(word['bottom'])
        word_top = float(word['top'])
        return (word_bottom - word_top), word_length, word["text"]
        
    except:
        return 0, 0, 0

def preprocess_text(texts):

    # preprocess the text
    text = "".join(texts.values()).strip("●").strip("*")
    text = text.split("\n")
    text = [x for x in text if x != '' and x.startswith("Source") == False]
    text = [x[0].replace("●", "") + x[1:] if x[0] == "●" else x for x in text]
    text = [x[0].replace("*", "") + x[1:] if x[0] == "*" else x for x in text]
    text = [x[0].replace("○", "") + x[1:] if x[0] == "○" else x for x in text]
    text = [x[0].replace("1", "") + x[1:] if x[1:3] in ["Q1", "Q2", "Q3", "Q4"] else x for x in text]
    text = text[2:]

    return text


In [3]:
# get file paths for all pdfs
pdf_paths = []
for root, dirs, files in os.walk("ShareholderLetters/"):
    for file in files:
        if file.endswith(".pdf"):
             pdf_paths.append(os.path.join(root, file))

In [4]:
# creating a pdf reader object
pdf_paths = ["ShareholderLetters/FINAL-Q1-21-Shareholder-Letter.pdf",
             "ShareholderLetters/FINAL-Q2-21-Shareholder-Letter.pdf",
             "ShareholderLetters/FINAL-Q3-21-Shareholder-Letter.pdf",
             "ShareholderLetters/FINAL-Q4-21-Shareholder-Letter.pdf",
             "ShareholderLetters/FINAL-Q1-22-Shareholder-Letter.pdf",
             "ShareholderLetters/FINAL-Q2-22-Shareholder-Letter.pdf",
             "ShareholderLetters/FINAL-Q3-22-Shareholder-Letter.pdf",
             "ShareholderLetters/FINAL-Q4-22-Shareholder-Letter.pdf",
             "ShareholderLetters/FINAL-Q1-23-Shareholder-Letter.pdf",
            "ShareholderLetters/FINAL-Q2-23-Shareholder-Letter.pdf"]

pdf_texts = {}
pdf_headings = {}

if not os.path.exists('Txt'):
    os.makedirs('Txt')

for file_path in tqdm(pdf_paths):

    try:
        
        reader = pdfplumber.open(file_path)

        texts = {}
        texts_v2 = {}
        headings = []
        headings_count = 0

        for page_number in range(0, len(reader.pages)):

            # get the specific page from the pdf file
            page = reader.pages[page_number]
            # extract text from page
            text = page.extract_text()
            # add text to dictionary
            texts[page_number] = text

            # extract headings from page
            words = page.extract_words()
            word_count = 0
            while word_count < len(words):
                # find if the words are large enough to be headings
                word_size, word_length, word_text = word_ratio_func(words[word_count])
                heading = []

                if word_size > 15 and word_length > 1:
                    while True:
                        heading.append(word_text)
                        word_count += 1
                        if word_count >= len(words):
                            break
                        word_size, word_length, word_text = word_ratio_func(words[word_count])
                        if not word_size > 15 and word_length > 1:
                            headings.append(" ".join(heading))
                            word_count += 10
                            break
                headings_count += 1
                word_count += 1
            
            # break if the page covers the reference section
            if re.search("\nReference\n", text):
                break
        
        # preprocess the text
        text = preprocess_text(texts)
        final_text = " ".join(text)

        # export the text to a txt file
        with open("Txt/" + file_path.split("/")[-1].split(".")[0] + ".txt", "w", encoding='utf-8') as f:
            f.write(final_text)

        # add the text to the dictionary
        pdf_texts[file_path.split("/")[-1].split(".")[0]] = final_text
        # add the headings to the dictionary
        pdf_headings[file_path.split("/")[-1].split(".")[0]] = headings

    except Exception as e:
        print(e)
        continue

  0%|          | 0/10 [00:00<?, ?it/s]

In [5]:
# view your pdf texts with pdf names as keys
list(pdf_texts.items())

[('FINAL-Q1-21-Shareholder-Letter',
  "Revenue grew 24% year over year and was in line withour beginning of quarter forecast, while operating profit and margin reached all-time highs. We finishedQ1’21 with 208m paid memberships, up 14% year over year, but below our guidance forecast of 210mpaid memberships. We believe paid membership growth slowed due to the big Covid-19 pull forwardin 2020 and a lighter content slate in the firsthalf of this year, due to Covid-19 production delays. We continueto anticipate a strong second half with the return of new seasons of some of our biggest hitsand an exciting film lineup. In the short-term, thereis some uncertainty from Covid-19; in the long-term,the rise of streaming to replace linear TV aroundthe world is the clear trend in entertainment. Q1 Results and Q2 Forecast Average revenue per membership1rose 6% year overyear, or 5%, excluding a foreign exchange impact of +$80m. Operating income of $2 billion vs. $958 millionmore than doubled vs. Q1’2

In [3]:
import pickle
# Load pdf text and headings from the pickle file
pdf_texts = pickle.load(open("Src/pdf_texts.pkl", "rb"))
pdf_headings = pickle.load(open("Src/pdf_headings.pkl", "rb"))

### Tokenized the earning reports

In [None]:
# Load English tokenizer, tagger, parser, NER and word vectors (30 seconds to preprocess all the reports)
nlp = spacy.load("en_core_web_sm")

# Tokenize all the reports
tokenized_reports = {}

# Loop through each report in pdf_texts
for report_name, report_text in pdf_texts.items():
    # Use the nlp.pipe method to tokenize the report_text
    tokenized_report = [doc for doc in nlp.pipe([report_text])]
    
    # Add tokenized_report to tokenized_reports
    tokenized_reports[report_name] = tokenized_report

# Print the tokenized reports
for report_name, tokenized_report in tokenized_reports.items():
    print(f"Report name: {report_name}")
    for doc in tokenized_report:
        print([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])  
        print("\n")

### Find the most frequent mentioned financial keywords in each report

In [5]:
def find_related_financial_keywords(tokenized_reports):
    # financial_terms.txt is a list of financial keywords from Tilburg University
    with open('financial_terms.txt', 'r') as f:
        financial_keywords = [line.strip() for line in f]

    # Find all synsets related to financial keywords
    financial_synsets = [wordnet.synsets(keyword) for keyword in financial_keywords]

    # Flatten the list of synsets
    financial_synsets = [synset for sublist in financial_synsets for synset in sublist]

    # Find all lemmas for these synsets
    financial_lemmas = [lemma.name() for synset in financial_synsets for lemma in synset.lemmas()]

    # Initialize a dictionary to store the most frequent financial keywords for each document
    financial_keywords_counts_dict = {}

    # Iterate over each document in tokenized_reports
    for report_name, tokenized_report in tokenized_reports.items():

        # For each tokenized report, get a list of all tokens
        all_tokens = [token.text for doc in tokenized_report for token in doc]

        # Filter tokens to only include those in the financial lemmas list
        financial_tokens = [token for token in all_tokens if token in financial_lemmas]

        # Use Counter to count each financial token's occurrences
        financial_token_counts = Counter(financial_tokens)

        # Only keep the top 10 most frequent financial words
        top_10_financial_words = financial_token_counts.most_common(10)

        # Add the result to the dictionary
        financial_keywords_counts_dict[report_name] = top_10_financial_words

    return financial_keywords_counts_dict

# Call the function
financial_keywords_counts = find_related_financial_keywords(tokenized_reports)

# Print the results
for report_name, financial_keywords_counts in financial_keywords_counts.items():
    print(f"Report name: {report_name}")
    for word, count in financial_keywords_counts:
        print(f"Word: {word}, Count: {count}")
    print("\n")

FileNotFoundError: [Errno 2] No such file or directory: 'financial_terms.txt'

In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")

def find_top10_frequent_words(tokenized_reports):
    # Select the two reports
    reports = ['FINAL-Q1-23-Shareholder-Letter', 'FINAL-Q2-23-Shareholder-Letter']

    # Initialize a dictionary to store the most frequent words for each selected report
    frequent_words_counts_dict = {}

    # Iterate over each selected report
    for report_name in reports:
        if report_name in tokenized_reports:
            tokenized_report = tokenized_reports[report_name]

            # For each tokenized report, get a list of all tokens
            all_tokens = [token.text.lower() for doc in tokenized_report for token in doc]

            # Filter out punctuation, stop words, and other non-alphabetic tokens
            all_tokens = [token for token in all_tokens if token.isalpha() and not nlp.vocab[token].is_stop]

            # Use Counter to count each token's occurrences
            token_counts = Counter(all_tokens)

            # Only keep the top 10 most frequent words
            top_10_words = token_counts.most_common(10)

            # Add the result to the dictionary
            frequent_words_counts_dict[report_name] = top_10_words

    return frequent_words_counts_dict

# Call the function
frequent_words_counts = find_top10_frequent_words(tokenized_reports)

# Print the results
for report_name, frequent_words_counts in frequent_words_counts.items():
    print(f"Report name: {report_name}")
    for word, count in frequent_words_counts:
        print(f"Word: {word}, Count: {count}")
    print("\n")


Report name: FINAL-Q2-23-Shareholder-Letter
Word: forecast, Count: 3
Word: inmay, Count: 2
Word: revenue, Count: 2
Word: y, Count: 2
Word: f, Count: 2
Word: cash, Count: 2
Word: ourrevenuebase, Count: 1
Word: revenueineachregionisnowhigherthanpre, Count: 1
Word: launch, Count: 1
Word: withsign, Count: 1




### split documents into sentence

In [None]:
# Initialize a dictionary to store the joined sentences for each report.
joined_sentences = {}

# Iterate over each report in pdf_texts.
for report_name, report_text in pdf_texts.items():

    # Split the report text into sentences.
    sentences = nlp(report_text).sents

    # Initialize a list to hold the tokenized sentences for this report.
    tokenized_report_sentences = []

    # Iterate over each sentence.
    for sentence in sentences:
        # Tokenize, lemmatize, and remove stop words and punctuation.
        tokenized = [token.lemma_ for token in sentence if not token.is_stop and not token.is_punct]
        # Add the tokenized sentence to the list.
        tokenized_report_sentences.append(tokenized)

    # Join each tokenized sentence into a single string, and store them in a list.
    joined_report_sentences = [' '.join(sentence) for sentence in tokenized_report_sentences]

    # Add the joined sentences for this report to joined_sentences.
    joined_sentences[report_name] = joined_report_sentences

# Print the joined sentences for each report.
for report_name, joined_report_sentences in joined_sentences.items():
    print(f"Report name: {report_name}")
    print(joined_report_sentences)
    print("\n")

### Extract sentence and sentiment analyze the sentences 

In [10]:
def analyze_sentiment_of_sentences_with_keywords(joined_sentences, keywords):
    # Initialize the sentiment analysis pipeline
    sentiment_analysis = pipeline("sentiment-analysis")

    # Initialize a dictionary to store the sentences and their sentiment scores for each report
    sentences_with_keywords_and_sentiment_dict = {}

    # Iterate over each report
    for report_name, joined_sentences in joined_sentences.items():

        # For each report, find sentences containing each keyword and analyze their sentiment
        sentences_with_keywords_and_sentiment = {keyword: [] for keyword in keywords}

        for sentence in joined_sentences:
            for keyword in keywords:
                if keyword in sentence.lower():
                    # Analyze the sentiment of the sentence
                    sentiment_result = sentiment_analysis(sentence)[0]
                    sentences_with_keywords_and_sentiment[keyword].append((sentence, sentiment_result))

        # Add the result to the dictionary
        sentences_with_keywords_and_sentiment_dict[report_name] = sentences_with_keywords_and_sentiment

    return sentences_with_keywords_and_sentiment_dict

# Call the function
keywords = ['revenue', 'forecast', 'profit']
sentences_with_keywords_and_sentiment = analyze_sentiment_of_sentences_with_keywords(joined_sentences, keywords)

# Print the results
for report_name, sentences_with_keywords_and_sentiment in sentences_with_keywords_and_sentiment.items():
    print(f"Report name: {report_name}")
    for keyword, sentences in sentences_with_keywords_and_sentiment.items():
        print(f"Keyword: {keyword}")
        for sentence, sentiment in sentences:
            print(f"Sentence: {sentence}")
            print(f"Sentiment: {sentiment}")
        print("\n")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Token indices sequence length is longer than the specified maximum sequence length for this model (1032 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (1032) must match the size of tensor b (512) at non-singleton dimension 1

### sentiment analysis score for each sentence

In [11]:
import pandas as pd
from transformers import pipeline

def analyze_sentiment_of_sentences_with_keywords(joined_sentences, keywords):
    # Initialize the sentiment analysis pipeline
    sentiment_analysis = pipeline("sentiment-analysis")

    # Initialize a DataFrame to store the total polarity scores for each report
    sentiment_scores_df = pd.DataFrame(columns=['Report', 'Keyword', 'Score'])

    # Iterate over each report
    for report_name, joined_sentences in joined_sentences.items():
        # For each report, find sentences containing each keyword and analyze their sentiment
        for keyword in keywords:
            total_score = 0  # Initialize total score for each keyword
            for sentence in joined_sentences:
                if keyword in sentence.lower():
                    # Analyze the sentiment of the sentence
                    sentiment_result = sentiment_analysis(sentence)[0]
                    # Add to total score
                    if sentiment_result['label'] == 'POSITIVE':
                        total_score += sentiment_result['score']
                    else:
                        total_score -= sentiment_result['score']
            # Append the result to the DataFrame
            sentiment_scores_df = sentiment_scores_df.append({'Report': report_name, 'Keyword': keyword, 'Score': total_score}, ignore_index=True)

    return sentiment_scores_df

# Call the function
keywords = ['revenue', 'forecast', 'profit']
sentiment_scores_df = analyze_sentiment_of_sentences_with_keywords(joined_sentences, keywords)

# Print the results
sentiment_scores_df

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Token indices sequence length is longer than the specified maximum sequence length for this model (1032 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (1032) must match the size of tensor b (512) at non-singleton dimension 1

In [12]:
# Pivot the DataFrame
sentiment_scores_pivot = sentiment_scores_df.pivot(index='Report', columns='Keyword', values='Score')

# Reset the index
sentiment_scores_pivot = sentiment_scores_pivot.reset_index()

# Print the results
sentiment_scores_pivot

Keyword,Report,forecast,profit,revenue
0,FINAL-Q1-21-Shareholder-Letter,-3.789053,0.994155,0.805946
1,FINAL-Q1-22-Shareholder-Letter,-4.052003,0.041319,4.62806
2,FINAL-Q1-23-Shareholder-Letter,-8.886862,-5.831779,-23.671309
3,FINAL-Q2-21-Shareholder-Letter,-5.115314,-0.225202,2.491843
4,FINAL-Q2-22-Shareholder-Letter,-4.61128,1.726416,-3.280668
5,FINAL-Q2-23-Shareholder-Letter,-4.075502,-3.951874,-24.853224
6,FINAL-Q3-21-Shareholder-Letter,-0.053019,0.0,-0.932398
7,FINAL-Q3-22-Shareholder-Letter,-6.496646,-1.676093,-11.70692
8,FINAL-Q4-21-Shareholder-Letter,-3.141478,0.995229,1.02829
9,FINAL-Q4-22-Shareholder-Letter,1.739604,3.429866,-1.125851


In [13]:
# Add a new column 'total_score'
sentiment_scores_pivot['total_score'] = sentiment_scores_pivot['revenue'] + sentiment_scores_pivot['forecast'] + sentiment_scores_pivot['profit']

# Print the results
sentiment_scores_pivot

Keyword,Report,forecast,profit,revenue,total_score
0,FINAL-Q1-21-Shareholder-Letter,-3.789053,0.994155,0.805946,-1.988953
1,FINAL-Q1-22-Shareholder-Letter,-4.052003,0.041319,4.62806,0.617376
2,FINAL-Q1-23-Shareholder-Letter,-8.886862,-5.831779,-23.671309,-38.38995
3,FINAL-Q2-21-Shareholder-Letter,-5.115314,-0.225202,2.491843,-2.848674
4,FINAL-Q2-22-Shareholder-Letter,-4.61128,1.726416,-3.280668,-6.165532
5,FINAL-Q2-23-Shareholder-Letter,-4.075502,-3.951874,-24.853224,-32.8806
6,FINAL-Q3-21-Shareholder-Letter,-0.053019,0.0,-0.932398,-0.985417
7,FINAL-Q3-22-Shareholder-Letter,-6.496646,-1.676093,-11.70692,-19.879659
8,FINAL-Q4-21-Shareholder-Letter,-3.141478,0.995229,1.02829,-1.117959
9,FINAL-Q4-22-Shareholder-Letter,1.739604,3.429866,-1.125851,4.043619


### Stock price processing

In [14]:
stock_price = pd.read_csv("NFLX.csv")
stock_price.loc[:, "open_close_diff"] = stock_price.Close - stock_price.Open
stock_price

Unnamed: 0,Quater,Date,Open,High,Low,Close,Adj Close,Volume,open_close_diff
0,FINAL-Q2-23-Shareholder-Letter,19/07/2023,476.86,485.0,470.0,477.59,477.59,20210900,0.73
1,FINAL-Q1-23-Shareholder-Letter,18/04/2023,335.0,337.190002,330.5,333.700012,333.700012,17944500,-1.299988
2,FINAL-Q4-22-Shareholder-Letter,19/01/2023,322.570007,324.890015,313.390015,315.779999,315.779999,18008200,-6.790008
3,FINAL-Q3-22-Shareholder-Letter,18/10/2022,249.800003,250.369995,237.729996,240.860001,240.860001,25776700,-8.940002
4,FINAL-Q2-22-Shareholder-Letter,19/07/2022,193.020004,201.970001,188.399994,201.630005,201.630005,28178700,8.610001
5,FINAL-Q1-22-Shareholder-Letter,19/04/2022,333.220001,351.679993,333.220001,348.609985,348.609985,20906900,15.389984
6,FINAL-Q4-21-Shareholder-Letter,20/01/2022,517.75,526.640015,506.929993,508.25,508.25,12659000,-9.5
7,FINAL-Q3-21-Shareholder-Letter,19/10/2021,636.969971,641.0,632.299988,639.0,639.0,7633100,2.030029
8,FINAL-Q2-21-Shareholder-Letter,20/07/2021,526.07,536.64,520.3,531.05,531.05,6930400,4.98
9,FINAL-Q1-21-Shareholder-Letter,20/04/2021,554.42,563.56,546.3,549.57,549.57,11257600,-4.85


### match the polarity score with open_close_diff 

In [15]:
# Merge sentiment_scores_pivot and stock_price DataFrames
final_df = pd.merge(sentiment_scores_pivot, stock_price, left_on='Report', right_on='Quater')

# Print the results
final_df

Unnamed: 0,Report,forecast,profit,revenue,total_score,Quater,Date,Open,High,Low,Close,Adj Close,Volume,open_close_diff
0,FINAL-Q1-21-Shareholder-Letter,-3.789053,0.994155,0.805946,-1.988953,FINAL-Q1-21-Shareholder-Letter,20/04/2021,554.42,563.56,546.3,549.57,549.57,11257600,-4.85
1,FINAL-Q1-22-Shareholder-Letter,-4.052003,0.041319,4.62806,0.617376,FINAL-Q1-22-Shareholder-Letter,19/04/2022,333.220001,351.679993,333.220001,348.609985,348.609985,20906900,15.389984
2,FINAL-Q1-23-Shareholder-Letter,-8.886862,-5.831779,-23.671309,-38.38995,FINAL-Q1-23-Shareholder-Letter,18/04/2023,335.0,337.190002,330.5,333.700012,333.700012,17944500,-1.299988
3,FINAL-Q2-21-Shareholder-Letter,-5.115314,-0.225202,2.491843,-2.848674,FINAL-Q2-21-Shareholder-Letter,20/07/2021,526.07,536.64,520.3,531.05,531.05,6930400,4.98
4,FINAL-Q2-22-Shareholder-Letter,-4.61128,1.726416,-3.280668,-6.165532,FINAL-Q2-22-Shareholder-Letter,19/07/2022,193.020004,201.970001,188.399994,201.630005,201.630005,28178700,8.610001
5,FINAL-Q2-23-Shareholder-Letter,-4.075502,-3.951874,-24.853224,-32.8806,FINAL-Q2-23-Shareholder-Letter,19/07/2023,476.86,485.0,470.0,477.59,477.59,20210900,0.73
6,FINAL-Q3-21-Shareholder-Letter,-0.053019,0.0,-0.932398,-0.985417,FINAL-Q3-21-Shareholder-Letter,19/10/2021,636.969971,641.0,632.299988,639.0,639.0,7633100,2.030029
7,FINAL-Q3-22-Shareholder-Letter,-6.496646,-1.676093,-11.70692,-19.879659,FINAL-Q3-22-Shareholder-Letter,18/10/2022,249.800003,250.369995,237.729996,240.860001,240.860001,25776700,-8.940002
8,FINAL-Q4-21-Shareholder-Letter,-3.141478,0.995229,1.02829,-1.117959,FINAL-Q4-21-Shareholder-Letter,20/01/2022,517.75,526.640015,506.929993,508.25,508.25,12659000,-9.5
9,FINAL-Q4-22-Shareholder-Letter,1.739604,3.429866,-1.125851,4.043619,FINAL-Q4-22-Shareholder-Letter,19/01/2023,322.570007,324.890015,313.390015,315.779999,315.779999,18008200,-6.790008


In [16]:
final_df = final_df.drop(columns=['Quater'])
final_df

Unnamed: 0,Report,forecast,profit,revenue,total_score,Date,Open,High,Low,Close,Adj Close,Volume,open_close_diff
0,FINAL-Q1-21-Shareholder-Letter,-3.789053,0.994155,0.805946,-1.988953,20/04/2021,554.42,563.56,546.3,549.57,549.57,11257600,-4.85
1,FINAL-Q1-22-Shareholder-Letter,-4.052003,0.041319,4.62806,0.617376,19/04/2022,333.220001,351.679993,333.220001,348.609985,348.609985,20906900,15.389984
2,FINAL-Q1-23-Shareholder-Letter,-8.886862,-5.831779,-23.671309,-38.38995,18/04/2023,335.0,337.190002,330.5,333.700012,333.700012,17944500,-1.299988
3,FINAL-Q2-21-Shareholder-Letter,-5.115314,-0.225202,2.491843,-2.848674,20/07/2021,526.07,536.64,520.3,531.05,531.05,6930400,4.98
4,FINAL-Q2-22-Shareholder-Letter,-4.61128,1.726416,-3.280668,-6.165532,19/07/2022,193.020004,201.970001,188.399994,201.630005,201.630005,28178700,8.610001
5,FINAL-Q2-23-Shareholder-Letter,-4.075502,-3.951874,-24.853224,-32.8806,19/07/2023,476.86,485.0,470.0,477.59,477.59,20210900,0.73
6,FINAL-Q3-21-Shareholder-Letter,-0.053019,0.0,-0.932398,-0.985417,19/10/2021,636.969971,641.0,632.299988,639.0,639.0,7633100,2.030029
7,FINAL-Q3-22-Shareholder-Letter,-6.496646,-1.676093,-11.70692,-19.879659,18/10/2022,249.800003,250.369995,237.729996,240.860001,240.860001,25776700,-8.940002
8,FINAL-Q4-21-Shareholder-Letter,-3.141478,0.995229,1.02829,-1.117959,20/01/2022,517.75,526.640015,506.929993,508.25,508.25,12659000,-9.5
9,FINAL-Q4-22-Shareholder-Letter,1.739604,3.429866,-1.125851,4.043619,19/01/2023,322.570007,324.890015,313.390015,315.779999,315.779999,18008200,-6.790008


In [17]:
final_df['match'] = np.sign(final_df['total_score']) == np.sign(final_df['open_close_diff'])

# Print the results
final_df

Unnamed: 0,Report,forecast,profit,revenue,total_score,Date,Open,High,Low,Close,Adj Close,Volume,open_close_diff,match
0,FINAL-Q1-21-Shareholder-Letter,-3.789053,0.994155,0.805946,-1.988953,20/04/2021,554.42,563.56,546.3,549.57,549.57,11257600,-4.85,True
1,FINAL-Q1-22-Shareholder-Letter,-4.052003,0.041319,4.62806,0.617376,19/04/2022,333.220001,351.679993,333.220001,348.609985,348.609985,20906900,15.389984,True
2,FINAL-Q1-23-Shareholder-Letter,-8.886862,-5.831779,-23.671309,-38.38995,18/04/2023,335.0,337.190002,330.5,333.700012,333.700012,17944500,-1.299988,True
3,FINAL-Q2-21-Shareholder-Letter,-5.115314,-0.225202,2.491843,-2.848674,20/07/2021,526.07,536.64,520.3,531.05,531.05,6930400,4.98,False
4,FINAL-Q2-22-Shareholder-Letter,-4.61128,1.726416,-3.280668,-6.165532,19/07/2022,193.020004,201.970001,188.399994,201.630005,201.630005,28178700,8.610001,False
5,FINAL-Q2-23-Shareholder-Letter,-4.075502,-3.951874,-24.853224,-32.8806,19/07/2023,476.86,485.0,470.0,477.59,477.59,20210900,0.73,False
6,FINAL-Q3-21-Shareholder-Letter,-0.053019,0.0,-0.932398,-0.985417,19/10/2021,636.969971,641.0,632.299988,639.0,639.0,7633100,2.030029,False
7,FINAL-Q3-22-Shareholder-Letter,-6.496646,-1.676093,-11.70692,-19.879659,18/10/2022,249.800003,250.369995,237.729996,240.860001,240.860001,25776700,-8.940002,True
8,FINAL-Q4-21-Shareholder-Letter,-3.141478,0.995229,1.02829,-1.117959,20/01/2022,517.75,526.640015,506.929993,508.25,508.25,12659000,-9.5,True
9,FINAL-Q4-22-Shareholder-Letter,1.739604,3.429866,-1.125851,4.043619,19/01/2023,322.570007,324.890015,313.390015,315.779999,315.779999,18008200,-6.790008,False
