In [2]:
# retrieve text from PDF
from tqdm.notebook import tqdm
from PyPDF2 import PdfReader
from tqdm.notebook import tqdm
from textblob import TextBlob
import numpy as np
import spacy  
import re
from collections import Counter
from nltk.corpus import wordnet
import nltk
nltk.download('omw-1.4')
import pandas as pd
import numpy as np
from transformers import pipeline
import pdfplumber
import os
from difflib import SequenceMatcher

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Я\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
import pickle
# Load pdf text and headings from the pickle file
pdf_texts = pickle.load(open("Src/pdf_texts.pkl", "rb"))
pdf_headings = pickle.load(open("Src/pdf_headings.pkl", "rb"))

### Tokenized earning reports

In [None]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

# Tokenize all the reports
tokenized_reports = {}

# Loop through each report in pdf_texts
for report_name, report_text in pdf_texts.items():
    # Use the nlp.pipe method to tokenize the report_text
    tokenized_report = [doc for doc in nlp.pipe([report_text])]
    # Add tokenized_report to tokenized_reports
    tokenized_reports[report_name] = tokenized_report

# Print the tokenized reports
# for report_name, tokenized_report in tokenized_reports.items():
#     print(f"Report name: {report_name}")
#     for doc in tokenized_report:
#         print([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])  
#     print("\n")

### Find the most frequent mentioned financial keywords in each report

In [None]:
def find_related_financial_keywords(tokenized_reports):
    # financial_terms.txt is a list of financial keywords from Tilburg University
    with open('Src/financial_terms.txt', 'r') as f:
        financial_keywords = [line.strip() for line in f]

    # Find all synsets related to financial keywords
    financial_synsets = [wordnet.synsets(keyword) for keyword in financial_keywords]

    # Flatten the list of synsets
    financial_synsets = [synset for sublist in financial_synsets for synset in sublist]

    # Find all lemmas for these synsets
    financial_lemmas = [lemma.name() for synset in financial_synsets for lemma in synset.lemmas()]

    # Initialize a dictionary to store the most frequent financial keywords for each document
    financial_keywords_counts_dict = {}

    # Iterate over each document in tokenized_reports
    for report_name, tokenized_report in tokenized_reports.items():

        # For each tokenized report, get a list of all tokens
        all_tokens = [token.text for doc in tokenized_report for token in doc]

        # Filter tokens to only include those in the financial lemmas list
        financial_tokens = [token for token in all_tokens if token in financial_lemmas]

        # Use Counter to count each financial token's occurrences
        financial_token_counts = Counter(financial_tokens)

        # Only keep the top 10 most frequent financial words
        top_10_financial_words = financial_token_counts.most_common(10)

        # Add the result to the dictionary
        financial_keywords_counts_dict[report_name] = top_10_financial_words

    return financial_keywords_counts_dict

# Call the function
financial_keywords_counts = find_related_financial_keywords(tokenized_reports)

# Print the results
# for report_name, financial_keywords_counts in financial_keywords_counts.items():
#     print(f"Report name: {report_name}")
#     for word, count in financial_keywords_counts:
#         print(f"Word: {word}, Count: {count}")
#     print("\n")

In [17]:
import spacy
nlp = spacy.load("en_core_web_sm")

def find_top10_frequent_words(tokenized_reports):
    # Select the two reports
    reports = ['FINAL-Q1-23-Shareholder-Letter', 'FINAL-Q2-23-Shareholder-Letter']

    # Initialize a dictionary to store the most frequent words for each selected report
    frequent_words_counts_dict = {}

    # Iterate over each selected report
    for report_name in reports:
        if report_name in tokenized_reports:
            tokenized_report = tokenized_reports[report_name]

            # For each tokenized report, get a list of all tokens
            all_tokens = [token.text.lower() for doc in tokenized_report for token in doc]

            # Filter out punctuation, stop words, and other non-alphabetic tokens
            all_tokens = [token for token in all_tokens if token.isalpha() and not nlp.vocab[token].is_stop]

            # Use Counter to count each token's occurrences
            token_counts = Counter(all_tokens)

            # Only keep the top 10 most frequent words
            top_10_words = token_counts.most_common(10)

            # Add the result to the dictionary
            frequent_words_counts_dict[report_name] = top_10_words

    return frequent_words_counts_dict

# Call the function
frequent_words_counts = find_top10_frequent_words(tokenized_reports)

# # Print the results
# for report_name, frequent_words_counts in frequent_words_counts.items():
#     print(f"Report name: {report_name}")
#     for word, count in frequent_words_counts:
#         print(f"Word: {word}, Count: {count}")
#     print("\n")

### Split documents into sentences

In [18]:
# Initialize a dictionary to store the joined sentences for each report.
joined_sentences = {}

# Iterate over each report in pdf_texts.
for report_name, report_text in pdf_texts.items():

    # Split the report text into sentences.
    sentences = nlp(report_text).sents

    # Initialize a list to hold the tokenized sentences for this report.
    tokenized_report_sentences = []

    # Iterate over each sentence.
    for sentence in sentences:
        # Tokenize, lemmatize, and remove stop words and punctuation.
        tokenized = [token.lemma_ for token in sentence if not token.is_stop and not token.is_punct]
        # Add the tokenized sentence to the list.
        tokenized_report_sentences.append(tokenized)

    # Join each tokenized sentence into a single string, and store them in a list.
    joined_report_sentences = [' '.join(sentence) for sentence in tokenized_report_sentences]

    # Add the joined sentences for this report to joined_sentences.
    joined_sentences[report_name] = joined_report_sentences

# # Print the joined sentences for each report.
# for report_name, joined_report_sentences in joined_sentences.items():
#     print(f"Report name: {report_name}")
#     print(joined_report_sentences)
#     print("\n")

### Extract sentences with keywords

In [19]:
def analyze_sentences_with_keywords(joined_sentences, keywords):
    # Initialize a dictionary to store the sentences for each report
    sentences_with_keywords_dict = {}

    # Iterate over each report
    for report_name, joined_sentences in joined_sentences.items():
        # For each report, find sentences containing each keyword
        sentences_with_keywords = {keyword: [] for keyword in keywords}

        for sentence in joined_sentences:
            for keyword in keywords:
                if keyword in sentence.lower():
                    # Add the sentence to the corresponding keyword list
                    sentences_with_keywords[keyword].append(sentence)

        # Add the result to the dictionary
        sentences_with_keywords_dict[report_name] = sentences_with_keywords

    return sentences_with_keywords_dict

# Call the function
keywords = ['revenue', 'forecast', 'profit']
sentences_with_keywords = analyze_sentences_with_keywords(joined_sentences, keywords)

# Print the results
# for report_name, keywords_sentences_dict in sentences_with_keywords.items():
#     print(f"Report name: {report_name}")
#     for keyword, sentences in keywords_sentences_dict.items():
#         print(f"Keyword: {keyword}")
#         for sentence in sentences:
#             print(f"Sentence: {sentence}")
#         print("\n")


#### sentiment analysis for each sentence

In [None]:
def analyze_sentiment_of_sentences_with_keywords(joined_sentences, keywords):
    # Initialize the sentiment analysis pipeline
    sentiment_analysis = pipeline("sentiment-analysis")

    # Initialize a dictionary to store the sentences and their sentiment scores for each report
    sentiment_results_dict = {}

    # Iterate over each report
    for report_name, sentences in joined_sentences.items():

        # Initialize a dictionary to store the sentiment analysis results for the current report
        report_sentiment_results = {keyword: [] for keyword in keywords}

        for sentence in sentences:
            if len(sentence) < 512: 
                for keyword in keywords:
                    if keyword in sentence.lower():
                        # Analyze the sentiment of the sentence
                        sentiment_result = sentiment_analysis(sentence)[0]
                        report_sentiment_results[keyword].append(sentiment_result)

        # Add the results to the dictionary
        sentiment_results_dict[report_name] = report_sentiment_results

    return sentiment_results_dict

keywords = ['revenue', 'forecast', 'profit']
sentences_with_keywords_and_sentiment = analyze_sentiment_of_sentences_with_keywords(joined_sentences, keywords)
# sentences_with_keywords_and_sentiment

#### total sentiment scores for each report

In [21]:
# Initialize a dictionary to hold total scores for each keyword in each report
total_scores = {report: {keyword: 0 for keyword in keywords} for report in sentences_with_keywords_and_sentiment.keys()}

# Calculate total scores for each keyword in each report
for report_name, keywords_dict in sentences_with_keywords_and_sentiment.items():
    for keyword, sentiments in keywords_dict.items():
        for sentiment in sentiments:
            # If the sentiment is POSITIVE, add the score
            # If the sentiment is NEGATIVE, subtract the score
            if sentiment['label'] == 'POSITIVE':
                total_scores[report_name][keyword] += sentiment['score']
            else:
                total_scores[report_name][keyword] -= sentiment['score']

# Convert the total_scores to a DataFrame
df = pd.DataFrame(total_scores).T
df.reset_index(inplace=True)
df.columns = ['report_name', 'revenue_score', 'forecast_score', 'profit_score']

In [22]:
df

Unnamed: 0,report_name,revenue_score,forecast_score,profit_score
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,-2.896049,-1.853488,0.204556
1,FINAL-Q1-18-Shareholder-Letter,1.513937,0.41912,0.0
2,FINAL-Q1-19-Shareholder-Letter,-0.938676,-1.8558,0.0
3,FINAL-Q1-20-Shareholder-Letter,-4.090055,-4.011067,-0.510048
4,FINAL-Q1-21-Shareholder-Letter,0.805947,-3.789053,0.994155
5,FINAL-Q1-22-Shareholder-Letter,4.628062,-4.052005,0.041319
6,Final-Q1-23-Shareholder-Letter,-22.68311,-7.898663,-5.831779
7,FINAL-Q2-18-Shareholder-Letter,-1.327651,-5.456451,0.0
8,FINAL-Q2-20-Shareholder-Letter-V3-with-Tables,-1.567408,-0.417644,0.0
9,FINAL-Q2-21-Shareholder-Letter,2.491845,-5.115308,-0.225201


In [23]:
# Add a new column 'total_score'
df['total_score'] = df['revenue_score'] + df['forecast_score'] + df['profit_score']

# Print the results
df

Unnamed: 0,report_name,revenue_score,forecast_score,profit_score,total_score
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,-2.896049,-1.853488,0.204556,-4.544981
1,FINAL-Q1-18-Shareholder-Letter,1.513937,0.41912,0.0,1.933057
2,FINAL-Q1-19-Shareholder-Letter,-0.938676,-1.8558,0.0,-2.794476
3,FINAL-Q1-20-Shareholder-Letter,-4.090055,-4.011067,-0.510048,-8.61117
4,FINAL-Q1-21-Shareholder-Letter,0.805947,-3.789053,0.994155,-1.988952
5,FINAL-Q1-22-Shareholder-Letter,4.628062,-4.052005,0.041319,0.617376
6,Final-Q1-23-Shareholder-Letter,-22.68311,-7.898663,-5.831779,-36.413553
7,FINAL-Q2-18-Shareholder-Letter,-1.327651,-5.456451,0.0,-6.784102
8,FINAL-Q2-20-Shareholder-Letter-V3-with-Tables,-1.567408,-0.417644,0.0,-1.985052
9,FINAL-Q2-21-Shareholder-Letter,2.491845,-5.115308,-0.225201,-2.848664


### Stock price processing

In [26]:
stock_price = pd.read_csv("Src/Selina_NFLX.csv")
stock_price.loc[:, "open_close_diff"] = stock_price.Close - stock_price.Open
stock_price

Unnamed: 0,Quater,Date,Open,High,Low,Close,Adj Close,Volume,open_close_diff
0,FINAL-Q2-23-Shareholder-Letter,19/07/2023,476.86,485.0,470.0,477.59,477.59,20210900,0.73
1,FINAL-Q1-23-Shareholder-Letter,18/04/2023,335.0,337.190002,330.5,333.700012,333.700012,17944500,-1.299988
2,FINAL-Q4-22-Shareholder-Letter,19/01/2023,322.570007,324.890015,313.390015,315.779999,315.779999,18008200,-6.790008
3,FINAL-Q3-22-Shareholder-Letter,18/10/2022,249.800003,250.369995,237.729996,240.860001,240.860001,25776700,-8.940002
4,FINAL-Q2-22-Shareholder-Letter,19/07/2022,193.020004,201.970001,188.399994,201.630005,201.630005,28178700,8.610001
5,FINAL-Q1-22-Shareholder-Letter,19/04/2022,333.220001,351.679993,333.220001,348.609985,348.609985,20906900,15.389984
6,FINAL-Q4-21-Shareholder-Letter,20/01/2022,517.75,526.640015,506.929993,508.25,508.25,12659000,-9.5
7,FINAL-Q3-21-Shareholder-Letter,19/10/2021,636.969971,641.0,632.299988,639.0,639.0,7633100,2.030029
8,FINAL-Q2-21-Shareholder-Letter,20/07/2021,526.07,536.64,520.3,531.05,531.05,6930400,4.98
9,FINAL-Q1-21-Shareholder-Letter,20/04/2021,554.42,563.56,546.3,549.57,549.57,11257600,-4.85


### match the polarity score with open_close_diff 

In [27]:
# Merge sentiment_scores_pivot and stock_price DataFrames
final_df = pd.merge(df, stock_price, left_on='report_name', right_on='Quater')
final_df = final_df.drop(columns=['Quater'])
# Print the results
final_df

Unnamed: 0,report_name,revenue_score,forecast_score,profit_score,total_score,Date,Open,High,Low,Close,Adj Close,Volume,open_close_diff
0,FINAL-Q1-18-Shareholder-Letter,1.513937,0.41912,0.0,1.933057,16/04/2018,315.98999,316.100006,304.0,307.779999,307.779999,20307900,-8.209991
1,FINAL-Q1-19-Shareholder-Letter,-0.938676,-1.8558,0.0,-2.794476,16/04/2019,355.0,364.480011,352.720001,359.459991,359.459991,18740200,4.459991
2,FINAL-Q1-20-Shareholder-Letter,-4.090055,-4.011067,-0.510048,-8.61117,21/04/2020,444.769989,447.0,425.600006,433.829987,433.829987,23177600,-10.940002
3,FINAL-Q1-21-Shareholder-Letter,0.805947,-3.789053,0.994155,-1.988952,20/04/2021,554.42,563.56,546.3,549.57,549.57,11257600,-4.85
4,FINAL-Q1-22-Shareholder-Letter,4.628062,-4.052005,0.041319,0.617376,19/04/2022,333.220001,351.679993,333.220001,348.609985,348.609985,20906900,15.389984
5,FINAL-Q2-18-Shareholder-Letter,-1.327651,-5.456451,0.0,-6.784102,16/07/2018,398.980011,403.359985,391.75,400.480011,400.480011,22960000,1.5
6,FINAL-Q2-21-Shareholder-Letter,2.491845,-5.115308,-0.225201,-2.848664,20/07/2021,526.07,536.64,520.3,531.05,531.05,6930400,4.98
7,FINAL-Q2-22-Shareholder-Letter,-3.280668,-4.611282,1.726416,-6.165534,19/07/2022,193.020004,201.970001,188.399994,201.630005,201.630005,28178700,8.610001
8,FINAL-Q2-23-Shareholder-Letter,-24.853224,-4.075503,-3.951874,-32.880601,19/07/2023,476.86,485.0,470.0,477.59,477.59,20210900,0.73
9,FINAL-Q3-18-Shareholder-Letter,1.234896,0.008132,0.2109,1.453928,16/10/2018,337.23999,347.950012,330.559998,346.399994,346.399994,20156400,9.160004


In [30]:
final_df['match'] = np.sign(final_df['total_score']) == np.sign(final_df['open_close_diff'])

# Print the results
final_df.head()

Unnamed: 0,report_name,revenue_score,forecast_score,profit_score,total_score,Date,Open,High,Low,Close,Adj Close,Volume,open_close_diff,match
0,FINAL-Q1-18-Shareholder-Letter,1.513937,0.41912,0.0,1.933057,16/04/2018,315.98999,316.100006,304.0,307.779999,307.779999,20307900,-8.209991,False
1,FINAL-Q1-19-Shareholder-Letter,-0.938676,-1.8558,0.0,-2.794476,16/04/2019,355.0,364.480011,352.720001,359.459991,359.459991,18740200,4.459991,False
2,FINAL-Q1-20-Shareholder-Letter,-4.090055,-4.011067,-0.510048,-8.61117,21/04/2020,444.769989,447.0,425.600006,433.829987,433.829987,23177600,-10.940002,True
3,FINAL-Q1-21-Shareholder-Letter,0.805947,-3.789053,0.994155,-1.988952,20/04/2021,554.42,563.56,546.3,549.57,549.57,11257600,-4.85,True
4,FINAL-Q1-22-Shareholder-Letter,4.628062,-4.052005,0.041319,0.617376,19/04/2022,333.220001,351.679993,333.220001,348.609985,348.609985,20906900,15.389984,True


In [31]:
counts = final_df['match'].value_counts()
counts

match
False    9
True     9
Name: count, dtype: int64

In [32]:
count_true = final_df['match'].sum()
accuracy = count_true/30
accuracy

0.3