In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import pickle

In [4]:
# Location of the Loughran and McDonald (LM) master dictionary, relative to this notebook
lm_lexicon_path = '../Src/Loughran-McDonald_MasterDictionary_1993-2021.csv'

# Read the full lexicon CSV into a DataFrame
lm_lexicon = pd.read_csv(filepath_or_buffer=lm_lexicon_path)

# Preview the top five rows as the cell output
lm_lexicon.head(n=5)

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Syllables,Source
0,AARDVARK,1,354,1.55008e-08,1.4226e-08,3.815486e-06,99,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.313627e-10,8.653817e-12,9.241714e-09,1,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,9,3.940882e-10,1.169679e-10,5.290465e-08,7,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,29,1.26984e-09,6.654735e-10,1.5951e-07,28,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,8570,3.752595e-07,3.809464e-07,3.529356e-05,1108,0,0,0,0,0,0,0,3,12of12inf


In [5]:
# Collect the lexicon words flagged in each sentiment category (flag value > 0),
# lower-cased so they can be compared against lower-cased document tokens
positive_words_lm = lm_lexicon.loc[lm_lexicon['Positive'].gt(0), 'Word'].str.lower().tolist()
negative_words_lm = lm_lexicon.loc[lm_lexicon['Negative'].gt(0), 'Word'].str.lower().tolist()

# Cell output: (number of positive words, number of negative words)
len(positive_words_lm), len(negative_words_lm)

(347, 2345)

In [8]:
# Load the texts extracted from the PDFs from the pickle file.
# The context manager guarantees the file handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that were produced by this project, never files from an untrusted source.
with open("../Src/pdf_texts.pkl", "rb") as pdf_texts_file:
    pdf_texts = pickle.load(pdf_texts_file)

In [None]:
# Empty DataFrame placeholder; it is not referenced again in the cells shown here
lm_analysis = pd.DataFrame()

# Results container: filled below with one LM sentiment score per PDF name
sentiment_scores = dict()
# Function to calculate sentiment score using the LM Lexicon
def calculate_lm_sentiment_score(text, positive_words=None, negative_words=None):
    """Return the net LM sentiment score (positive hits minus negative hits) of a text.

    The text is lower-cased and split into alphabetic tokens, so punctuation
    attached to a word (e.g. "loss," or "(gain)") does not prevent a match.
    Every occurrence of a token counts, not just the first one.

    Parameters
    ----------
    text : str
        The document text to score. Empty or None text scores 0.
    positive_words : iterable of str, optional
        Lower-cased positive words. Defaults to the module-level
        ``positive_words_lm`` list.
    negative_words : iterable of str, optional
        Lower-cased negative words. Defaults to the module-level
        ``negative_words_lm`` list.

    Returns
    -------
    int
        Number of positive-word occurrences minus number of negative-word
        occurrences.
    """
    if positive_words is None:
        positive_words = positive_words_lm
    if negative_words is None:
        negative_words = negative_words_lm

    # Nothing to score
    if not text:
        return 0

    # Sets give O(1) membership tests; scanning the lists for every token is
    # O(tokens x lexicon size)
    positive_set = set(positive_words)
    negative_set = set(negative_words)

    # Extract runs of letters so trailing/leading punctuation is discarded
    tokens = re.findall(r"[a-z]+", text.lower())

    positive_count = 0
    negative_count = 0
    for token in tokens:
        if token in positive_set:
            positive_count += 1
        if token in negative_set:
            negative_count += 1

    return positive_count - negative_count

# Score every extracted document and store the result under its PDF name
for pdf_file, text in pdf_texts.items():
    sentiment_scores[pdf_file] = calculate_lm_sentiment_score(text)

In [18]:
# Build a two-column table (pdf_name, polarity) from the {pdf name: score} dictionary
frequency_sentiment_df = (
    pd.DataFrame.from_dict(sentiment_scores, orient='index', columns=['LM_Sentiment_Score'])
    .reset_index()
    .rename(columns={'index': 'pdf_name', 'LM_Sentiment_Score': 'polarity'})
)

In [21]:
# Write the polarity table to disk. The output folder is created first so the
# export does not fail when '../Scores' does not exist yet.
os.makedirs('../Scores', exist_ok=True)
frequency_sentiment_df.to_csv('../Scores/baseline_frequency_polarity.csv', index=False)