# Proxy Voting Guidelines Evaluation

In [5]:
pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/5.6 MB[0m [31m3.0 MB/s[0m eta [36m0:00:02[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/5.6 MB[0m [31m10.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m3.7/5.6 MB[0m [31m35.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.6/5.6 MB[0m [31m47.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20240706


In [7]:
pip install openai



In [6]:
pip install sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.6.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.7.0-py3-none-any.whl.metadata (25 kB)
Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl (14 kB)
Downloading pyrate_limiter-3.7.0-py3-none-any.whl (28 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.7.0 sec-edgar-downloader-5.0.3


In [8]:
# Import necessary libraries
import numpy as np
import pandas as pd
import pdfminer
import random
import spacy
import nltk
import string
import requests
import re

from pdfminer.high_level import extract_text
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTChar, LTTextLineHorizontal
from transformers import pipeline
from spacy.matcher import Matcher
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
from google.colab import drive
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from bs4 import BeautifulSoup
from sec_edgar_downloader import Downloader

In [9]:
# Voting guideline PDFs will be kept in Google Drive
# Mount the drive that contains these PDFs
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Set up API connection to OpenAI
# GPT models will be used for large analyis tasks
  # This includes extracting voting guideliens from blocks of text, formatting these guidelines, and comparing them to Microsoft's proxy statement
client = OpenAI(api_key="placeholder, replace with actual key")

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Write a haiku about recursion in programming."
        }
    ]
)

test = completion.choices[0].message

In [None]:
# Test that connection is working
test.content

'Code calls back to self,  \nLayers entwined, depth unfolds,  \nEndless loops of thought.'

## Step 1: Extract information from PDFs
Currently, the data is in a relatively unstructured format. The PDFs are long, and are written with HTML code. To find the proxy statements, we must extract the bodies of text and the tables within the PDFs. However, they are interspersed between headers, footers, images, and other formatting elements. The goal of this step is to remove all the unneccessary information in the PDFs, leaving only the bodies of text. The next step will be to analyze these texts, extracting any potential proxy statements from them.

In [None]:
# All bodies of text in the PDFs are located under headers or subheaders. Using the pdfminer library, find these headers and subheaders
# The text that is extracted will be mapped back to its header and/or subheader, providing its context

# Create a function that allows for extraction by font size
# Allows for parsing based on section headers
def extract_text_by_font_size(pdf_path, target_font_size):
    for page_layout in extract_pages(pdf_path): # Each page in the PDF
        for element in page_layout: # Each element on the page
            if isinstance(element, LTTextBoxHorizontal): # Check if an element is a text element
                for text_line in element:
                    for character in text_line:
                        if isinstance(character, LTChar) and character.size == target_font_size: # If the character's font size matches the target font size, print it
                            print(character.get_text(), end='')

In [None]:
# Get all unique font sizes within a PDF
unique_font_sizes_blackrock = []

for page_layout in extract_pages("/content/blackrock_guidelines.pdf"):
    for element in page_layout:
        if isinstance(element, LTTextBoxHorizontal): # Check if an element is a text element
            for text_line in element:
                for character in text_line:
                    if isinstance(character, LTChar) and character.size not in unique_font_sizes_blackrock: # If font size not already in list, append it
                        unique_font_sizes_blackrock.append(character.size)


unique_font_sizes_vanguard = []

for page_layout in extract_pages("/content/vanguard_guidelines.pdf"):
    for element in page_layout:
        if isinstance(element, LTTextBoxHorizontal): # Check if an element is a text element
            for text_line in element:
                for character in text_line:
                    if isinstance(character, LTChar) and character.size not in unique_font_sizes_vanguard: # If font size not already in list, append it
                        unique_font_sizes_vanguard.append(character.size)

In [None]:
# unique_font_sizes_vanguard
unique_font_sizes_blackrock

[59.999999999999886,
 60.00000000000006,
 60.0,
 21.95999999999998,
 14.04000000000002,
 8.0,
 36.0,
 11.039999999999964,
 12.0,
 9.959999999999923,
 11.04000000000002,
 14.039999999999964,
 8.04,
 9.96,
 18.0,
 9.960000000000036,
 9.95999999999998,
 6.9599999999999795,
 6.960000000000008,
 9.960000000000008,
 6.959999999999994,
 6.960000000000036,
 6.959999999999923,
 8.04000000000002,
 5.0400000000000205,
 8.039999999999992,
 5.039999999999992,
 8.040000000000006,
 6.0,
 5.040000000000006,
 8.039999999999964,
 14.039999999999992,
 9.959999999999994,
 9.0,
 8.519999999999982,
 17.999999999999943,
 15.95999999999998]

In [None]:
# Test returned font sizes and find which font size is for headers and subheaders in PDFs
extract_text_by_font_size("/content/blackrock_guidelines.pdf", 14.039999999999964)

 Oversight role of the board Risk oversight Classified board of directors/staggered terms Director qualifications and skills Board term limits and director tenure Cumulative voting Blank check preferred stock Reimbursement of expense for successful shareholder campaigns Equity compensation plans Option exchanges Climate risk Natural capital Human capital management IPO governance Adjourn meeting to solicit additional votes Bundled proposals  Other business Right to act by written consent Consent solicitation Virtual meetings 

----- Blackrock font sizes -----

Headers are under sizes 18.0 and 17.999999999999943

Subheaders are under sizes 14.039999999999964, 14.039999999999992, and 14.04000000000002


----- Vanguard font sizes -----

Headers are under size 16.0

Subheaders are under sizes 11.0 and 11.000000000000014

In [None]:
# Create helper function to extract lines of text of specific font sizes
def extract_lines_by_font_size(pdf_path, target_font_sizes):

  matching_lines = [] # Lines of text in the PDF that match the target font size

  for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):  # Check if an element is a text element
                for line in element:
                    if isinstance(line, LTTextLineHorizontal):
                        # Check the font size of each character in the line
                        font_sizes = [
                            char.size for char in line if isinstance(char, LTChar)
                        ]
                        # If all characters in the line share the target font size, add the line
                        if font_sizes and all(size in target_font_sizes for size in font_sizes):
                            matching_lines.append(line.get_text().strip())

  return matching_lines

In [None]:
# Get headers from each PDF based on manual review of target font sizes
blackrock_headers = extract_lines_by_font_size("/content/blackrock_guidelines.pdf", [18.0, 17.999999999999943])
vanguard_headers = extract_lines_by_font_size("/content/vanguard_guidelines.pdf", [16.0])

In [None]:
# Get subheaders from each PDF based on manual review of target font sizes
blackrock_subheaders = extract_lines_by_font_size("/content/blackrock_guidelines.pdf", [14.039999999999964, 14.039999999999992, 14.04000000000002])
vanguard_subheaders = extract_lines_by_font_size("/content/vanguard_guidelines.pdf", [11.0, 11.000000000000014])

In [10]:
 # Get all headers and subheaders from each PDF based on manual review of target font sizes
 # Will be used to determine which lines of text are headers/subheaders, and which are body text
 blackrock_all_headers = extract_lines_by_font_size("/content/blackrock_guidelines.pdf", [18.0, 17.999999999999943, 14.039999999999964, 14.039999999999992, 14.04000000000002])
 vanguard_all_headers = extract_lines_by_font_size("/content/vanguard_guidelines.pdf", [16.0, 11.0, 11.000000000000014])

NameError: name 'extract_lines_by_font_size' is not defined

In [None]:
blackrock_headers

['Introduction',
 'Voting guidelines',
 'Boards and directors',
 'Board Structure',
 'Board composition and effectiveness',
 'Board responsiveness and shareholder rights',
 'Auditors and audit-related issues',
 'Capital structure proposals',
 'Mergers, acquisitions, transactions, and other special',
 'situations',
 'Executive compensation',
 'Material sustainability-related risks and opportunities',
 'General corporate governance matters',
 'Shareholder protections']

In [None]:
blackrock_all_headers

['Effective as of January 2024',
 'Introduction',
 'Voting guidelines',
 'Boards and directors',
 'Independence',
 'Oversight role of the board',
 'Sufficient capacity',
 'Risk oversight',
 'Board Structure',
 'Classified board of directors/staggered terms',
 'Independent leadership',
 'CEO and management succession planning',
 'Director compensation and equity programs',
 'Board composition and effectiveness',
 'Director qualifications and skills',
 'Board term limits and director tenure',
 'Board diversity',
 'Board size',
 'Board responsiveness and shareholder rights',
 'Shareholder rights',
 'Responsiveness to shareholders',
 'Majority vote requirements',
 'Cumulative voting',
 'Auditors and audit-related issues',
 'Capital structure proposals',
 'Equal voting rights',
 'Blank check preferred stock',
 'Increase in authorized common shares',
 'Increase or issuance of preferred stock',
 'Stock splits',
 'Mergers, acquisitions, transactions, and other special',
 'situations',
 'Merger

### BlackRock voting guidelines
The text has now been extracted from the PDFs, and is formatted as paragraphs. Now, the text within these paragraphs must be evaluated, and any voting guidelines within them must be returned. First, these paragraphs must be broken down into individual sentences or tokens. Then, each token must be evaluated to see if it is a voting guideline. Finally, if it is a guideline, it must be formatted in a way to ensure normality between all guidelines.

In [None]:
# Iterate through all headers and subheaders, add to dataframe
blackrock_contents = []
current_header = np.nan # Mark the current header we are extracting text from

# Iterate through all Blackrock headers and subheaders
for i in range(len(blackrock_all_headers)-1):

  first_header_reached = False # Has the first header been reached? - used to ignore text before first header
  next_header_reached = False # Has the next header been reached? - used to stop iterating when the end of a section is reached
  sub_header_reached = False # Has the next subheader been reached? - used to stop iterating when the end of a section is reached
  last_header_reached = False # Has the last header been reached? - used to extract body text after the last header

  current_sub_header = None # Mark the current subheader, if any, we are extracting text from

  text_contents = "" # The body text we are extracting

  for page_layout in extract_pages("/content/blackrock_guidelines.pdf"):
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                for text_line in element:
                    if isinstance(text_line, LTTextLineHorizontal): # If an element is a single text line, compare it to list of (sub)headers
                        # If we have reached last header, set boolean to True
                        if text_line.get_text().strip() == blackrock_all_headers[len(blackrock_all_headers)-1]:
                          last_header_reached = True

                        # If we have reached the (sub)header after the current one, stop adding contents to text
                        if text_line.get_text().strip() == blackrock_all_headers[i+1]:
                            next_header_reached = True

                        # If we are in a body text section, append contents into a string
                        # This code iterates line by line, appending to a string makes it a single paragraph
                        if (first_header_reached == True) and (next_header_reached == False) and (last_header_reached == False):
                          text_contents += (' ' + text_line.get_text().strip())

                        # If the last header has been reached, get all the text after it
                        if (last_header_reached == True):
                          text_contents += (' ' + text_line.get_text().strip())

                        # If first header is reached, start appending to contents text
                        if text_line.get_text().strip() == blackrock_all_headers[i]:
                          first_header_reached = True

  # If (sub)header is a header, set current_header to it
  # current_sub_header is left blank, as there may be text directly under a header, with no sub-header
  if (blackrock_all_headers[i] in blackrock_headers):
    current_header = blackrock_all_headers[i]
    current_sub_header = np.nan

  # If (sub)header is a subheader, set current_sub_header to it
  if (blackrock_all_headers[i] in blackrock_subheaders):
    current_sub_header = blackrock_all_headers[i]

  # Append header, subheader (if any), and subsequent body text to the table
  blackrock_contents.append({
        "header": current_header,
        "subheader": current_sub_header,
        "content": text_contents
      })


In [11]:
# Create dataframe from table
blackrock_df = pd.DataFrame(blackrock_contents)

NameError: name 'blackrock_contents' is not defined

In [None]:
blackrock_df.shape

(67, 3)

Once indivual sentences/sentence groups of body text are extracted, they will be evaluated to check if they contain voting guidelines. If adjacent sentences provide very similar context, we want to group them together to be evaluated as a single proxy statement. To do this, we will use cosine similarity.

In [None]:
# Helper function to calculate cosine similarity of two vectors
def calculate_similarity(vec1, vec2):
    return cosine_similarity([vec1], [vec2])[0][0]

In [None]:
# For an input block of text, return each sentence or group of contextually similar sentences as an element in a list
def group_similar_sentences(text, similarity_threshold=0.8):

    doc = nlp(text) # Convert block of text to an NLP object
    sentences = list(doc.sents) # Break NLP object into individual sentences
    groups = [] # Return object containing sentences grouped together by context
    current_group = [] # Current group of sentences with similar context

    for i, sent in enumerate(sentences):
        if not current_group: # Append first sentence to the empty group
            current_group.append(sent)
        else:
            # Calculate similarity of the next sentence with the current group
            last_sentence_vector = nlp(" ".join([s.text for s in current_group])).vector # Create vector of sentences in current group
            current_sentence_vector = sent.vector # Create vector of sentence to be evaluated
            similarity = calculate_similarity(last_sentence_vector, current_sentence_vector)

            # Compare cosine similarity of current group and sentence to be evaluated
            # If the evaluated sentence context is similar enough to the current group, append it to the group
            if similarity >= similarity_threshold:
                current_group.append(sent)
            # If the evaluated sentence does not have similar enough context, add it as its own group
            else:
                groups.append(" ".join([s.text for s in current_group]))
                current_group = [sent] # Since the context of the text has shifted, use the evaluated sentence as the benchmark for context in the next group

    # Add current group to return array
    if current_group:
        groups.append(" ".join([s.text for s in current_group]))

    # Return all the groups of sentences, grouped together by context
    return groups

In [None]:
# Test the above function
group_similar_sentences(blackrock_df['content'][14], 0.8)

[' ',
 'We encourage boards to periodically review director qualifications and skills to ensure relevant experience and diverse perspectives are represented in the boardroom.',
 'To this end, performance reviews and skills assessments should be conducted by the nominating/governance committee or the Lead Independent Director.',
 'This process may include internal board evaluations; however, boards may also find it useful to periodically conduct an assessment with a third party.',
 'We encourage boards to disclose their approach to evaluations, including objectives of the evaluation; if an external party conducts the evaluation; the frequency of the evaluations; and, whether that evaluation occurs on an individual director basis.',
 'Virtual meetings Shareholders should have the opportunity to participate in the annual and special meetings for the companies in which they are invested, as these meetings are an opportunity for shareholders to provide feedback and hear from the board and m

Now, the body text of the PDFs has been tokenized into groups of sentences with similar context. These groups contain all the body text of the PDFs, and may not be relevant in terms of voting guidelines. Now, we must extract the actual guidelines, if any, from this text.

In [None]:
# First, find each sentence group that matches common expressions that may represent a voting guideline
# These will be fed into OpenAI, which will further determine if something is a voting guideline
# This first step is done so that the entire document is not given to OpenAI, which might cause inefficencies and inaccuracies
# Common expressions were found doing exploratory analysis and manual review of the PDFs

# Match sentences to common expressions
nlp = spacy.load("en_core_web_sm")

# Create Matcher object
matcher = Matcher(nlp.vocab)

# Find sentences where the words "must", "shall", "should", or "may" preceed a verb
policy_pattern_1 = [
    {"LOWER": {"IN": ["must", "shall", "should", "may"]}},
    {"POS": "VERB"}
]

# Find sentences that contain the phrase "vote against"
policy_pattern_2 = [
    {"LOWER": "vote"},
    {"LOWER": "against"}
]

# Test out sentence grouping

#doc = nlp(blackrock_df['content'][10])
#
#group = [] # Sentence groups with sentences that match the regular expressions
#
## Add the rules to the Matcher so that it can find matches
#matcher.add("policy_pattern_1", [policy_pattern_1])
#matcher.add("policy_pattern_2", [policy_pattern_2])
#
## Go through all Blackrock sentence groups from a single block of text
#for sentence_group in group_similar_sentences(blackrock_df['content'][10], 0.8):
#
#  sentence_in_group_relevant = False # Does any sentence in the group match the regular expressions?
#
#  doc = nlp(sentence_group)
#  sentences = list(doc.sents) # Get individual sentences from each group
#  for sentence in sentences:
#    matches = matcher(sentence) # Compare each sentence to the rules of the matcher
#    if matches: # If a sentence within the group matches the regular expressions, mark the whole group as relevant
#      sentence_in_group_relevant = True
#
#  # If a sentence within a group matches the regular expressions, add the entire group
#    if sentence_in_group_relevant:
#      group.append(sentence_group)



In [None]:
# Get all sentence groups that contain a sentence that matches the regular expressions
relevant_sentences = []

# Iterate through each block of text in the BlackRock PDF
for i in range(blackrock_df.shape[0]):

  sentence_groups = [] # Grouped sentences based on context

  # Group similar sentences in the block of text
  for sentence_group in group_similar_sentences(blackrock_df['content'][i], 0.8):

    sentence_in_group_relevant = False

    doc = nlp(sentence_group)
    sentences = list(doc.sents) # Get individual sentences from each group
    for sentence in sentences:
      matches = matcher(sentence) # Compare each sentence to the rules of the matcher
      if matches: # If a sentence within the group matches the regular expressions, mark the whole group as relevant
        sentence_in_group_relevant = True

    # If a sentence within a group matches the regular expressions, add the entire group
    if sentence_in_group_relevant:
      sentence_groups.append(sentence_group)

  # Add the relevant groups from this block of text to the global list
  relevant_sentences.append(sentence_groups)


In [None]:
# Add relevant sentences to the dataframe, mapped to the block of text they originated from
blackrock_df['relevant_sentences'] = relevant_sentences

In [None]:
# Create new dataframe, one row for each relevant sentence
blackrock_df_exploded = blackrock_df.explode('relevant_sentences').reset_index(drop=True)

In [None]:
# Complete prompt engineering for OpenAI
# OpenAI will evaluate each sentence group and determine if it is actually a voting guideline
# If it is a voting guideline, format it to be of the format "May vote FOR/AGAINST..." if the guideline is not a solid rule,
#   and of the format "Votes FOR/AGAINST..." if the guideline is a solid rule
sentence_of_interest = "We may oppose boards that appear to have an insufficient mix of short-, medium-, and long-tenured directors"

completion = client.chat.completions.create(
    model="gpt-4o-mini", # OpenAI model to be used
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": f"I will provide a piece of text that may or may not contain proxy voting guidelines, giving customers the ability to understand expectations and guidelines on certain policies. If it is a guideline, return it in the form 'may vote FOR/AGAINST...' if it is not a solid rule, and in the form 'Votes FOR/AGAINST...' if it is a solid rule. Some examples of output guidelines are 'May vote AGAINST a non-executive director serving on more than 4 public boards', 'Votes FOR shareholder proposals to declassify the board, unless the board provides a strategic rationale for classification', 'May vote AGAINST committee members responsible for CEO/management succession if there are significant concerns regarding the planning efforts'. If there is no rule, return 'Not a proxy guideline'. Do not respond with any additional context, only the text that I asked you to provide. Here is the text: '{sentence_of_interest}'"
        }
    ]
)

# Hold contents of the return message
test = completion.choices[0].message

In [None]:
# Get text returned by OpenAI
test.content

'May vote AGAINST boards that appear to have an insufficient mix of short-, medium-, and long-tenured directors.'

In [None]:
# Create a list of all responses from ChatGPT
chatgpt_responses = []

# Run prompt on all relevant sentence groups found in the BlackRock PDF
for relevant_sentence in blackrock_df_exploded['relevant_sentences'].tolist():

  completion = client.chat.completions.create(
    model="gpt-4o-mini", # OpenAI model to be used
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": f"I will provide a piece of text that may or may not contain proxy voting guidelines, giving customers the ability to understand expectations and guidelines on certain policies. If it is a guideline, return it in the form 'may vote FOR/AGAINST...' if it is not a solid rule, and in the form 'Votes FOR/AGAINST...' if it is a solid rule. Some examples of output guidelines are 'May vote AGAINST a non-executive director serving on more than 4 public boards', 'Votes FOR shareholder proposals to declassify the board, unless the board provides a strategic rationale for classification', 'May vote AGAINST committee members responsible for CEO/management succession if there are significant concerns regarding the planning efforts'. If there is no rule, return 'Not a proxy guideline'. Do not respond with any additional context, only the text that I asked you to provide. Here is the text: '{relevant_sentence}'"
        }
    ]
  )

  result = completion.choices[0].message

  # Add text returned by OpenAI to the list
  chatgpt_responses.append(result.content)

In [None]:
# Add OpenAI response to the dataframe
blackrock_df_exploded['chatgpt_response'] = chatgpt_responses

In [None]:
# Don't want to lose progress, save dataframe so far to a pickle
blackrock_df_exploded.to_pickle('/content/drive/My Drive/blackrock_df_exploded.pkl')

NameError: name 'blackrock_df_exploded' is not defined

In [None]:
blackrock_df_exploded.head()

Unnamed: 0,header,subheader,content,relevant_sentences,chatgpt_response
0,,Effective as of January 2024,NM0424U-3545100-1/26 Contents Introduction .....,Virtual meetings Shareholders should have the ...,Not a proxy guideline.
1,,Effective as of January 2024,NM0424U-3545100-1/26 Contents Introduction .....,Shareholders should have a meaningful opportun...,Not a proxy guideline.
2,Introduction,,BlackRock’s clients depend on us to help them...,3. Contributing to emerging thinking on stewa...,Not a proxy guideline.
3,Introduction,,BlackRock’s clients depend on us to help them...,Virtual meetings Shareholders should have the ...,Not a proxy guideline.
4,Introduction,,BlackRock’s clients depend on us to help them...,Shareholders should have a meaningful opportun...,Not a proxy guideline.


In [None]:
# Load pickle back if needed

blackrock_df_exploded = pd.read_pickle('/content/drive/My Drive/blackrock_df_exploded.pkl')

### Vanguard voting guidelines
The code below is the same as the above for BlackRock, but now it is completed for the Vanguard PDF

In [None]:
# Do the same for vanguard
# iterate through all headers and subheaders, add to dataframe
vanguard_contents = []
current_header = np.nan

for i in range(len(vanguard_all_headers)-1):
#for i in range(3):

  first_header_reached = False
  next_header_reached = False
  sub_header_reached = False
  last_header_reached = False

  current_sub_header = None

  text_contents = ""

  for page_layout in extract_pages("/content/vanguard_guidelines.pdf"):
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                for text_line in element:
                    if isinstance(text_line, LTTextLineHorizontal):
                        # If we have reached last header, handle differently
                        if text_line.get_text().strip() == vanguard_all_headers[len(vanguard_all_headers)-1]:
                          last_header_reached = True

                        # If we have reached the next header, stop adding contents to text
                        if text_line.get_text().strip() == vanguard_all_headers[i+1]:
                            next_header_reached = True

                        # If we are in header section, append contents to text
                        if (first_header_reached == True) and (next_header_reached == False) and (last_header_reached == False):
                          text_contents += (' ' + text_line.get_text().strip())

                        if (last_header_reached == True):
                          text_contents += (' ' + text_line.get_text().strip())

                        # If first header is reached, start appending to contents text
                        if text_line.get_text().strip() == vanguard_all_headers[i]:
                          first_header_reached = True

  if (vanguard_all_headers[i] in vanguard_headers):
    current_header = vanguard_all_headers[i]
    current_sub_header = np.nan

  if (vanguard_all_headers[i] in vanguard_subheaders):
    current_sub_header = vanguard_all_headers[i]

  vanguard_contents.append({
        "header": current_header,
        "subheader": current_sub_header,
        "content": text_contents
      })

In [None]:
vanguard_df = pd.DataFrame(vanguard_contents)

In [None]:
relevant_sentences = []

for i in range(vanguard_df.shape[0]):

  sentence_groups = []

  for sentence_group in group_similar_sentences(vanguard_df['content'][i], 0.8):

    sentence_in_group_relevant = False

    doc = nlp(sentence_group)
    sentences = list(doc.sents)
    for sentence in sentences:
      matches = matcher(sentence)
      if matches:
        sentence_in_group_relevant = True

    if sentence_in_group_relevant:
      sentence_groups.append(sentence_group)

  relevant_sentences.append(sentence_groups)

In [None]:
vanguard_df['relevant_sentences'] = relevant_sentences

In [None]:
# Create new dataframe, one row for each relevant sentence
vanguard_df_exploded = vanguard_df.explode('relevant_sentences').reset_index(drop=True)

NameError: name 'vanguard_df' is not defined

In [None]:
# Create a list of all responses from ChatGPT
chatgpt_responses = []

for relevant_sentence in vanguard_df_exploded['relevant_sentences'].tolist():

  completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": f"I will provide a piece of text that may or may not contain proxy voting guidelines, giving customers the ability to understand expectations and guidelines on certain policies. If it is a guideline, return it in the form 'may vote FOR/AGAINST...' if it is not a solid rule, and in the form 'Votes FOR/AGAINST...' if it is a solid rule. Some examples of output guidelines are 'May vote AGAINST a non-executive director serving on more than 4 public boards', 'Votes FOR shareholder proposals to declassify the board, unless the board provides a strategic rationale for classification', 'May vote AGAINST committee members responsible for CEO/management succession if there are significant concerns regarding the planning efforts'. If there is no rule, return 'Not a proxy guideline'. Do not respond with any additional context, only the text that I asked you to provide. Here is the text: '{relevant_sentence}'"
        }
    ]
  )

  result = completion.choices[0].message

  chatgpt_responses.append(result.content)

In [None]:
vanguard_df_exploded['chatgpt_response'] = chatgpt_responses

In [None]:
# Don't want to lose progress, save dataframe so far to a pickle
vanguard_df_exploded.to_pickle('/content/drive/My Drive/vanguard_df_exploded.pkl')

NameError: name 'vanguard_df_exploded' is not defined

In [None]:
# Load pickle back if needed

vanguard_df_exploded = pd.read_pickle('/content/drive/My Drive/vanguard_df_exploded.pkl')

### All voting guidelines

In [None]:
# Get list of all unique proxy guidelines
all_proxy_guidelines = []

# Filter out results that are not a voting guideline
for response in vanguard_df_exploded['chatgpt_response'].tolist():
  if response not in all_proxy_guidelines and response != "Not a proxy guideline.":
    all_proxy_guidelines.append(response)

for response in blackrock_df_exploded['chatgpt_response'].tolist():
  if response not in all_proxy_guidelines and response != "Not a proxy guideline.":
    all_proxy_guidelines.append(response)

In [None]:
# Some OpenAI responses have a '\n' character, split these out into separate elements
blackrock_responses_split = []
vanguard_responses_split = []

for response in blackrock_df_exploded['chatgpt_response'].tolist():
  blackrock_responses_split.append(response.split('\n'))

for response in vanguard_df_exploded['chatgpt_response'].tolist():
  vanguard_responses_split.append(response.split('\n'))

# Add individual response to dataframe
blackrock_df_exploded['chatgpt_response_split'] = blackrock_responses_split
vanguard_df_exploded['chatgpt_response_split'] = vanguard_responses_split

In [None]:
# Add source PDF to each dataframe
blackrock_df_exploded['source'] = 'BlackRock'
vanguard_df_exploded['source'] = 'Vanguard'

In [None]:
# If OpenAI responses are on multiple lines, create one row for each line
blackrock_df_final = blackrock_df_exploded.explode('chatgpt_response_split').reset_index(drop=True)
vanguard_df_final = vanguard_df_exploded.explode('chatgpt_response_split').reset_index(drop=True)

In [None]:
# Combine the dataframes from the two sources into a consolidated dataframe
# This dataframe contains the header, subheader, content text, potential voting guidelines, and responses from OpenAI
combined_df = pd.concat([blackrock_df_final, vanguard_df_final], ignore_index=True)

In [None]:
combined_df.head(10)

Unnamed: 0,header,subheader,content,relevant_sentences,chatgpt_response,source,chatgpt_response_split,final_proxy_guideline,classification
0,,Effective as of January 2024,NM0424U-3545100-1/26 Contents Introduction .....,Virtual meetings Shareholders should have the ...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
1,,Effective as of January 2024,NM0424U-3545100-1/26 Contents Introduction .....,Shareholders should have a meaningful opportun...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
2,Introduction,,BlackRock’s clients depend on us to help them...,3. Contributing to emerging thinking on stewa...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
3,Introduction,,BlackRock’s clients depend on us to help them...,Virtual meetings Shareholders should have the ...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
4,Introduction,,BlackRock’s clients depend on us to help them...,Shareholders should have a meaningful opportun...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
5,Voting guidelines,,These guidelines are divided into eight key t...,Shareholder protections Virtual meetings Share...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
6,Voting guidelines,,These guidelines are divided into eight key t...,Shareholders should have a meaningful opportun...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
7,Boards and directors,,We believe that an effective and well-functio...,Disclosure of material risks that may affect a...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
8,Boards and directors,,We believe that an effective and well-functio...,Virtual meetings Shareholders should have the ...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,
9,Boards and directors,,We believe that an effective and well-functio...,Shareholders should have a meaningful opportun...,Not a proxy guideline.,BlackRock,Not a proxy guideline.,Not a proxy guideline.,


Now, we want a list of all proxy voting guidelines. Since previously, there was no comparison between the BlackRock and Vanguard PDFs, they may contain very similar information. We need the voting guidelines to be unique, so we will de-duplicate ones that are very contextually similar.

In [None]:
# Preprocess text by creating tokens, removing stop words
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum()]
    return [word for word in tokens if word not in stop_words]

In [None]:
# Create a unique list of all proxy guidelines from BlackRock and Vanguard
all_proxy_guidelines = []

for guideline in combined_df['chatgpt_response_split'].tolist():
  if guideline not in all_proxy_guidelines:
    all_proxy_guidelines.append(guideline)

In [None]:
# Remove voting guidelines that are very contextually similar
def preprocess_context(text):
    return ' '.join(text.lower().split())

# Create a vectorization of all the guidelines and create a TF-IDF matrix
context_vectorizer = TfidfVectorizer(preprocessor=preprocess_context)
context_tfidf_matrix = context_vectorizer.fit_transform(combined_df['chatgpt_response_split'].tolist())

In [None]:
# Find the cosine similarities between all guidelines
similarity_matrix = cosine_similarity(context_tfidf_matrix, context_tfidf_matrix)

# Identify duplicates (threshold > 0.75 for similarity)
threshold = 0.75
groups = [] # Contains groups of similar-context guidelines
visited = set()

for i, row in enumerate(similarity_matrix):
    if i not in visited:
        similar = [j for j, score in enumerate(row) if score > threshold and j != i]
        group = [i] + similar
        groups.append(group)
        visited.update(group)

# Reduce guidelines
reduced_guidelines = []
for group in groups:
    # Combine similar guidelines into one statement or pick the first
    merged = ' / '.join([combined_df['chatgpt_response_split'].tolist()[i] for i in group])
    reduced_guidelines.append(merged)

In [None]:
# Create an empty list to hold new reduced guidelines
empty_list = list(['' for _ in range(len(combined_df['chatgpt_response_split'].tolist()))])

In [None]:
# groups

In [None]:
# Add guidelines to the empty list
# If there are multiple guidelines associated with each other, choose the first guideline in the group to represent them all
# i.e. many to one mapping
for list_ in groups:
    empty_list[list_[0]] = combined_df['chatgpt_response_split'].tolist()[list_[0]] # Choose the first guideline in each context group
    if len(list_) > 1:
      for i in range(1, len(list_)):
        empty_list[list_[i]] = combined_df['chatgpt_response_split'].tolist()[list_[0]] # Choose the first guideline in each context group

In [None]:
# Add formatted guidelines to dataframe
# One guideline may appear multiple times in empty_list, but the unique list of values here represents the reduced number of guidelines
combined_df['final_proxy_guideline'] = empty_list

In [None]:
len(reduced_guidelines)

NameError: name 'reduced_guidelines' is not defined

## Step 2: Create a list of voting guidelines with categories
Now that there is a consolidated list of voting guidelines, they must be split into categories and sub-categories. This will be done with LDA to extract topics, and zero-shot classification to map guidelines to topics.

In [None]:
# Some guidelines are appended together with a "/" character, split these out into a final list
final_guidelines = []

for guideline in reduced_guidelines:
  final_guidelines.append(guideline.split(' / ')[0])

In [None]:
# Preprocess guidelines and remove stopwords
guidelines = final_guidelines
vectorizer = TfidfVectorizer(preprocessor=preprocess, tokenizer=lambda x: x)
tfidf_matrix = vectorizer.fit_transform(guidelines)



In [None]:
# Get topics discussed across the voting guidelines
lda = LatentDirichletAllocation(n_components=6, random_state=42)  # Number of topics, manually tweaked for ideal number
lda.fit(tfidf_matrix)

# Print out topics found for each category
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

Topic 0:
['relevant', 'ha', 'company', 'committee', 'proposal', 'shareholder', 'may', 'board', 'vote', 'director']
Topic 1:
['given', 'audit', 'adopt', 'fee', 'committee', 'member', 'board', 'may', 'vote', 'proposal']
Topic 2:
['failing', 'term', 'consider', 'compensation', 'pay', 'limit', 'say', 'vote', 'may', 'director']
Topic 3:
['bonus', 'management', 'sitting', '4', 'board', 'director', 'vote', 'may', 'proposal', 'public']
Topic 4:
['nominating', 'independent', 'ha', 'compensation', 'board', 'shareholder', 'may', 'vote', 'director', 'committee']
Topic 5:
['may', 'pay', 'performance', 'committee', 'majority', 'proposal', 'company', 'compensation', 'vote', 'board']


Categories are compensation, board and directors, independence, corporate governance, shareholder rights. This is based on a manual review of the above code output. These are overarching categories, but the individual guidelines still must be mapped to these categories. We will use zero-shot classification for this task, as it contains pre-trained LLM models.

In [None]:
# Test zero-shot categorization to classify each guideline under one of the 6 categories.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Random selection from the voting guidelines
test_text = "May vote AGAINST a non-executive director serving on more than 4 public boards"

# Categories found from LDA
candidate_labels = ["compensation", "board and directors", "independence", "corporate governance", "shareholder rights"]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


In [None]:
# Get category of sample text
classifier(test_text, candidate_labels)["labels"][0]

'board and directors'

In [None]:
# Use zero-shot classification to classify each requirement
classifications = [] # Array holding all classifications for each voting guideline
for i in range(combined_df.shape[0]):
  if combined_df['final_proxy_guideline'][i] != "Not a proxy guideline.": # Only want to look at actual guidelines
    classifications.append(classifier(combined_df['final_proxy_guideline'][i], candidate_labels)["labels"][0]) # Choose the most likely label (the one with the highest score is first in the list)
  else:
    classifications.append(np.nan) # NULL if it is not a guideline

In [None]:
# Add classification to dataframe
combined_df["classification"] = classifications

In [None]:
# Save progress to pickle
combined_df.to_pickle('/content/drive/My Drive/combined_df.pkl')

In [None]:
# Load pickle back into dataframe if needed
combined_df = pd.read_pickle('/content/drive/My Drive/combined_df.pkl')

In [None]:
# Find count of voting guidelines under each category
combined_df[combined_df['classification'] != np.nan].groupby('classification').count()

Unnamed: 0_level_0,header,subheader,content,relevant_sentences,chatgpt_response,source,chatgpt_response_split,final_proxy_guideline
classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
board and directors,67,29,67,67,67,67,67,67
compensation,31,13,31,31,31,31,31,31
corporate governance,11,4,11,11,11,11,11,11
independence,116,71,116,116,116,116,116,116
shareholder rights,88,60,88,88,88,88,88,88


In [None]:
# Split larger categories into sub categories

# Create an array for each category
independence_guidelines = []
compensation_guidelines = []
board_and_directors_guidelines = []
corporate_governance_guidelines = []
shareholder_rights_guidelines = []

# Iterate through the dataframe, append guidelines under each category to the relevant array
for item in combined_df[combined_df['classification'] == 'independence']['final_proxy_guideline'].tolist():
  if item not in independence_guidelines:
    independence_guidelines.append(item)

for item in combined_df[combined_df['classification'] == 'compensation']['final_proxy_guideline'].tolist():
  if item not in compensation_guidelines:
    compensation_guidelines.append(item)

for item in combined_df[combined_df['classification'] == 'board and directors']['final_proxy_guideline'].tolist():
  if item not in board_and_directors_guidelines:
    board_and_directors_guidelines.append(item)

for item in combined_df[combined_df['classification'] == 'corporate governance']['final_proxy_guideline'].tolist():
  if item not in corporate_governance_guidelines:
    corporate_governance_guidelines.append(item)

for item in combined_df[combined_df['classification'] == 'shareholder rights']['final_proxy_guideline'].tolist():
  if item not in shareholder_rights_guidelines:
    shareholder_rights_guidelines.append(item)

In [None]:
# Export all guidelines and categories to csv for easier processing

# Remove duplicates, as a single guideline may appear a few times in the dataframe
guidelines_and_categories = combined_df[['source','final_proxy_guideline', 'classification']].drop_duplicates()

# Save to csv
guidelines_and_categories.to_csv('guidelines_and_categories.csv', index=False)

In [None]:
# Get OpenAI to create subcategories
# Zero-shot classification may not work here to find subcategories, as they are likely all fairly conextually similar when in the same category
# OpenAI will likely provide more granularity

# Append all the guidelines in a single category into a list string
# For use in the OpenAI prompt
list_of_guidelines = '"['

# Append to list
for item in shareholder_rights_guidelines:
  list_of_guidelines += item + ', '

# Complete the list formatting in the string
list_of_guidelines += ']"'

completion = client.chat.completions.create(
  model="gpt-4o-mini", # OpenAI model to be used
  messages=[
      {"role": "system", "content": "You are a helpful assistant."},
      {
          "role": "user",
          "content": f"I will provide a list of proxy voting guidelines. These ones are related to shareholder rights. Please split these into further subcategories. Only provide a list of the subcategories, do not return any additional text. Please keep it to 3 subcategories max. Here are the guidelines: {list_of_guidelines}"
      }
  ]
)

# Get results of the OpenAI prompt
result = completion.choices[0].message

# Print text of result to evaluate categories
result.content

'1. Disclosure and Transparency\n2. Shareholder Proposals and Voting Rights\n3. Director Accountability and Governance'

In [None]:
# Use zero-shot classification to sub-classify each requirement
def return_sub_classification(guidelines, labels):

  # Use pre-trained LLM for classification
  classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

  sub_classifications = [] # Array to hold sub classifications for each guideline
  for i in range(len(guidelines)):
    sub_classifications.append({
        "guideline": guidelines[i], # Append the actual guideline
        "subcategory": classifier(guidelines[i], labels)["labels"][0] # Append its classification based on the labels and choose the most likely label
      })

  # Return the list of sub classifications
  return sub_classifications

In [None]:
# Get the subclassifications for each category
# Labels are based on OpenAI outputs
compensation_sub_classifications = return_sub_classification(compensation_guidelines, ["Pay and Performance Alignment", "Compensation Committee Acccountability", "Equity Compensation Structures", "Other"])
board_sub_classifications = return_sub_classification(board_and_directors_guidelines, ["Attendance", "Board Composition and Diversity", "Accountability and Oversight", "Other"])
independence_sub_classifications = return_sub_classification(independence_guidelines, ['Board Indepepndence and Structure', 'Committee Accountability and Performance', 'Auditor Independence and Oversight', 'Other'])
governance_sub_classifications = return_sub_classification(corporate_governance_guidelines, ['Committee Accountability and Responsiveness', 'Financial Flexibility and Governance', 'Other'])
rights_sub_classifications = return_sub_classification(shareholder_rights_guidelines, ['Disclosure and Transpararency', 'Shareholder Proposals', 'Voting Rights', 'Other'])

Device set to use cpu


In [None]:
# Create csv files for each categorization for easier processing
pd.DataFrame(compensation_sub_classifications).to_csv('compensation_categories.csv', index=False)
pd.DataFrame(board_sub_classifications).to_csv('board_categories.csv', index=False)
pd.DataFrame(independence_sub_classifications).to_csv('independence_categories.csv', index=False)
pd.DataFrame(governance_sub_classifications).to_csv('governance_categories.csv', index=False)
pd.DataFrame(rights_sub_classifications).to_csv('rights_categories.csv', index=False)

# Step 3: Compare to Microsoft proxy statement
Now, we have a full, cleaned, and categorized list of proxy voting guidelines from BlackRock and Vanguard. Now, we must apply it to Microsoft's proxy statement and observe if there are any violations of the guidelines. This will largely be done using OpenAI, as there are many text comparisons and contextual tasks that need to be completed.

In [None]:
# Use SEC edgar downloader to download a local copy of the statement
# Define parameters for the search
dl = Downloader(company_name="BDO USA, PC", email_address="sashasidach01@gmail.com")

ticker = "MSFT"  # Microsoft's ticker symbol
filing_type = "DEF 14A"  # Filing type for proxy statements
start_date = "2024-01-01"  # Only get filings after 2024

# Download the filing
# This will save the filings to a local directory (default is './sec-edgar-filings/')
filing_count = dl.get(filing_type, ticker, after=start_date)

# Check files that were downloaded, we only want one
print(f"Downloaded {filing_count} {filing_type} filings for {ticker}.")

Downloaded 1 DEF 14A filings for MSFT.


In [None]:
# Get the file in a usable location
import os

# Define the file path
ticker = "MSFT"
filing_type = "DEF 14A"

# Base directory for sec-edgar-downloader
base_dir = "./sec-edgar-filings"

# Get the path to the filing folder
filing_dir = os.path.join(base_dir, ticker, filing_type)

# List all subdirectories (filing IDs) within the DEF 14A folder
filing_ids = os.listdir(filing_dir)

if not filing_ids:
    print(f"No {filing_type} filings found for {ticker}.")
else:
    # Select the first filing (or loop through all if needed)
    filing_id = filing_ids[0]
    file_path = os.path.join(filing_dir, filing_id, "full-submission.txt")

    # Open and read the file
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Print the first 500 characters for confirmation
    print(f"Contents of {file_path}:\n")
    print(content[:2000])  # Preview the beginning of the file


Contents of ./sec-edgar-filings/MSFT/DEF 14A/0001193125-24-242883/full-submission.txt:

<SEC-DOCUMENT>0001193125-24-242883.txt : 20241024
<SEC-HEADER>0001193125-24-242883.hdr.sgml : 20241024
<ACCEPTANCE-DATETIME>20241024163113
ACCESSION NUMBER:		0001193125-24-242883
CONFORMED SUBMISSION TYPE:	DEF 14A
PUBLIC DOCUMENT COUNT:		85
CONFORMED PERIOD OF REPORT:	20241210
FILED AS OF DATE:		20241024
DATE AS OF CHANGE:		20241024

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			MICROSOFT CORP
		CENTRAL INDEX KEY:			0000789019
		STANDARD INDUSTRIAL CLASSIFICATION:	SERVICES-PREPACKAGED SOFTWARE [7372]
		ORGANIZATION NAME:           	06 Technology
		IRS NUMBER:				911144442
		STATE OF INCORPORATION:			WA
		FISCAL YEAR END:			0630

	FILING VALUES:
		FORM TYPE:		DEF 14A
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-37845
		FILM NUMBER:		241393067

	BUSINESS ADDRESS:	
		STREET 1:		ONE MICROSOFT WAY
		CITY:			REDMOND
		STATE:			WA
		ZIP:			98052-6399
		BUSINESS PHONE:		425-882-8080

	MAIL ADDRESS:	
		

The file is in HTML format. We can use this knowledge to extract text based on its position, size, etc. After inspecting the HTML code, the below parameters were found to distinguish headers and subheaders:

Sub Headers:

\<p style="margin-top:12pt; margin-bottom:0pt; font-size:12pt; font-family:arial; font-weight:bold">...\</p>
\<p style="margin-top:0pt; margin-bottom:0pt; font-size:12pt; font-family:arial; font-weight:bold">...\</p>



Headers:

\<span style="font-weight:bold">...\</span>



Section Headers:

\<p style="margin-top:12pt; margin-bottom:0pt; font-size:25pt; font-family:arial; font-weight:bold">...\</p>

In [None]:
# Split into different sections based on the 4 major headers / sections
section_1_text = content.split('<p style="margin-top:0pt; margin-bottom:0pt; font-size:25pt; font-family:arial;font-weight:bold">1. Governance and our Board of Directors ')[1].split(
    '<p style="margin-top:0pt; margin-bottom:0pt; font-size:25pt; font-family:arial;font-weight:bold">2.'
)[0]

section_2_text = content.split('<p style="margin-top:0pt; margin-bottom:0pt; font-size:25pt; font-family:arial;font-weight:bold">2.')[1].split(
    '<p style="margin-top:0pt; margin-bottom:0pt; font-size:25pt; font-family:arial;font-weight:bold">3.'
)[0]

section_3_text = content.split('<p style="margin-top:0pt; margin-bottom:0pt; font-size:25pt; font-family:arial;font-weight:bold">3.')[1].split(
    '<p style="margin-top:0pt; margin-bottom:0pt; font-size:23pt; font-family:arial;font-weight:bold">4.'
)[0]

section_4_text = content.split('<p style="margin-top:0pt; margin-bottom:0pt; font-size:23pt; font-family:arial;font-weight:bold">4.')[1].split(
    '<p style="margin-top:0pt; margin-bottom:0pt; font-size:25pt; font-family:arial;font-weight:bold">5.'
)[0]

In [None]:
# There is some extra HTML formatting within the text, this should be removed as follows:

# Cleaning steps
# Replace &#8217; with '
# Replace <span style="text-decoration:underline"> with ''
# Replace </span> with ''
# Replace <span style="white-space:nowrap"> with ''

In [None]:
# Within each of the 4 sections, extract the body text
# Define the regular expression
text_pattern = r'<p style="margin-top:6pt; margin-bottom:0pt; font-size:9pt; font-family:arial">(.*?)</p>'

subheader_pattern = r'<p style="margin-top:12pt; margin-bottom:0pt; font-size:12pt; font-family:arial;font-weight:bold">(.*?)</p> <p style="margin-top:6pt; margin-bottom:0pt; font-size:9pt; font-family:arial">'

# Use re.findall to extract all matches for body text
section_1_text_sections = re.findall(text_pattern, section_1_text, re.DOTALL)

In [None]:
# Observe the outputs and find any HTML sections that need cleaning
# Replace HTML code with blanks or quotations as necessary
section_1_text_sections_step_1 = []

for section in section_1_text_sections:
  section = section.replace('<span style="text-decoration:underline">', '')
  section = section.replace('</span>', '')
  section = section.replace('<span style="white-space:nowrap">', '')
  section = section.replace('&#8217;', "'")
  section_1_text_sections_step_1.append(section) # Append to cleaned array

In [None]:
# section_1_text_sections_step_1

In [None]:
# After another round of manual review, there are more HTML sections for cleaning
# Replace HTML code with blanks or quotations as necessary
section_1_text_sections_step_2 = []

for section in section_1_text_sections_step_1:
  section = section.replace('&#8220;', '"')
  section = section.replace('&#8221;', '"')
  section = section.replace('&#160;', ' ')
  section = section.replace('&#8211;', '-')
  section_1_text_sections_step_2.append(section) # Append to cleaned array

In [None]:
# Renaming for clarity
section_1_text_sections_final = section_1_text_sections_step_2

In [None]:
# Find all blocks of body text in section 2
section_2_text_sections = re.findall(text_pattern, section_2_text, re.DOTALL)

In [None]:
# Based on the manual review of section 1 and further review, clean all HTML formatting that is present in section 2
# Replace HTML code with blanks or quotations as necessary
section_2_text_sections_clean = []

for section in section_2_text_sections:
  section = section.replace('<span style="text-decoration:underline">', '')
  section = section.replace('</span>', '')
  section = section.replace('<span style="white-space:nowrap">', '')
  section = section.replace('&#8217;', "'")
  section = section.replace('&#8220;', '"')
  section = section.replace('&#8221;', '"')
  section = section.replace('&#160;', ' ')
  section = section.replace('&#8211;', '-')
  section_2_text_sections_clean.append(section) # Append to cleaned array

In [None]:
# Find all blocks of body text in section 3
section_3_text_sections = re.findall(text_pattern, section_3_text, re.DOTALL)

In [None]:
# Based on the manual review of section 1 and further review, clean all HTML formatting that is present in section 3
# Replace HTML code with blanks or quotations as necessary
section_3_text_sections_clean = []

for section in section_3_text_sections:
  section = section.replace('<span style="text-decoration:underline">', '')
  section = section.replace('</span>', '')
  section = section.replace('<span style="white-space:nowrap">', '')
  section = section.replace('&#8217;', "'")
  section = section.replace('&#8220;', '"')
  section = section.replace('&#8221;', '"')
  section = section.replace('&#160;', ' ')
  section = section.replace('&#8211;', '-')
  section_3_text_sections_clean.append(section) # Append to cleaned array

In [None]:
# Find all blocks of body text in section 4
section_4_text_sections = re.findall(text_pattern, section_4_text, re.DOTALL)

In [None]:
# Based on the manual review of section 1 and further review, clean all HTML formatting that is present in section 4
# Replace HTML code with blanks or quotations as necessary
section_4_text_sections_clean = []

for section in section_4_text_sections:
  section = section.replace('<span style="text-decoration:underline">', '')
  section = section.replace('</span>', '')
  section = section.replace('<span style="white-space:nowrap">', '')
  section = section.replace('&#8217;', "'")
  section = section.replace('&#8220;', '"')
  section = section.replace('&#8221;', '"')
  section = section.replace('&#160;', ' ')
  section = section.replace('&#8211;', '-')
  section_4_text_sections_clean.append(section) # Append to cleaned array

In [None]:
# Find number of voting guidelines from each of the two sources
combined_df[combined_df['final_proxy_guideline'] != 'Not a proxy guideline.'].groupby('source').count()

Unnamed: 0_level_0,header,subheader,content,relevant_sentences,chatgpt_response,chatgpt_response_split,final_proxy_guideline,classification
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BlackRock,44,41,44,44,44,44,44,44
Vanguard,269,136,269,269,269,269,269,269


In [None]:
#section_2_text_sections_clean

In [None]:
# Append each paragraph of body text from section 1 into one paragraph
section_1_all_text = ''

# Append each section to the end with a space
for section in section_1_text_sections_final:
  section_1_all_text += (section + ' ')

In [None]:
# Append each paragraph of body text from section 2 into one paragraph
section_2_all_text = ''

# Append each section to the end with a space
for section in section_2_text_sections_clean:
  section_2_all_text += (section + ' ')

In [None]:
# Append each paragraph of body text from section 3 into one paragraph
section_3_all_text = ''

# Append each section to the end with a space
for section in section_3_text_sections_clean:
  section_3_all_text += (section + ' ')

In [None]:
# Append each paragraph of body text from section 4 into one paragraph
section_4_all_text = ''

# Append each section to the end with a space
for section in section_4_text_sections_clean:
  section_4_all_text += (section + ' ')

In [None]:
# Compare section text to guidelines
# For each section and each category of guidelines, use OpenAI to find if/where those guidelines have been violated
section_1_potential_violations = []

# What has been done so far
# independence           x
# compensation           x
# board and directors    x
# shareholder rights     x
# corporate governance   x

# Iterate over each guideline
for item in board_and_directors_guidelines:

    # The prompt here looks at the individual guideline, and finds anywhere in the section of Microsoft proxy text that might violate this guideline
    # Returns the guideline that has been violated, as well as what text in the proxy statement shows the violation of that guideline
    # Returns "Not Applicable" if no violation has been found
    completion = client.chat.completions.create(
      model="gpt-4o-mini", # Model to be used
      messages=[
          {"role": "system", "content": "You are a helpful assistant."},
            {
              "role": "user",
              "content": f'I will provide a proxy voting guideline for BlackRock. This outlines if they will vote FOR or AGAINST certain policies and actions that take place in the company they invest in. In this case, the company being invested in, and that BlackRock has a decision in, is Microsoft. I will also provide a piece of text from Microsofts proxy statement, which outlines all information that is relevant to shareholders. I want you to find Microsofts policies and information from the proxy statement that BlackRock would vote against based on the guideline I provide. If they would vote AGAINST something in the proxy statement, please return "AGAINST - " along with the guideline that I provide. Please also provide the exact quote in Microsofts proxy statement that shows what is being voted against. The output should look like "VIOLATION - [specific guideline]. TEXT - [quote fromm proxy statement]". If it is not applicable, please return "Not Applicable". Only return what I asked you to, no additional text. Here is the BlackRock guideline: "{item}". Here is the text from the Microsoft proxy statement: "{section_1_all_text}"'
            }
        ]
      )

    # Get results from prompt
    result = completion.choices[0].message

    # Append text of result to the potential violations
    section_1_potential_violations.append(result.content)


# Note that this process has been done for each combination of sections / guidelines
# The sections have been removed for clarity, as there is much repetitive code


In [None]:
# Print results to evaluate
section_1_potential_violations

['AGAINST - May vote AGAINST directors who we do not consider to be independent. TEXT - "The Board has determined that Mses. List, MacGregor, Peterson, Pritzker, and Walmsley, and Messrs. Hoffman, Johnston, Mason, Rodriguez, Scharf, and Stanton (as well as Padmasree Warrior and John Thompson, whose Board service ended on December 7, 2023) meet the standards of independence under our Corporate Governance Guidelines, the director independence guidelines, and applicable Nasdaq listing standards, including that each member is free of any relationship that would interfere with his or her individual exercise of independent judgment."',
 'AGAINST - May vote AGAINST a director who has acted in a manner that compromises their ability to represent the best long-term economic interests of shareholders. TEXT - "Mr. Hoffman is a co-founder of and serves on the board of directors of Inflection. As of the date of the agreement with Inflection, Reprogrammed Interchange LLC ("Reprogrammed") and entitie

In [None]:
# There are many potential violations found by OpenAI
# After manual review, it seems that many of them are irrelevant / the proxy text does not provide the proper context
# Use OpenAI again to review each guideline and violation, and judge if it is an actual violation
final_section_4_violations = [] # Final list of violations + guidelines

# Iterate through each potential violation
for item in section_4_potential_violations:

    # Ignore ones that are not applicable, append "Not Applicable" to determine which are applicable
    if item == "Not Applicable":
      final_section_4_violations.append(item)

    else:

      # The prompt here provides the guideline, as well as the text that may or may not violate that guideline
      # If it is a violation, the prompt will return the violation and the guideline, as formatted in the input
      # If it is not a violation, it will return "Not Applicable"
      completion = client.chat.completions.create(
        model="gpt-4o-mini", # Model to be used
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
              {
                "role": "user",
                "content": f'I will provide you with a  string of the format "AGAINST - [guideline]. TEXT - ...". The guideline is for proxy voting, and states a policy that an investment firm will vote AGAINST for the company it holds shares in. The TEXT states a part of the companys policies, which the investment firm may vote against. If the text gives enough context and would be voted against based on the guideline, please return the string that I give you, with no additional response. If the TEXT does not provide enough context to say if the investment firm will vote for or against, please return "Not Applicable". Here is the string: {item}'
              }
          ]
        )

      # Get results from prompt
      result = completion.choices[0].message

      # Append text of result to the potential violations
      final_section_4_violations.append(result.content)


# Note that this step was taken for all 4 sections
# The sections have been removed for clarity, as there is much repetitive code



In [None]:
final_section_4_violations

['Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'Not Applicable',
 'AGAINST - May vote AGAINST a director who has acted in a manner that compromises their ability to represent the best long-term economic interests of shareholders. TEXT - "Involvement in the development of weapons poses a serious risk to a company\'s reputation, especially for investors and stakeholders."',
 'Not 