In [836]:
import re
import pandas as pd
from pypdf import PdfReader

#extracting text from the pdf
def extract_pdf_text(pdf_file):
    """Return the extracted text of every page of ``pdf_file`` as one string.

    Pages are read in order and the text of each page is followed by a single
    newline character.

    Args:
        pdf_file: Path of the PDF to read; it is opened in binary mode.

    Returns:
        str: The concatenated page texts ("" when the PDF has no pages).
    """
    with open(pdf_file, "rb") as handle:
        pdf = PdfReader(handle)
        # Extraction has to happen while the file handle is still open.
        page_chunks = [
            pdf.pages[index].extract_text() + "\n"
            for index in range(len(pdf.pages))
        ]

    return "".join(page_chunks)

#cleaning the text by removing the headers and footers
def remove_date_name(text):
    """Strip page counters and company/date header lines from ``text``.

    The patterns are applied one after another, each on the output of the
    previous one, and the final result is stripped of surrounding whitespace.

    Args:
        text: Raw transcript text.

    Returns:
        str: ``text`` without the matched header/footer fragments.
    """
    header_footer_patterns = (
        r'Page\s+\d+\s+of\s+\d+',
        r'Page \d+ of \d+\n?',
        r'Company Name\s*\n?(January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{1,2}, \d{4}\n?',
        r'Company Name Limited\s*\n?(January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{1,2}, \d{4}\n?',
    )

    result = text
    for regex in header_footer_patterns:
        result = re.sub(regex, '', result)

    return result.strip()

#separating the text into the intro part and the QnA part
def separate_sections(text, pattern):
    """Split ``text`` at the first case-insensitive match of ``pattern``.

    Args:
        text: Full transcript text.
        pattern: Regular expression marking where the QnA session starts.

    Returns:
        tuple[str, str]: ``(intro_part, qna_part)``, both stripped. The matched
        text itself is the beginning of ``qna_part``. When ``pattern`` is not
        found, the whole text is the intro and the QnA part is "".
    """
    hit = re.search(pattern, text, flags=re.IGNORECASE)

    if hit is None:
        return text.strip(), ""

    split_at = hit.start()
    return text[:split_at].strip(), text[split_at:].strip()

#lowercasing the text and removing stopwords (read from stopwords.txt) and non-alphabetic tokens
def preprocess_text(text):
    """Lowercase ``text`` and drop stopwords and non-alphabetic tokens.

    Stopwords are read from ``stopwords.txt`` in the current working directory,
    one per line. Every non-blank line counts, including a last line without a
    trailing newline, and entries are compared case-insensitively because the
    tokens are lowercased before the comparison.

    Args:
        text: Text to normalise; it is tokenised on whitespace.

    Returns:
        str: The surviving lowercase tokens joined by single spaces.

    Raises:
        FileNotFoundError: If ``stopwords.txt`` does not exist.
    """
    with open("stopwords.txt", "r") as f:
        # A set gives constant-time lookups; blank lines are ignored.
        stopwords = {line.strip().lower() for line in f if line.strip()}

    # NOTE: str.isalpha() rejects any token carrying digits or punctuation,
    # so a word with an attached comma or full stop is discarded entirely.
    kept = [
        word
        for word in (token.lower() for token in text.split())
        if word not in stopwords and word.isalpha()
    ]
    return " ".join(kept)

#trimming a run of three spaces before a line break down to two spaces
def blank_spaces(text):
    """Replace every "three spaces + newline" with "two spaces + newline".

    Args:
        text: Text to adjust.

    Returns:
        str: ``text`` with each matched occurrence shortened by one space.
    """
    return re.sub(r' {3}\n', '  \n', text)

#analysing the text and categorising as per LM dictionary
def analyze_text(text, lm_dict):
    """Count how many tokens of ``text`` fall into each LM dictionary category.

    A dictionary word belongs to a category when its value in that category's
    column is non-zero. Dictionary words are lowercased before the comparison;
    ``text`` is tokenised on whitespace and compared as-is, so it is expected
    to be lowercase already.

    Args:
        text: Pre-processed text.
        lm_dict: DataFrame with a ``Word`` column and the columns ``Positive``,
            ``Negative``, ``Uncertainty``, ``Litigious``, ``Strong_Modal``,
            ``Weak_Modal``, ``Constraining`` and ``Complexity``.

    Returns:
        dict: Total token count under "Number of words" followed by one count
        per category, in a fixed key order.

    Raises:
        KeyError: If ``lm_dict`` lacks one of the required columns.
    """
    # (result key, dictionary column) in the order the keys must appear.
    categories = (
        ("Uncertain words", "Uncertainty"),
        ("Positive words", "Positive"),
        ("Negative words", "Negative"),
        ("Litigious words", "Litigious"),
        ("Strong Modal", "Strong_Modal"),
        ("Weak Modal", "Weak_Modal"),
        ("Constraints words", "Constraining"),
        ("Complexity", "Complexity"),
    )

    tokens = text.split()  # tokenise once instead of once per category
    lowered_words = lm_dict["Word"].str.lower()

    results = {"Number of words": len(tokens)}
    for label, column in categories:
        # Sets make each membership test O(1); missing words (NaN) can never
        # equal a token, so dropping them does not change any count.
        vocabulary = set(lowered_words[lm_dict[column] != 0].dropna())
        results[label] = sum(1 for token in tokens if token in vocabulary)

    return results

#inserting a line break before each speaker label (a name followed by a colon)
def add_newline_before_names(text):
    """Insert a newline before every speaker label found in ``text``.

    A speaker label is any span matching one of the name patterns below (a
    name-like token sequence ending in a colon). A newline is inserted in front
    of a label unless the label starts the text or already follows a newline.

    Args:
        text: Transcript text.

    Returns:
        str: ``text`` with each speaker label starting on its own line.
    """
    # Order matters: at a given position the first alternative that matches
    # wins, so the list order is preserved exactly.
    name_patterns = [
        r'\b[A-Z][a-z]+ [A-Z][a-z]+:',
        r'\b[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\b[A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+:',
        r'\b[A-Z] [A-Z][a-z]+:',
        r'\b[A-Z]{2} [A-Z][a-z]+:',
        r'\b[A-Z]\.[A-Z]\. [A-Z][a-z]+:',
        r'\b[A-Z]\. [A-Z]\. [A-Z][a-z]+:',
        r'\b[A-Z]\.[A-Z]\s[A-Z][a-z]+:',
        r'\b[A-Z]\s[A-Z][a-z]+:',
        r'\bDr\. [A-Z][a-z]+ [A-Z][a-z]+:',
        # A comma is required here: without it this literal and the next one
        # are concatenated into a single pattern and the second form is lost.
        r'\b[A-Z]\.\s[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z]\.[A-Z]\. [A-Z][a-z]+ [A-Z][a-z]+:',
        r'\bDr\. [A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+:',
        r'\b[A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+:',
        # Plain {2}: a trailing '+' is a possessive quantifier, which matches
        # the same text here but is a compile error before Python 3.11.
        r'\b[A-Z]{2}\s[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\b[A-Z][a-z]+\s[A-Z]\s[A-Z][a-z]+:',
        r'\b[A-Z]\. [A-Z][a-z]+ [A-Z][a-z]+:',
        r'\bDr\. [A-Z][a-z]+ [A-Z]\b [A-Z][a-z]+:',
        r'\b[A-Z]\. [A-Z][a-z]+:',
        r'\b[A-Z][a-z]+ [A-Z]+:',
        r'\b[A-Z]\. [A-Z]+:',
        r'\b[A-Z][a-z]+:',
        r'\b[A-Za-z]+:',
        r'\b[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z][a-z]+\s[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z]\.\s[A-Z]\.\s[A-Z][a-z]+:',
        r'\b([A-Z]\.){3}\s[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\b([A-Z]\.\s){3}[A-Z][a-z]+\s[A-Z][a-z]+:',
        # Plain {3} for the same reason as above.
        r'\b[A-Z][a-z]+\s[A-Z][a-z]+\s([A-Z]\.){3}:',
        r'\b[A-Z][a-z]+\s[A-Z]\.\s[A-Z]\.:',
        r'\b[A-Z][a-z]+\s[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z]\.\s[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\b[A-Z]\s[A-Z]\s[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\bDr\.\s[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\b[A-Z]\.\s[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z]\.\s[A-Z]\s[A-Z][a-z]+:',
        r'\b[A-Z]\.[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z]{2}\s[A-Z][a-z]+:',
        r'\b[A-Z][a-z]+-[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\b([A-Z]\.){2}[A-Z][a-z]+:',
        r'\b(Mr|Ms|Mrs|Dr|Prof)\.\s[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\b(Mr|Ms|Mrs|Dr|Prof)\.\s([A-Z]\.\s){2}[A-Z][a-z]+:',
        r'\b(Mr|Ms|Mrs|Dr|Prof)\.\s[A-Z]\s[A-Z][a-z]+:',
        r'\b(Mr|Ms|Mrs|Dr|Prof)\.\s[A-Z]{2}\s[A-Z][a-z]+:',
        r'\b[A-Z][a-z]+\s[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z]\.\s[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z]\.[A-Z]\.\s[A-Z][a-z]+:',
        r'\b[A-Z]\.\s[A-Z][a-z]+\s[A-Z][a-z]+:',
        r'\b[A-Z][a-z]+\s[A-Z][’\w]+:',
        r'\b([A-Z]\s)+[A-Z][a-z]+:'
    ]

    regex = re.compile(r'|'.join(name_patterns))

    def _start_on_new_line(match):
        # Leave the label alone at the very start of the text or when it
        # already follows a newline; otherwise prefix it with one.
        start = match.start()
        if start > 0 and text[start - 1] != '\n':
            return '\n' + match.group(0)
        return match.group(0)

    return regex.sub(_start_on_new_line, text)


def clean_text(text):
    """Join hard-wrapped lines by turning lone newlines into spaces.

    A newline is replaced by a space when it is not preceded by another newline
    and is not followed by a speaker label of the form ``Name :`` (one to three
    capitalised words, then a space and a colon), optionally indented by three
    spaces. The result is stripped of surrounding whitespace.

    Args:
        text: Transcript text.

    Returns:
        str: The re-flowed text.
    """
    single_break = r'(?<!\n)\n(?!([A-Z][a-z]*)(?: [A-Z][a-z]*){0,2} :| {3}[A-Z][a-z]*(?: [A-Z][a-z]*){0,2} :)'
    return re.sub(single_break, ' ', text).strip()

#separating questions and answers from the QnA part based on the speakers
def separate_questions_answers(text, management):
    """Split QnA text into non-management and management paragraphs.

    ``text`` is split on newlines and each paragraph is stripped. A paragraph
    counts as spoken by management when it begins with one of the
    ``management`` names followed by optional whitespace and a colon.

    Args:
        text: QnA part of the transcript, one speaker turn per line.
        management: Names of the company's management speakers.

    Returns:
        tuple[str, str]: ``(non_management_text, management_text)``, i.e. the
        questions first and the management answers second, each joined with
        single spaces.
    """
    speaker_label = re.compile(r'\b(' + '|'.join(map(re.escape, management)) + r')\b\s*:')

    management_lines = []
    other_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        bucket = management_lines if speaker_label.match(line) else other_lines
        bucket.append(line)

    return " ".join(other_lines), " ".join(management_lines)

#analysis
def process_pdf_analysis(pdf_file, lm_dict_file, qna_pattern=r"", management=None):
    """Run the full sentiment word count on one earnings-call transcript.

    The transcript is extracted and cleaned, speaker labels are put on their
    own lines, the text is split into the introductory part and the QnA part,
    and the QnA part is split into questions and management answers. Each of
    the three texts is pre-processed and counted against the LM dictionary.

    Args:
        pdf_file: Path of the transcript PDF.
        lm_dict_file: Path of the LM dictionary CSV.
        qna_pattern: Regular expression for the line that separates the
            introduction from the QnA session. The default empty pattern
            matches at position 0, so the intro is empty and the whole
            transcript is treated as QnA.
        management: Names of the company's management speakers. Defaults to an
            empty list, in which case no paragraph is attributed to management
            and everything is counted as questions.

    Returns:
        tuple[dict, dict, dict]: ``(results_intro, results_questions,
        results_answers)`` as produced by ``analyze_text``.
    """
    if management is None:
        management = []

    lm_dict = pd.read_csv(lm_dict_file)

    all_text = extract_pdf_text(pdf_file)
    all_text = remove_date_name(all_text)
    all_text = clean_text(all_text)
    all_text = add_newline_before_names(all_text)

    intro_part, qna_part = separate_sections(all_text, qna_pattern)
    results_intro = analyze_text(preprocess_text(intro_part), lm_dict)

    qna_part = blank_spaces(qna_part)
    # Return order of the helper: non-management text, then management text.
    questions_text, answers_text = separate_questions_answers(qna_part, management)

    results_questions = analyze_text(preprocess_text(questions_text), lm_dict)
    results_answers = analyze_text(preprocess_text(answers_text), lm_dict)

    return results_intro, results_questions, results_answers

#saving the results
def save_results_to_csv(results, csv_file, company, year):
    """Append the three section results for one transcript to ``csv_file``.

    Args:
        results: Three result dicts, in the order introductory part,
            questions, answers.
        csv_file: Path of the CSV to append to; it is created when missing.
        company: Value for the "Company" index level.
        year: Value for the "Year" index level.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build a single table.

    Raises:
        ValueError: If ``results`` does not contain exactly three rows.
    """
    import os  # local import: only this function needs it

    sections = ("Introductory Part", "Questions", "Answers")
    multi_index = pd.MultiIndex.from_tuples(
        [(company, year, section) for section in sections],
        names=["Company", "Year", "Section"],
    )
    df = pd.DataFrame(results, index=multi_index)

    try:
        # An existing but empty file still needs the header row.
        needs_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0
    except (TypeError, ValueError):
        # csv_file is not a filesystem path (e.g. an open buffer).
        needs_header = True

    df.to_csv(csv_file, mode='a', header=needs_header)

In [837]:
# --- Inputs: fill these in before running the cell ---
pdf_file = r"" # path of the earnings-call transcript PDF to analyse
lm_dict_file = "Loughran-McDonald_MasterDictionary.csv" # LM dictionary CSV, read with pd.read_csv
csv_file = "final_sentiment_count.csv" # output CSV; results are appended to it

company = "" # company name, written to the "Company" index level
year = "" # FY and financial quarter, written to the "Year" index level


# Run the pipeline; the three result dicts must stay in the order
# intro, questions, answers because save_results_to_csv labels them by position.
results_intro, results_questions, results_answers = process_pdf_analysis(pdf_file, lm_dict_file)
results = [results_intro, results_questions, results_answers]
save_results_to_csv(results, csv_file, company, year)

print(f"Results saved to {csv_file}")

Results saved to final_sentiment_count.csv
