In [None]:
# for Google Colab
# !pip install markdown-it-py
# !pip install pymupdf4llm

# for VS Code
# %pip install markdown-it-py
# %pip install pymupdf4llm

In [11]:
import re
import os
import json
from collections import OrderedDict

import pymupdf4llm
from markdown_it import MarkdownIt
import pandas as pd

In [32]:
class TextCleaner():
    """
    Cleans transcript text by removing dot strings, replacing newlines, and fixing sentence spacing.

    The three steps run in a fixed order on ``self.text``:

    1. runs of four or more dots are deleted,
    2. every newline becomes a single space,
    3. a missing space is inserted after a period that is immediately
       followed by a capital letter.

    Step 3 deliberately leaves dotted abbreviations such as ``U.S.`` or
    ``J.P.`` alone: a period that follows a stand-alone capital letter is
    treated as part of an abbreviation, not as the end of a sentence.
    """

    # Four or more consecutive dots.
    _DOT_STRING = re.compile(r'\.{4,}')

    # A period directly followed by a capital letter, unless the period comes
    # right after a single stand-alone capital (``\b[A-Z]``), which is how
    # initials and abbreviations like "U.S." look.
    _MISSING_SPACE = re.compile(r'(?<!\b[A-Z])\.(?=[A-Z])')

    def __init__(self):
        # Working buffer; overwritten on every call to ``clean``.
        self.text = ''

    def clean(self, text):
        """
        Cleans the given text by removing dot strings, replacing newlines, and fixing sentence spacing.

        Args:
            text (str): The text to be cleaned.

        Returns:
            str: The cleaned text. The same value is also left in ``self.text``.
        """
        self.text = text
        self._remove_dot_strings()
        self._replace_newline()
        self._fix_sentence_spacing()
        return self.text

    def _remove_dot_strings(self):
        """Delete every run of four or more dots from ``self.text``."""
        self.text = self._DOT_STRING.sub('', self.text)

    def _replace_newline(self, ):
        """Replace each newline in ``self.text`` with a single space."""
        self.text = self.text.replace('\n', ' ')

    def _fix_sentence_spacing(self):
        """Insert the missing space in ``word.Next`` without splitting ``U.S.``-style abbreviations."""
        self.text = self._MISSING_SPACE.sub('. ', self.text)


class JPMMarkdownParser():
    """Parser for JPM Earnings Call Transcripts.

        Input: markdown text, or the path to a markdown file.
        Output: a JSON string (see ``get_transcript``) encoding:
        {
            section_number: {
                name: name of the speaker,
                job_title: job title of the speaker,
                type: 'Q' or 'A',
                content: transcript content
            }
        }

        How markdown tokens are mapped:
        - an ``h4`` heading starts a new section and supplies ``name``
        - an ``h6`` heading supplies ``job_title``
        - an ``h1`` heading supplies ``type``
        - every paragraph is appended to ``content`` of the current section

        Section 0 collects whatever appears before the first ``h4`` heading,
        so it may have no ``name`` and may even be empty. Any key may be
        missing from a section if the matching token never appeared.

        Initialise with the markdown and call the "parse" method to process.
        Use parser.get_transcript() to get the JSON string.
    """

    def __init__(self, markdown_text=None, markdown_file=None):
        """
        Args:
            markdown_text (str, optional): Markdown source. Used when
                ``markdown_file`` is not given; an empty string is accepted.
            markdown_file (str, optional): Path to a markdown file. Takes
                precedence over ``markdown_text``.

        Raises:
            ValueError: If neither argument is supplied.
        """
        self.transcript = OrderedDict()
        if markdown_file:
            # Read as UTF-8 explicitly so non-ASCII names (e.g. "Société
            # Générale") do not depend on the platform's default encoding.
            with open(markdown_file, 'r', encoding='utf-8') as f:
                self.markdown_content = f.read()
        elif markdown_text is not None:
            self.markdown_content = markdown_text
        else:
            # Fail here instead of with an AttributeError later in parse().
            raise ValueError("Provide either markdown_text or markdown_file.")
        self.md = MarkdownIt()
        self._set_current_section(0)
        self.transcript[self.current_section] = {}
        self.textcleaner = TextCleaner()

    def _get_tokens(self):
        """Tokenise the markdown source into ``self.tokens``."""
        self.tokens = self.md.parse(self.markdown_content)

    def _is_name(self, token):
        """True for the opening token of an ``h4`` heading (speaker name)."""
        return (token.type == 'heading_open') and (token.tag == 'h4')

    def _is_job_title(self, token):
        """True for the opening token of an ``h6`` heading (job title)."""
        return (token.type == 'heading_open') and (token.tag == 'h6')

    def _is_qora(self, token):
        """True for the opening token of an ``h1`` heading (question/answer marker)."""
        return (token.type == 'heading_open') and (token.tag == 'h1')

    def _is_paragraph(self, token):
        """True for the opening token of a paragraph."""
        return token.type == 'paragraph_open'

    def _new_section(self):
        """Create an empty section numbered one past the current one and make it current."""
        section = self._get_current_section() + 1
        self.transcript[section] = {}
        self._set_current_section(section)

    def _get_current_section(self):
        return self.current_section

    def _set_current_section(self, section):
        self.current_section = section

    def _add_name(self, name):
        self.transcript[self._get_current_section()]['name'] = name

    def _add_job_title(self, job_title):
        self.transcript[self._get_current_section()]['job_title'] = job_title

    def _add_qora(self, qora):
        self.transcript[self._get_current_section()]['type'] = qora

    def _add_content(self, content):
        """Append ``content`` to the current section, space-separated from any earlier text."""
        entry = self.transcript[self._get_current_section()]
        if 'content' in entry:
            entry['content'] += ' ' + content
        else:
            entry['content'] = content

    def _merge_duplicate_sections(self):
        """Fold consecutive sections by the same speaker into the first of the run.

        The surviving section keeps its own ``job_title`` and ``type``; only
        ``content`` is concatenated. Absorbed sections are removed, so the
        remaining section numbers may have gaps.
        """
        kept = None      # key of the most recent section that survives
        absorbed = []    # keys to drop once iteration is finished
        for section, entry in self.transcript.items():
            previous = self.transcript[kept] if kept is not None else None
            same_speaker = (
                previous is not None
                and 'name' in entry
                and previous.get('name') == entry['name']
            )
            if not same_speaker:
                kept = section
                continue
            # Always merge into the section that survives. Merging into the
            # immediate predecessor would lose text whenever that predecessor
            # is itself about to be removed (three or more in a row).
            if 'content' in entry:
                if 'content' in previous:
                    previous['content'] += ' ' + entry['content']
                else:
                    previous['content'] = entry['content']
            absorbed.append(section)
        # Removal is deferred: the dict must not change size while iterating.
        for section in absorbed:
            self.transcript.pop(section)

    def _clean_text(self):
        """Run the text cleaner over the ``content`` of every section that has one."""
        for entry in self.transcript.values():
            # A section may have no paragraphs at all (e.g. section 0 when the
            # document opens with a speaker heading); nothing to clean then.
            if 'content' not in entry:
                continue
            entry['content'] = self.textcleaner.clean(entry['content'])

    def parse(self, merge=True, clean=True):
        """Tokenise the markdown and fill ``self.transcript``.

        Args:
            merge (bool): Merge consecutive sections that share a speaker name.
            clean (bool): Run the text cleaner over each section's content.
        """
        self._get_tokens()
        for i, t in enumerate(self.tokens):
            # The text of a heading or paragraph is carried by the token that
            # immediately follows its opening token.
            if self._is_name(t):
                self._new_section()
                self._add_name(self.tokens[i+1].content)

            if self._is_job_title(t):
                self._add_job_title(self.tokens[i+1].content)

            if self._is_qora(t):
                self._add_qora(self.tokens[i+1].content)

            if self._is_paragraph(t):
                self._add_content(self.tokens[i+1].content)

        if merge:
            self._merge_duplicate_sections()

        if clean:
            self._clean_text()

    def get_transcript(self):
        """Return the transcript serialised as an indented JSON string.

        Section numbers become string keys in the JSON output.
        """
        return json.dumps(self.transcript, indent=4)


class JPMTranscriptProcessor():
    """
    Processor for JPM earnings transcripts.
        Input: path to PDF file
        Output: CSV file, one row per transcript section, with the columns
        that the parsed sections provide:
        - name: name of the speaker
        - job_title: job title of the speaker
        - type: 'Q' or 'A'
        - content: transcript content

        The CSV is named after the PDF (extension replaced by ``.csv``) and
        is written without the section-number index.

        Initialise with the path to the PDF file and call the "process" method to process.
        Use the "dataframe_path" argument to specify the directory to save the CSV file in.

    """
    def __init__(self, path_to_pdf):
        """
        Args:
            path_to_pdf (str): Path to the transcript PDF.
        """
        self.path_to_pdf = path_to_pdf
        self._get_base_filename(path_to_pdf)

    def _get_base_filename(self, path):
        """Store the file name of ``path`` without its final extension in ``self.base_filename``."""
        # splitext strips only the last extension, so dots inside the name
        # (e.g. "jpm.2024.3q.pdf") are preserved rather than truncated.
        self.base_filename = os.path.splitext(os.path.basename(path))[0]

    def _pdf_to_markdown(self, save_md=False):
        """Convert the PDF to markdown text in ``self.md_text``.

        ``save_md`` is accepted for compatibility but currently unused.
        """
        self.md_text = pymupdf4llm.to_markdown(self.path_to_pdf)

    def _parse_markdown(self, save_json=False):
        """Parse ``self.md_text`` and keep the JSON string in ``self.transcript_json``.

        ``save_json`` is accepted for compatibility but currently unused.
        """
        self.parser = JPMMarkdownParser(markdown_text=self.md_text)
        self.parser.parse()
        self.transcript_json = self.parser.get_transcript()

    def _json_to_dataframe(self):
        """Load the transcript JSON into ``self.df``, one row per section."""
        sections = json.loads(self.transcript_json)
        self.df = pd.DataFrame.from_dict(sections, orient='index')

    def _save_dataframe(self):
        """Write ``self.df`` to ``<dataframe_path>/<base_filename>.csv``.

        The output directory is created first if it does not exist. Any
        error raised while writing propagates unchanged.
        """
        # An empty path means "current directory"; makedirs('') would raise.
        if self.dataframe_path:
            os.makedirs(self.dataframe_path, exist_ok=True)
        path_to_save = os.path.join(self.dataframe_path, self.base_filename + '.csv')
        self.df.to_csv(path_to_save, index=False)

    def process(self, dataframe_path='./csv_transcripts/'):
        """Run the full pipeline: PDF -> markdown -> parsed sections -> DataFrame -> CSV.

        Args:
            dataframe_path (str): Directory to write the CSV into.
        """
        self.dataframe_path = dataframe_path
        self._pdf_to_markdown()
        self._parse_markdown()
        self._json_to_dataframe()
        self._save_dataframe()

In [33]:
# Convert the 3Q 2024 transcript PDF to CSV. process() is called without
# arguments, so the CSV goes to its default directory './csv_transcripts/'.
pdf_path = './transcripts/JPM/jpm-2024-3q-earnings-call-transcript-final.pdf'
processor = JPMTranscriptProcessor(pdf_path)
processor.process()


Processing ./transcripts/JPM/jpm-2024-3q-earnings-call-transcript-final.pdf...


In [40]:
# Load the 3Q 2024 CSV from the output directory and show it as the cell result.
df = pd.read_csv('./csv_transcripts/jpm-2024-3q-earnings-call-transcript-final.csv')
df

Unnamed: 0,content,name,job_title,type
0,"**Operator: Good morning, ladies and gentleme...",,,
1,"Thank you and good morning, everyone. Starting...",Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",
2,"Hey, good morning. So, Jeremy, as you highligh...",Jim Mitchell,"Analyst, Seaport Global Securities LLC",Q
3,"Yeah. Sure, Jim. I'll try to answer both quest...",Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",A
4,All right. Thanks a lot.,Jim Mitchell,"Analyst, Seaport Global Securities LLC",Q
...,...,...,...,...
79,Okay. That's helpful. So it sounds like you're...,Saul Martinez,"Analyst, HSBC Securities (USA), Inc.",Q
80,Very good returns in business in both Banking ...,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C...",A
81,Yes. Got it. Okay. That's helpful. Thanks a lo...,Saul Martinez,"Analyst, HSBC Securities (USA), Inc.",Q
82,"Thanks, Saul. Thanks, everyone.",Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",A


In [42]:
# Convert the 3Q 2021 transcript PDF to CSV (default output directory).
# NOTE: this rebinds `pdf_path` and `processor` from the earlier cell.
pdf_path = 'transcripts/JPM/jpm-2021-3q-earnings-Call-Final-Transcript-PDF.pdf'
processor = JPMTranscriptProcessor(pdf_path)
processor.process()

Processing transcripts/JPM/jpm-2021-3q-earnings-Call-Final-Transcript-PDF.pdf...


In [43]:
# Load the 3Q 2021 CSV and show it as the cell result.
# NOTE: this rebinds `df`, replacing the 3Q 2024 frame loaded earlier.
df = pd.read_csv('csv_transcripts/jpm-2021-3q-earnings-Call-Final-Transcript-PDF.csv')
df

Unnamed: 0,content,name,job_title,type
0,"**Operator:** Good morning, ladies and gentle...",,,
1,"Thanks, operator. Good morning, everyone. The ...",Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",
2,"Good morning, Jeremy. Wanted to ask about the ...",John E. McDonald,"Analyst, Autonomous Research",Q
3,"Yeah. John, good question and good calc there....",Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",A
4,"Okay. And as a follow up, your cash balances c...",John E. McDonald,"Analyst, Autonomous Research",Q
...,...,...,...,...
80,No. It's the opposite. I'm telling you I don't...,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C...",A
81,Right. Got it. Thanks for the clarity on that....,Andrew Lim,"Analyst, Société Générale SA (UK)",Q
82,"Yeah. So I think the base case, the central ca...",Jeremy Barnum,"Chief Financial Officer, JPMorgan Chase & Co.",A
83,"Jeremy, just really quickly. I've got to go be...",Jamie Dimon,"Chairman & Chief Executive Officer, JPMorgan C...",A
