In [2]:
import os
import requests
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Load API keys from a local .env file into the process environment.
# FMP_API_KEY is read later via os.getenv; the OpenAI key is presumably picked up by ChatOpenAI - confirm.
load_dotenv()

True

In [4]:
# Chat model used by this notebook: a pinned gpt-4o-mini snapshot with temperature 0.6
chatLLM = ChatOpenAI(
    temperature=0.6,
    model="gpt-4o-mini-2024-07-18",
)

In [5]:
def get_transcripts(symbol, num_quarters=8, timeout=30):
    """Fetch the most recent earnings-call transcripts for a ticker from Financial Modeling Prep.

    The latest available transcript is looked up first to learn the current
    (year, quarter); the function then walks backwards one quarter at a time.

    Args:
        symbol: Ticker symbol, e.g. 'AAPL'.
        num_quarters: How many consecutive quarters to request, counting back
            from the latest one (default 8, i.e. two years).
        timeout: Seconds to wait for each HTTP request before requests raises
            a timeout error (previously the calls could block forever).

    Returns:
        A list of transcript records (the first element of each API response),
        newest first. Quarters with no transcript are skipped and reported via
        print, so the list may be shorter than num_quarters. Returns None when
        the API key is missing or the latest transcript cannot be retrieved.
    """
    api_key = os.getenv("FMP_API_KEY")
    if not api_key:
        # Without a key every request would be sent with apikey=None; stop early instead.
        print("FMP_API_KEY is not set")
        return None

    base_url = f"https://financialmodelingprep.com/api/v3/earning_call_transcript/{symbol}"

    def fetch(params):
        """Return (status_code, records); records is a non-empty list or None."""
        response = requests.get(
            base_url,
            params={**params, "apikey": api_key},  # let requests build and encode the query string
            timeout=timeout,
        )
        if response.status_code != 200:
            return response.status_code, None
        data = response.json()
        # Only a non-empty list is treated as transcripts: any other truthy JSON
        # payload (e.g. an error object) would break the data[0] lookup.
        if isinstance(data, list) and data:
            return response.status_code, data
        return response.status_code, None

    # Retrieve the latest transcript to determine the current year and quarter
    status, latest_data = fetch({})
    if status != 200:
        print(f"Failed to retrieve latest transcript: {status}")
        return None
    if not latest_data:
        print("No transcripts found")
        return None

    latest_transcript = latest_data[0]
    # Absolute zero-based quarter index, so stepping back across year boundaries is plain subtraction.
    latest_index = latest_transcript['year'] * 4 + (latest_transcript['quarter'] - 1)

    transcripts = []
    for i in range(num_quarters):
        year, quarter_zero_based = divmod(latest_index - i, 4)
        quarter = quarter_zero_based + 1

        _, records = fetch({"year": year, "quarter": quarter})
        if records:
            transcripts.append(records[0])
        else:
            print(f"Transcript not found for {year} Q{quarter}")

    return transcripts

In [6]:
# Fetch the latest AAPL earnings-call transcripts
transcripts = get_transcripts(symbol='AAPL')
# get_transcripts returns None when the lookup fails and may return an empty list;
# stop here with a clear message instead of failing later with an opaque TypeError.
assert transcripts, "No transcripts retrieved for AAPL - check FMP_API_KEY and network access"

In [7]:
# Keep only the text of each transcript record
transcript_contents = [record["content"] for record in transcripts]

In [8]:
# List of strings: one full earnings-call transcript per retrieved quarter.
# NOTE(review): this displays every transcript in full; consider previewing a slice instead.
transcript_contents

["Suhasini Chandramouli: Good afternoon and welcome to the Apple Q3 Fiscal Year 2024 Earnings Conference Call. My name is Suhasini Chandramouli, Director of Investor Relations. Today's call is being recorded. Speaking first today is Apple’s CEO, Tim Cook, and he'll be followed by CFO, Luca Maestri. After that, we'll open the call to questions from analysts. Please note that some of the information you'll hear during our discussion today will consist of forward-looking statements, including, without limitation, those regarding revenue, gross margin, operating expenses, other income and expense, taxes, capital allocation, and future business outlook including the potential impact of macroeconomic conditions on the company's business and results of operations. These statements involve risks and uncertainties that may cause actual results or trends to differ materially from our forecast. For more information, please refer to the risk factors discussed in Apple's most recently filed Annual 

In [9]:
# Character count of each transcript, in the same order as transcript_contents
characters_per_quarter = list(map(len, transcript_contents))
characters_per_quarter

[46244, 47006, 44898, 49348, 46817, 46470, 45583, 47018]

In [10]:
# Report the real number of retrieved quarters: missing transcripts are skipped upstream,
# so the count is not guaranteed to be 8.
print(f'The total number of character in {len(characters_per_quarter)} quarters of audio transcript: {sum(characters_per_quarter)}')

The total number of character in 8 quarters of audio transcript: 373384


In [11]:
# Recursive character splitter used to cut the transcripts into chunks.
CHUNK_SIZE = 700     # upper bound on chunk length, measured with len (characters)
CHUNK_OVERLAP = 100  # characters shared between consecutive chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    # Same as the library's default separator list; spelled out for readability.
    separators=["\n\n", "\n", " ", ""],
    # Separators are matched as literal text, not as regular expressions.
    is_separator_regex=False,
    # Chunk length is counted in characters via the built-in len.
    length_function=len,
)


In [122]:
# Split every transcript into chunks and wrap each chunk in a LangChain document.

# Each item in docs holds one chunk; its text is available as .page_content
docs = text_splitter.create_documents(transcript_contents) 

In [123]:
# Report how many chunks the splitter produced across all transcripts
num_chunks = len(docs)
print(f'The total number of chunks that the document was split into: {num_chunks}')

The total number of chunks that the document was split into: 717


In [104]:
# Inspect the first chunk as a sample of what the splitter produces
docs[0].page_content

"Suhasini Chandramouli: Good afternoon and welcome to the Apple Q3 Fiscal Year 2024 Earnings Conference Call. My name is Suhasini Chandramouli, Director of Investor Relations. Today's call is being recorded. Speaking first today is Apple’s CEO, Tim Cook, and he'll be followed by CFO, Luca Maestri. After that, we'll open the call to questions from analysts. Please note that some of the information you'll hear during our discussion today will consist of forward-looking statements, including, without limitation, those regarding revenue, gross margin, operating expenses, other income and expense, taxes, capital allocation, and future business outlook including the potential impact of"

In [124]:
# Length in characters of every chunk, in document order
chunk_sizes = [len(chunk.page_content) for chunk in docs]
chunk_sizes

[687,
 694,
 693,
 687,
 699,
 695,
 698,
 696,
 690,
 694,
 698,
 699,
 698,
 696,
 696,
 697,
 693,
 570,
 697,
 698,
 697,
 698,
 698,
 693,
 698,
 699,
 697,
 697,
 695,
 697,
 165,
 261,
 555,
 666,
 550,
 696,
 693,
 428,
 651,
 409,
 315,
 697,
 694,
 138,
 668,
 437,
 490,
 607,
 463,
 441,
 699,
 697,
 691,
 142,
 170,
 674,
 527,
 608,
 698,
 694,
 633,
 580,
 331,
 484,
 556,
 686,
 693,
 186,
 696,
 128,
 699,
 460,
 201,
 687,
 509,
 645,
 208,
 680,
 393,
 469,
 689,
 668,
 646,
 352,
 699,
 277,
 639,
 96,
 688,
 696,
 698,
 698,
 695,
 698,
 696,
 698,
 698,
 693,
 692,
 699,
 696,
 692,
 698,
 696,
 574,
 697,
 692,
 696,
 695,
 696,
 698,
 698,
 699,
 699,
 698,
 698,
 698,
 695,
 321,
 687,
 693,
 214,
 392,
 695,
 274,
 675,
 322,
 452,
 658,
 676,
 297,
 578,
 697,
 260,
 657,
 697,
 254,
 695,
 241,
 458,
 511,
 490,
 448,
 686,
 322,
 237,
 655,
 699,
 177,
 587,
 698,
 345,
 192,
 695,
 430,
 588,
 601,
 647,
 693,
 375,
 505,
 698,
 432,
 211,
 633,
 464,
 464,

In the following cells we keep **chunk_size=700** fixed while varying the chunk overlap (0, 100, and 50 characters).

In [112]:
# overlap = 0
# Re-split inside this cell with the overlap under test, so the statistics do not depend on
# which overlap `text_splitter` happened to use when the shared `chunk_sizes` was last computed.
splitter_overlap_0 = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=700,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)
chunk_sizes_overlap_0 = [len(doc.page_content) for doc in splitter_overlap_0.create_documents(transcript_contents)]
print(f'Average chunk size: {sum(chunk_sizes_overlap_0)//len(chunk_sizes_overlap_0)}; max chunk size: {max(chunk_sizes_overlap_0)}; min chunk size: {min(chunk_sizes_overlap_0)}')

Average chunk size: 552; max chunk size: 699; min chunk size: 10


In [116]:
# overlap = 100
# Re-split inside this cell with the overlap under test, so the statistics do not depend on
# which overlap `text_splitter` happened to use when the shared `chunk_sizes` was last computed.
splitter_overlap_100 = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=700,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
chunk_sizes_overlap_100 = [len(doc.page_content) for doc in splitter_overlap_100.create_documents(transcript_contents)]
print(f'Average chunk size: {sum(chunk_sizes_overlap_100)//len(chunk_sizes_overlap_100)}; max chunk size: {max(chunk_sizes_overlap_100)}; min chunk size: {min(chunk_sizes_overlap_100)}')

Average chunk size: 570; max chunk size: 699; min chunk size: 54


In [120]:
# overlap = 50
# Re-split inside this cell with the overlap under test, so the statistics do not depend on
# which overlap `text_splitter` happened to use when the shared `chunk_sizes` was last computed.
splitter_overlap_50 = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=700,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)
chunk_sizes_overlap_50 = [len(doc.page_content) for doc in splitter_overlap_50.create_documents(transcript_contents)]
print(f'Average chunk size: {sum(chunk_sizes_overlap_50)//len(chunk_sizes_overlap_50)}; max chunk size: {max(chunk_sizes_overlap_50)}; min chunk size: {min(chunk_sizes_overlap_50)}')

Average chunk size: 560; max chunk size: 699; min chunk size: 54


From here on, we keep **chunk_size=700** and **chunk_overlap=100**.

In [157]:
# List the chunks that are shorter than 100 characters
a = [doc.page_content for doc in docs if len(doc.page_content) < 100]
# A bare len(a) in the middle of a cell is discarded (only the last expression is displayed),
# so print the count explicitly.
print(f'{len(a)} chunks are shorter than 100 characters')
a

["Operator: Once again this does conclude today's conference. We do appreciate your participation.",
 "Operator: Once again, this does conclude today's conference. We do appreciate your participation.",
 "Operator: Once again, this does conclude today's conference. We do appreciate your participation.",
 'Operator: Our next question is from David Vogt with UBS.',
 "Operator: Once again, this does conclude today's conference. We do appreciate your participation.",
 'Operator: Our next question is from David Vogt of UBS.',
 'Operator: Our next question is from Samik Chatterjee of JPMorgan.',
 "Operator: Once again, this concludes today's conference. We do appreciate your participation."]

In [158]:
# List the chunks of 100 to 149 characters.
# The lower bound is inclusive so that a chunk of exactly 100 characters is not left out of
# both this listing and the "shorter than 100 characters" one.
a = [doc.page_content for doc in docs if 100 <= len(doc.page_content) < 150]
# A bare len(a) in the middle of a cell is discarded (only the last expression is displayed),
# so print the count explicitly.
print(f'{len(a)} chunks have between 100 and 149 characters')
a

["we feel -- we're well positioned. And as you know well, these are very high levels of gross margin for us and we are pleased where we are.",
 'fiscal year. But in spite of that, we delivered a level of growth that was better than what we were expecting at the beginning of the quarter.',
 'in that sort of outlook particularly as you have Apple Intelligence hopefully stoking the fire for demand going forward? Thanks.',
 "which speak only as of the date they are made. I'd now like to turn the call over to Tim for introductory remarks.",
 'then in the March quarter, what the different puts and takes would be? And then, I have a follow-up please. Thank you.',
 'our product mix has been very strong during the last couple of cycles. So we will continue to push on that front.',
 'Operator: [Operator Instructions] We will go ahead and take our first question from Erik Woodring of Morgan Stanley.',
 'the monetization that you have need to do to get back to those growth levels. and I have a fol

From the above examples, we can see that chunks shorter than 100 characters provide no meaningful context for answering questions related to our application, while chunks of 100 to 150 characters provide at least some meaningful context. ***So for our application, we will remove chunks shorter than 100 characters.***