In [None]:
## after getting one transcription text object from whisper (concatenated chunks) 
## use this notebook to develop further processing, like:
## - calculate tokens for text
## - cost of embedding 
## - embed in chunks 
## - schema to save file _ embedding in mongo 
## - derive "timebar insight" on the call - [INTRO, MANAGEMENT NOTE, ANALYST QA, ] 
## - derive "highlights" to summarise key takeaways based on intro and questions asked & answered 
## - in both the cases seek user to relevant time period upon choosing highlight/timestamp


In [1]:
## open raw transcription i
from dirs import *
import json 
filename = 'conc_earning_call_morepen.json'
path = Path(TS_DIR,filename)
with open(path,'r') as fr:
    tx = json.load(fr)

print(tx)

{'text': "Ladies and gentlemen, good day and welcome to Morepen Laboratories Limited Q4 FY24 conference call, hosted by Motilal Oswal Financial Services Limited. As a reminder, all participant lines will be in the listen-only mode, and there will be an opportunity for you to ask questions after the presentation concludes. Should you need assistance during the conference call, please signal an operator by pressing star then zero on your touch-tone phone. Please note that this conference is being recorded. I now hand the conference over to Mr. Tushar Manojani from Motilal Oswal Financial Services Limited. Thank you and over to you, Mr. Manojani. Thanks, Renju. Good afternoon and I welcome you all for the fourth quarter FY24 earnings call of Morepen Labs, hosted by Motilal Oswal Financial Services. From the management side, we have Mr. Sushil Suri, Chairman and Managing Director. Mr. Ajay Kumar Sharma, Chief Financial Officer. Mr. Nishant Doshi, Vice President, Corporate Finance and Inves

In [5]:
def count_words(text):
    words = text.split()
    return len(words)
count_words(tx['text'])

11954

In [6]:
def split_document(doc, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(doc):
        end = start + chunk_size
        chunks.append(doc[start:end])
        start = end - overlap  # Move back by 'overlap' characters for the next chunk
    return chunks

def split_document_in_words(doc, chunk_size=1000, overlap=200):
    words = doc.split()
    chunks = []
    current_chunk = []
    current_length = 0
    
    for word in words:
        if current_length + len(word) + 1 > chunk_size:  # +1 for the space
            chunks.append(' '.join(current_chunk))
            current_chunk = current_chunk[-overlap:]  # Keep overlap number of words
            current_length = sum(len(w) + 1 for w in current_chunk)  # Recalculate length with spaces
            
        current_chunk.append(word)
        current_length += len(word) + 1  # Add word length and a space
    
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

In [8]:
tx_word_chunks = split_document_in_words(tx['text'],)


In [10]:
tx_word_chunks[1]

'Ladies and gentlemen, good day and welcome to Morepen Laboratories Limited Q4 FY24 conference call, hosted by Motilal Oswal Financial Services Limited. As a reminder, all participant lines will be in the listen-only mode, and there will be an opportunity for you to ask questions after the presentation concludes. Should you need assistance during the conference call, please signal an operator by pressing star then zero on your touch-tone phone. Please note that this conference is being recorded. I now hand the conference over to Mr. Tushar Manojani from Motilal Oswal Financial Services Limited. Thank you and over to you, Mr. Manojani. Thanks, Renju. Good afternoon and I welcome you all for the fourth quarter FY24 earnings call of Morepen Labs, hosted by Motilal Oswal Financial Services. From the management side, we have Mr. Sushil Suri, Chairman and Managing Director. Mr. Ajay Kumar Sharma, Chief Financial Officer. Mr. Nishant Doshi, Vice President, Corporate Finance and Investor Relat

In [11]:
tx_chunks = split_document(tx['text'],)

In [15]:
tx_chunks[2]

"terday. I know that I'm sitting in front of the computer, so I have access on the numbers, but please forgive me if I'm talking too many numbers. But quarter as a whole, it was an outstanding performance for the quarter, but year as a whole also the company has done pretty well. Since it's an annual result, so in the annual thing, we discuss the company as a whole also, but on the quarterly basis, these are the pure simple numbers. Just to appraise the participants who are joining with these type of calls for the first time, we have primarily two type of business segments. One is medical devices and one is the bulk drugs or APIs, which are modernly called. So 80% of the revenue comes from medical devices and APIs. And API, of course, is our traditional business, our old business, which is our cash cow. And the company started from the API only. Medical devices is relatively new. When you say relatively new, it's still almost 20 years old now. But medical devices is the fastest growing

In [16]:
len(tx_chunks)

84

In [18]:
import tiktoken

def count_tokens(text, model="gpt-3.5-turbo"):
    # Load the appropriate encoding for the specified model
    encoding = tiktoken.encoding_for_model(model)
    
    # Encode the text using the model's tokenizer
    tokens = encoding.encode(text)
    
    # Return the number of tokens
    return len(tokens)

In [20]:
def split_doc_for_task(doc):
    return split_document(doc,chunk_size=2500,overlap=350)

In [21]:
chunks = split_doc_for_task(tx['text'])
len(chunks)

32

In [22]:
total_tokens = 0
for i, chunk in enumerate(chunks):
    chunk_tokens = count_tokens(chunk)
    total_tokens += chunk_tokens
    print(f"Chunk {i+1} token count: {chunk_tokens}")

print(f"Total number of tokens: {total_tokens}")

Chunk 1 token count: 555
Chunk 2 token count: 556
Chunk 3 token count: 572
Chunk 4 token count: 583
Chunk 5 token count: 550
Chunk 6 token count: 585
Chunk 7 token count: 566
Chunk 8 token count: 545
Chunk 9 token count: 548
Chunk 10 token count: 560
Chunk 11 token count: 586
Chunk 12 token count: 610
Chunk 13 token count: 614
Chunk 14 token count: 579
Chunk 15 token count: 596
Chunk 16 token count: 631
Chunk 17 token count: 558
Chunk 18 token count: 618
Chunk 19 token count: 586
Chunk 20 token count: 552
Chunk 21 token count: 561
Chunk 22 token count: 578
Chunk 23 token count: 609
Chunk 24 token count: 569
Chunk 25 token count: 583
Chunk 26 token count: 582
Chunk 27 token count: 561
Chunk 28 token count: 569
Chunk 29 token count: 586
Chunk 30 token count: 626
Chunk 31 token count: 566
Chunk 32 token count: 52
Total number of tokens: 17992


In [26]:
from constants import DEFAULT_TEMPERATURE, DEFAULT_TOP_P
from utils import get_openai_client


client = get_openai_client() 

response = client.chat.completions.create(
                                            model="gpt-3.5-turbo",
                                            messages=[{
                                                         "role": "system", "content": "You are a helpful assistant.",
                                                    },
                                                     {"role": "user", "content": f"Identify the section and summarize the following text:\n\n{chunk}\n\nSections can be: [INTRO, MANAGEMENT NOTE, ANALYST QA, CONCLUSION]."
                                                    }],
                                            temperature=DEFAULT_TEMPERATURE,
                                            max_tokens=150,
                                            top_p=DEFAULT_TOP_P
                                        )


In [61]:

def identify_section_and_summarize(chunk, model="gpt-3.5-turbo"):
    # Call OpenAI API to identify the section and get the summary
    client = get_openai_client() 

    response = client.chat.completions.create(
                                                model=model,
                                                messages=[{
                                                            "role": "system", "content": "You are a helpful assistant to summarise an EARNINGS CONFERENCE CALL. The transcript of the call will be provided in chunks. Identify sections out of the Sections : [INTRO, MANAGEMENT NOTE, ANALYST QA, CONCLUSION]. Assign a tag to each section only out of these. Also summarise the chunk. Return the data in form of JSON `{section : '', summary: '' }`.  I repeat -- FOLLOW THE FORMAT OF JSON AND CHOOSE SECTION FROM PROVIDED SECTIONS ONLY. KEEP THE SUMMARY DETAILED AND AROUNDO 300 words.",
                                                        },
                                                        {"role": "user", "content": f"Identify the section and summarize the following text:\n\n{chunk}\n\n"
                                                        }],
                                                temperature=DEFAULT_TEMPERATURE,
                                                max_tokens=300,
                                                top_p=DEFAULT_TOP_P
                                            )
    print(type(response))
    print("r", response)
    # Extract and return the identified section and summary from the response
    response_content = response.choices[0].message.content
    print(response_content)
    print(response_content)
    # Parse the response to extract section and summary
    return response_content

def process_document_chunks(chunks):

    doc_meta = { }
    print("running gpt on the chunks, len :", len(chunks))
    for i in range(len(chunks)):
        current_chunk = chunks[i]
        res = identify_section_and_summarize(current_chunk) 
        try:
            doc_t = res.removeprefix('```json') # try catch
            ddict =json.loads( doc_t.removesuffix('```'))
        except Exception as e:
            print(e)
            print(":::",res) 
            ddict = {'section':None, 'summary':None}

        chunk_meta =  {'chunk_text':current_chunk ,
                            'section':ddict['section'],
                            'summary':ddict['summary']
                          }
        doc_meta[i]=chunk_meta
    return doc_meta

doc_summary_dict = process_document_chunks(chunks)
doc_summary_dict

running gpt on the chunks, len : 32
<class 'openai.types.chat.chat_completion.ChatCompletion'>
r ChatCompletion(id='chatcmpl-9ZdfbyHxIuXLavGgwCxwuSs9uDLWi', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n\t"section": "INTRO",\n\t"summary": "The introduction section of the earnings conference call begins with the host welcoming the participants to Morepen Laboratories Limited Q4 FY24 conference call. The host reminds that all lines will be in listen-only mode, and questions can be asked after the presentation. The management team includes Mr. Sushil Suri, Chairman and Managing Director, Mr. Ajay Kumar Sharma, Chief Financial Officer, Mr. Nishant Doshi, Vice President of Corporate Finance and Investor Relations, and Mr. Vipul Kumar Srivastava, Company Secretary and Compliance Officer. Mr. Suri starts with opening remarks, highlighting the outstanding performance of the company in the quarter and the year as a whole. He mentions the

{0: {'chunk_text': "Ladies and gentlemen, good day and welcome to Morepen Laboratories Limited Q4 FY24 conference call, hosted by Motilal Oswal Financial Services Limited. As a reminder, all participant lines will be in the listen-only mode, and there will be an opportunity for you to ask questions after the presentation concludes. Should you need assistance during the conference call, please signal an operator by pressing star then zero on your touch-tone phone. Please note that this conference is being recorded. I now hand the conference over to Mr. Tushar Manojani from Motilal Oswal Financial Services Limited. Thank you and over to you, Mr. Manojani. Thanks, Renju. Good afternoon and I welcome you all for the fourth quarter FY24 earnings call of Morepen Labs, hosted by Motilal Oswal Financial Services. From the management side, we have Mr. Sushil Suri, Chairman and Managing Director. Mr. Ajay Kumar Sharma, Chief Financial Officer. Mr. Nishant Doshi, Vice President, Corporate Finance

In [56]:
doc_summary_dict

{0: {'chunk_text': 'Ladies and gentlemen, good day and welcome to Morepen Laboratories Limited Q4 FY24 conference call, hosted by Motilal Oswal Financial Services Limited. As a reminder, all participant lines will be in the listen-only mode, and there will be an opportunity for you to ask questions after the presentation concludes. Should you need assistance during the conference call, please signal an operator by pressing star then zero on your touch-tone phone. Please note that this conference is being recorded. I now hand the conference over to Mr. Tushar Manojani from Motilal Oswal Financial Services Limited. Thank you and over to you, Mr. Manojani. Thanks, Renju. Good afternoon and I welcome you all for the fourth quarter FY24 earnings call of Morepen Labs, hosted by Motilal Oswal Financial Services. From the management side, we have Mr. Sushil Suri, Chairman and Managing Director. Mr. Ajay Kumar Sharma, Chief Financial Officer. Mr. Nishant Doshi, Vice President, Corporate Finance

In [62]:
# from dirs import SUMMARY_DIR
with open(Path(TS_DIR,'morepen_summary_32.json'), 'w') as fw:
    json.dump(doc_summary_dict,fw)

In [60]:
x = "Ladies and gentlemen, good day and welcome to Morepen Laboratories Limited Q4 FY24 conference call, hosted by Motilal Oswal Financial Services Limited. As a reminder, all participant lines will be in the listen-only mode, and there will be an opportunity for you to ask questions after the presentation concludes. Should you need assistance during the conference call, please signal an operator by pressing star then zero on your touch-tone phone. Please note that this conference is being recorded. I now hand the conference over to Mr. Tushar Manojani from Motilal Oswal Financial Services Limited. Thank you and over to you, Mr. Manojani. Thanks, Renju. Good afternoon and I welcome you all for the fourth quarter FY24 earnings call of Morepen Labs, hosted by Motilal Oswal Financial Services. From the management side, we have Mr. Sushil Suri, Chairman and Managing Director. Mr. Ajay Kumar Sharma, Chief Financial Officer. Mr. Nishant Doshi, Vice President, Corporate Finance and Investor Relati"
len(x)


1000