### Run Inference: Generate titles for chapterized components from the processed data
---

In this notebook:

1. We extract the processed data, including the messages from speakers in the transcripts

2. We load that data into our Bedrock models: ***Titan, Claude, Llama***

3. We generate a title for each chapter of the respective meetings

In [None]:
# import libraries
import copy
import json
import logging
import os
import time
from pathlib import Path
from typing import Dict, List

import boto3
import pandas as pd
import ray
import yaml
from litellm import completion ## support for text generation models on bedrock

#### Set a logger 

In [None]:
# configure logging for the notebook at INFO level; each record shows the timestamp, process id,
# source file and line number, level name and message
logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
# logger used by the cells and functions below
logger = logging.getLogger(__name__)

In [None]:
# if ray is already initialized (for example when this cell is re-run), shut it down first
# so that ray.init() below always starts from a clean state
if ray.is_initialized():
    ray.shutdown()
ray.init()

#### Load the config file: Contains model information, data directory information

In [None]:
## load the config file
# global constants
# path of the YAML config file (model information, data directory information)
CONFIG_FILE_PATH: str = "config.yml"

In [None]:
# read the config yaml file
fpath = CONFIG_FILE_PATH
# be explicit about the encoding instead of relying on the platform default
with open(fpath, 'r', encoding='utf-8') as yaml_in:
    config = yaml.safe_load(yaml_in)
# an empty file (safe_load returns None) or a non-mapping document would otherwise only fail
# later, on the first config[...] lookup, with a much less helpful error
if not isinstance(config, dict):
    raise ValueError(f"expected a mapping at the top level of {fpath}, got {type(config).__name__}")
# default=str keeps the log line from raising on YAML values that json cannot serialize (e.g. dates)
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2, default=str)}")

#### Functions that asynchronously generate chapter titles using the models specified in the [`config file`](config.yml)

In [None]:
# function to encapsulate call to any Bedrock model for text generation
def generate_chapter_title(model_id: str, prompt: str) -> Dict:
    """
    Invoke a Bedrock text generation model through litellm and return the generated chapter title.

    Args:
        model_id: Bedrock model id; it is prefixed with "bedrock/" to build the litellm model name.
        prompt: Fully rendered prompt, sent to the model as a single user message.

    Returns:
        A dict with the keys initialized below. `completion` holds the cleaned model output,
        `prompt_token_count` / `completion_token_count` hold the usage reported in the response and
        `time_taken_in_seconds` holds the latency when the response exposes it. `file_name`,
        `chapter_id`, `chapter_text` and the two price fields are returned as None so the caller
        can fill them in.

    Errors raised while invoking the model or reading its response are not propagated: they are
    logged and stored under the `exception` key of the returned dict.
    """
    # represents the service name
    service_name: str = "bedrock"
    # represents creating the bedrock model to invoke the litellm api for response for titan, llama and claude
    bedrock_model: str = f"{service_name}/{model_id}"
    logger.info(f"model_id={model_id}, prompt length is {len(prompt)} characters, {len(prompt.split())} words")

    # initialize the response dict
    ret = dict(exception = None,
               prompt = prompt,
               completion = None,
               file_name = None,
               chapter_id = None,
               model_id = model_id,
               time_taken_in_seconds = None,
               # initializing to 0 since none type throws an error later, this is used to calculate price per token input/output on ODT pricing
               completion_token_count = 0,
               # initializing to 0 since none type throws an error later
               prompt_token_count = 0,
               input_token_price = None,
               output_token_pricing = None,
               chapter_text = None)

    # set the env var for aws_region; region_name is None when no default region is configured,
    # and assigning None to os.environ raises TypeError, so only set it when a region is available
    aws_region = boto3.Session().region_name
    if aws_region:
        os.environ["AWS_REGION_NAME"] = aws_region
    else:
        logger.warning("no default AWS region found in the boto3 session, leaving AWS_REGION_NAME unchanged")

    # get the inference parameters from the config file (an empty/missing section falls back to the defaults)
    parameters = config.get('inference_parameters_for_title_generation') or {}
    temperature = parameters.get('temperature', 0.1)
    caching = parameters.get('caching', False)

    # response clean-up settings do not change between choices, so read them once up front
    prefixes_to_remove = config.get('response_prefix_to_remove') or []
    if isinstance(prefixes_to_remove, str):
        # a single prefix given as a plain string would otherwise be iterated character by character
        prefixes_to_remove = [prefixes_to_remove]
    response_suffix_to_clip = config.get('response_suffix_to_clip')

    try:
        logger.info(f"Invoking {bedrock_model}......")
        response = completion(model=bedrock_model,
                              messages=[{ "content": prompt, "role": "user"}],
                              temperature=temperature,
                              caching=caching)
        # iterate through the entire model response; if several choices carry content the last one wins
        for idx, choice in enumerate(response.choices):
            # extract the message and the message's content from litellm
            if choice.message and choice.message.content:
                cleaned = choice.message.content.strip()
                # NOTE: str.replace drops every occurrence of the configured text, not only a leading one
                for p in prefixes_to_remove:
                    cleaned = cleaned.replace(p, "")
                if response_suffix_to_clip:
                    # keep only the text before the first occurrence of the suffix marker
                    cleaned = cleaned.split(response_suffix_to_clip)[0].strip()
                ret["completion"] = cleaned
                logger.info(f"idx={idx}, choice.message.content={choice.message.content}")
        # Extract number of input and completion prompt tokens (this is the same structure for embeddings and text generation models on Amazon Bedrock)
        ret['prompt_token_count'] = response.usage.prompt_tokens
        ret['completion_token_count'] = response.usage.completion_tokens
        # Extract latency in seconds; `_response_ms` is a private attribute, so a response without it
        # should not turn an otherwise successful completion into a recorded exception
        latency_ms = getattr(response, "_response_ms", None)
        if latency_ms is not None:
            ret['time_taken_in_seconds'] = latency_ms / 1000
    except Exception as e:
        logger.error(f"Exception occurred during invoking {model_id}, exception={e}")
        ret['exception'] = e
    return ret


#### Running invocations for all bedrock models to generate chapter titles for meetings

In [None]:
## Path of the processed, chapterized csv file whose rows are used to generate chapter titles;
## both the directory and the file name come from the `dir` section of the config
processed_data_chapters_fpath = os.path.join(config['dir']['processed'], config['dir']['chapterized_file'])
logger.info(f"going to read data from {processed_data_chapters_fpath}")
# we want to run through all the rows of the processed dataframe for each model
# that we want to evaluate, so the rows are loaded once here and reused for every model
df = pd.read_csv(processed_data_chapters_fpath)
# report how many distinct values the `file_name` column holds
logger.info(f"there are {len(df.file_name.unique())} {config['dir']['file_type_to_process']} files to process")
# preview the first rows of the loaded data
df.head()

In [None]:
def get_inference(i: int, row: Dict, total: int, model_info: Dict) -> Dict:
    """
    Generate a title for one chapter with one model and persist the result as JSON.

    Args:
        i: Position of this row, used only in the progress log line.
        row: One record of the chapterized data; `text`, `file_name` and `chapter_id` are read,
            and `title` is copied to `original_title` when it is present and not None.
        total: Number of rows the progress log line counts against.
        model_info: Model entry from the config; `model`, `prompt_template`,
            `input_tokens_pricing` and `output_tokens_pricing` are read.

    Returns:
        The response dict from `generate_chapter_title`, enriched with the chapter metadata,
        integer token counts and the input/output token prices. The same dict is written to
        <completions dir>/<file_name>/<model id with ":" replaced by "-">/chapter_<chapter_id>.json.
    """
    logger.info(f"row {i}/{total}, prompt_template={model_info['prompt_template']}, model_id={model_info['model']}")
    model_id = model_info['model']

    # render the prompt: the template's single positional placeholder receives the chapter text
    template_path = os.path.join(config['dir']['prompts'], model_info['prompt_template'])
    with open(template_path) as template_file:
        template_text = template_file.read()
    prompt = template_text.format(row['text'])

    # ask the model for a title for this chapter
    resp = generate_chapter_title(model_id, prompt)

    # attach the chapter metadata so each saved response is self-describing
    resp.update(chapter_text=row['text'], file_name=row['file_name'], chapter_id=row['chapter_id'])
    original_title = row.get('title')
    if original_title is not None:
        resp['original_title'] = original_title

    # token counts are normalized to int before they are used for pricing
    for count_key in ('prompt_token_count', 'completion_token_count'):
        resp[count_key] = int(resp[count_key])

    # prices in the config are per 1000 tokens
    resp['input_token_price'] = (resp['prompt_token_count']/1000) * model_info['input_tokens_pricing']
    logger.info(f"The price for {resp['prompt_token_count']} tokens for filename={row['file_name']} chapter={row['chapter_id']} is {resp['input_token_price']}")
    resp['output_token_pricing'] = (resp['completion_token_count']/1000) * model_info['output_tokens_pricing']
    logger.info(f"The price for {resp['completion_token_count']} tokens for filename={row['file_name']} chapter={row['chapter_id']} is {resp['output_token_pricing']}")

    # one directory per source file and model, one json file per chapter
    out_dir = os.path.join(config['dir']['completions'], row['file_name'], model_id.replace(":", "-"))
    os.makedirs(out_dir, exist_ok=True)
    fpath = os.path.join(out_dir, f"chapter_{row['chapter_id']}.json")
    logger.info(f"writing response={resp} to {fpath}")
    # serialize before opening the file so a serialization error cannot leave an empty file behind;
    # default=str covers values json cannot encode, such as a stored exception object
    serialized = json.dumps(resp, default=str, indent=2)
    with open(fpath, 'w') as out_file:
        out_file.write(serialized)
    return resp

@ray.remote
def async_get_inference(i: int, row: Dict, total: int, model_info: Dict) -> Dict:
    """
    Ray task wrapper around `get_inference` so that rows can be processed in parallel.

    Takes the same arguments as `get_inference` and returns its result unchanged; invoke it with
    `async_get_inference.remote(...)` and resolve the returned reference with `ray.get`.
    """
    # configure logging in the process that executes the task, using the same format as the notebook
    # (presumably needed because ray runs tasks in worker processes that have not run the
    # notebook's logging setup - confirm against the ray deployment in use)
    logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
    return get_inference(i, row, total, model_info)
                

In [None]:
# convert the dataframe into a list of plain dicts (one per row) that can be handed to the ray tasks
df_as_json = json.loads(df.to_json(orient='records'))
# number of rows sent to ray in parallel per batch
n: int = config['parallel_inference_count']
if not isinstance(n, int) or n < 1:
    # fail with a clear message instead of a ZeroDivisionError / TypeError while building the batches
    raise ValueError(f"parallel_inference_count must be a positive integer, got {n!r}")
# the batches are the same for every model, so build them once: consecutive slices of at most n rows
list_of_lists = [df_as_json[start:start + n] for start in range(0, len(df_as_json), n)]
logger.info(f"split input list of size {len(df_as_json)} into {len(list_of_lists)} lists")
# collects the response dict of every (model, row) pair, in submission order
resp_list: List = []
for experiment in config['experiments']:
    exp_name = experiment['name']
    model_list = experiment['model_list']
    logger.info(f"experiment={exp_name}, number of models={len(model_list)}")
    for model_info in model_list:
        st = time.perf_counter()
        logger.info(f"------ running inference for {model_info['model']} -----")
        for idx, batch in enumerate(list_of_lists):
            logger.info(f"getting inference for list {idx+1}/{len(list_of_lists)}, size of list={len(batch)} ")
            # submit the whole batch, then block until every task in it has finished
            resp_list.extend(ray.get([async_get_inference.remote(i+1, e, len(batch), model_info) for i, e in enumerate(batch)]))
        elapsed_time = time.perf_counter() - st
        logger.info(f"------ model={model_info['model']} completed in {elapsed_time} ------ ")
        # pause after every model, as the config key name says, including between
        # models that belong to the same experiment
        time.sleep(config['sleep_in_seconds_between_models'])
