In [1]:
import os
import sys
import json
import yaml
import glob
import logging
import pandas as pd
from pathlib import Path


In [2]:
logger = logging.getLogger()
logging.basicConfig(format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s', level=logging.INFO, stream=sys.stderr)


In [3]:
# global constants
CONFIG_FILE_PATH = "config.yaml"


In [4]:
# read the config yaml file
fpath = CONFIG_FILE_PATH
with open(fpath, 'r') as yaml_in:
    config = yaml.safe_load(yaml_in)
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2)}")


2024-01-04 18:08:31,533,2625127137,MainProcess,INFO,config read from config.yaml -> {
  "app_name": "contact-center-transcript-summarization",
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role": "Admin"
  },
  "dir": {
    "data": "data",
    "raw": "data/raw",
    "golden": "data/raw/golden",
    "prompts": "data/prompts",
    "models": "data/models",
    "metrics": "data/metrics",
    "completions": "data/completions"
  },
  "data": {
    "raw_data_file": "data.csv",
    "golden_transcript": "data/raw/golden/transcript.txt",
    "golden_transcript_summary": "data/raw/golden/summary.txt"
  },
  "prompt": {
    "very_large_prompt": {
      "sleep_time": 180,
      "threshold": 70000
    },
    "normal_prompt": {
      "sleep_time": 60
    }
  },
  "max_retries": 3,
  "desired_word_count_for_summary": 80,
  "experiments": [
    {
      "name": "single-line-reason",
      "prompt_template": null,
      "reps": 3,
      "model_list": [
        {
          "model": "anthr

In [5]:
file_list = glob.glob(os.path.join(config['dir']['raw'], "*.txt"))
file_list = [f for f in file_list if 'labels.txt' not in f]
# exclude labels.txt

logger.info(f"there are {len(file_list)} files to read {file_list}")


2024-01-04 18:08:31,540,3397252753,MainProcess,INFO,there are 5 files to read ['data/raw/call_center_transcript_1.txt', 'data/raw/call_center_transcript_0.txt', 'data/raw/call_center_transcript_2.txt', 'data/raw/call_center_transcript_3.txt', 'data/raw/call_center_transcript_4.txt']


In [6]:
# read the labels.txt which has the golden summary
fpath = os.path.join(os.path.join(config['dir']['raw'], "labels.txt"))
with open(fpath) as f:
    labels = f.readlines()

summary = dict()
for l in labels:
    if l == '\n':
        continue
    tokens = l.split('|')
    #summary.append(dict(fname=f"{tokens[0].strip()}.txt", problems=tokens[1]))
    k = f"{tokens[0].strip()}.txt"
    v = tokens[1].strip()
    summary[k] = v
summary                     


{'call_center_transcript_0.txt': '<output> Here are the action items I gathered for each person: A: - Document any other user entry concerns to provide to B B: - Work with James from another team to simplify additional forms and sign up workflow C:  - Work on landing page to make product more discoverable </output>',
 'call_center_transcript_1.txt': 'Here are the action items I gathered for each person from the conversation: <output> A: - Set up a follow-up meeting to brainstorm ideas for where generative AI could be applicable in our products - Outline high-level ideas for where generative AI could drive automation or enhance user experience in our products B: - Research current generative AI initiatives at other tech companies to analyze the competitive landscape C: - Develop a validation framework to rigorously assess accuracy, ethics and safety of any generative AI applications we develop </output>',
 'call_center_transcript_2.txt': 'Here are the action items I gathered for each pe

In [7]:
for i, f in enumerate(file_list):
    logger.info(f"file number={i+1}, file={f}")
    dir_path = os.path.join(config['dir']['raw'], str(i))
    os.makedirs(dir_path, exist_ok=True)
    fname = os.path.basename(f)
    fpath = os.path.join(dir_path, f"{fname.replace('.txt', '')}_transcript.txt")
    Path(fpath).write_text(Path(f).read_text())

    # find the golden summary corresponding to this file
    golden_summary = summary.get(fname)
    if golden_summary is None:
        logger.error(f"no summary found for {fname}")
    else:
        fpath = os.path.join(dir_path, f"{fname.replace('.txt', '')}_golden_summary.txt")
        Path(fpath).write_text(golden_summary)



2024-01-04 18:08:31,552,3809902715,MainProcess,INFO,file number=1, file=data/raw/call_center_transcript_1.txt
2024-01-04 18:08:31,554,3809902715,MainProcess,INFO,file number=2, file=data/raw/call_center_transcript_0.txt
2024-01-04 18:08:31,556,3809902715,MainProcess,INFO,file number=3, file=data/raw/call_center_transcript_2.txt
2024-01-04 18:08:31,558,3809902715,MainProcess,INFO,file number=4, file=data/raw/call_center_transcript_3.txt
2024-01-04 18:08:31,561,3809902715,MainProcess,INFO,file number=5, file=data/raw/call_center_transcript_4.txt
