In [1]:
import os
import sys
import json
import glob
import yaml
import logging
import pandas as pd
from pathlib import Path

from botocore.config import Config



In [2]:
!pip install boto3 --force-reinstall
# !pip install botocore --force-reinstall

[0mCollecting boto3
  Using cached boto3-1.34.13-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.35.0,>=1.34.13 (from boto3)
  Using cached botocore-1.34.13-py3-none-any.whl.metadata (5.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Using cached s3transfer-0.10.0-py3-none-any.whl.metadata (1.7 kB)
Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.35.0,>=1.34.13->boto3)
  Using cached python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
Collecting urllib3<2.1,>=1.25.4 (from botocore<1.35.0,>=1.34.13->boto3)
  Using cached urllib3-2.0.7-py3-none-any.whl.metadata (6.6 kB)
Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.35.0,>=1.34.13->boto3)
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Using cached boto3-1.34.13-py3-none-any.whl (139 kB)
Using cached botocore-1.34.13-py3-none-any.whl (11.9 MB)
Using cached s3transfer-0.10.0-py3-none-a

In [3]:
!pip install --upgrade botocore

[0m

In [4]:
# Configure the root logger (comma-separated fields, INFO level, written to
# stderr) and keep a handle to it for the rest of the notebook.
logging.basicConfig(
    format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s',
    level=logging.INFO,
    stream=sys.stderr,
)
logger = logging.getLogger()


In [5]:
# Notebook-wide constant: location of the YAML configuration file,
# relative to the directory the notebook is run from.
CONFIG_FILE_PATH = 'config.yaml'


In [6]:
# Parse the YAML configuration into `config` and log it as pretty-printed JSON
# so the settings used for this run are visible in the notebook output.
fpath = CONFIG_FILE_PATH
config = yaml.safe_load(Path(fpath).read_text())
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2)}")


2024-01-04 18:12:27,654,2625127137,MainProcess,INFO,config read from config.yaml -> {
  "app_name": "contact-center-transcript-summarization",
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role": "Admin"
  },
  "dir": {
    "data": "data",
    "raw": "data/raw",
    "golden": "data/raw/golden",
    "prompts": "data/prompts",
    "models": "data/models",
    "metrics": "data/metrics",
    "completions": "data/completions"
  },
  "data": {
    "raw_data_file": "data.csv",
    "golden_transcript": "data/raw/golden/transcript.txt",
    "golden_transcript_summary": "data/raw/golden/summary.txt"
  },
  "prompt": {
    "very_large_prompt": {
      "sleep_time": 180,
      "threshold": 70000
    },
    "normal_prompt": {
      "sleep_time": 60
    }
  },
  "max_retries": 3,
  "desired_word_count_for_summary": 80,
  "experiments": [
    {
      "name": "single-line-reason",
      "prompt_template": null,
      "reps": 3,
      "model_list": [
        {
          "model": "anthr

In [7]:
# Collect the metrics JSON files that sit two directory levels below the
# metrics directory. The glob is not run with recursive=True, so the "**"
# component matches exactly one directory level, just like "*".
fpath = os.path.join(config['dir']['metrics'], "**", "*", "*.json")
file_list = list(glob.iglob(fpath))
logger.info(f"there are {len(file_list)} files in {fpath}")


2024-01-04 18:12:27,661,1113939681,MainProcess,INFO,there are 45 files in data/metrics/**/*/*.json


In [8]:
# One record per metrics file: the JSON payload plus a `transcript_id`, which is
# the file name with its last "_"-separated segment removed.
metrics = [
    {
        **json.loads(Path(metrics_file).read_text()),
        "transcript_id": "_".join(os.path.basename(metrics_file).split('_')[:-1]),
    }
    for metrics_file in file_list
]
df = pd.DataFrame(metrics)
logger.info(f"all metrics data is read into a dataframe of shape {df.shape}")


2024-01-04 18:12:27,676,2271422989,MainProcess,INFO,all metrics data is read into a dataframe of shape (45, 11)


In [9]:
# Flag completions whose word count exceeds the configured target length.
df['wc_gt_than_desired'] = df['completion_word_count'].gt(config['desired_word_count_for_summary'])


In [10]:
df.head()


Unnamed: 0,exception,prompt,completion,model_id,time_taken_in_seconds,completion_token_count,prompt_token_count,cost,completion_word_count,experiment,transcript_id,wc_gt_than_desired
0,,You are an AI bot that is good at determining ...,<output>\nHere are the action items I gathered...,amazon.titan-text-express-v1,5.222832,167,793,0.000902,122,single-line-reason,call_center_transcript_2_amazon.titan-text-exp...,True
1,,You are an AI bot that is good at determining ...,<output>\nHere are the action items I gathered...,amazon.titan-text-express-v1,5.17584,167,793,0.000902,122,single-line-reason,call_center_transcript_2_amazon.titan-text-exp...,True
2,,You are an AI bot that is good at determining ...,<output>\nA: \n- Research mobile gaming trends...,cohere.command-text-v14,7.171072,220,788,0.001622,165,single-line-reason,call_center_transcript_2_cohere.command-text-v14,True
3,,Human: Read the conversation between the call ...,<output>\nA: \n- Research recent trends and gr...,anthropic.claude-instant-v1,1.307436,110,775,0.001869,72,single-line-reason,call_center_transcript_2_anthropic.claude-inst...,False
4,,Human: Read the conversation between the call ...,<output>\nA: \n- Research recent trends and gr...,anthropic.claude-instant-v1,1.199874,99,775,0.001809,63,single-line-reason,call_center_transcript_2_anthropic.claude-inst...,False


In [11]:
df['wc_gt_than_desired'].value_counts()


wc_gt_than_desired
True     24
False    21
Name: count, dtype: int64

In [12]:
## make sure the boto3 version is up to date 
import boto3
print(boto3.__version__)

1.34.13


In [13]:
from itertools import combinations
from utils import get_rouge_l_score, get_cosine_similarity
import numpy as np
def compare_results(df):
    """Measure how similar the completions in ``df`` are to one another.

    Every unordered pair of values in ``df.completion`` is scored with
    ``get_rouge_l_score`` and ``get_cosine_similarity`` (both imported from
    ``utils``), and the scores are averaged over all pairs.

    Args:
        df: DataFrame with a ``completion`` column. In this notebook it is one
            group of a groupby on (transcript_id, model_id, experiment).

    Returns:
        Tuple ``(mean ROUGE-L score, mean cosine similarity)``. When ``df`` has
        fewer than two completions there are no pairs to compare and
        ``(nan, nan)`` is returned.
    """
    pairs = list(combinations(df.completion, 2))
    logger.info(f"there are {len(pairs)} pairs in this dataframe of shape {df.shape}")
    if not pairs:
        # np.mean([]) would also yield nan, but only after emitting a
        # "Mean of empty slice" RuntimeWarning; return nan explicitly instead.
        return (np.nan, np.nan)

    rouge_l_f1scores = []
    cosine_similarities = []
    for first, second in pairs:
        rouge_l_f1scores.append(get_rouge_l_score(first, second))
        cosine_similarities.append(get_cosine_similarity(first, second))
    return (np.mean(rouge_l_f1scores), np.mean(cosine_similarities))


# Score the repeated completions of every (transcript, model, experiment) group
# against each other, then unpack the (rouge, cosine) tuple returned by
# compare_results into two separate columns.
df_per_model_completion_similarity = (
    df.groupby(['transcript_id', 'model_id', 'experiment'])
    .apply(compare_results)
    .reset_index()
)
df_per_model_completion_similarity.columns = ['transcript_id', 'model_id', 'experiment', 'similarity_metrics']
df_per_model_completion_similarity['rouge_l_f1_score_within_responses_mean'] = (
    df_per_model_completion_similarity['similarity_metrics'].map(lambda scores: scores[0])
)
df_per_model_completion_similarity['cosine_similarity_within_responses_mean'] = (
    df_per_model_completion_similarity['similarity_metrics'].map(lambda scores: scores[1])
)

df_per_model_completion_similarity


2024-01-04 18:12:28,179,3153552436,MainProcess,INFO,there are 3 pairs in this dataframe of shape (3, 12)
2024-01-04 18:12:28,179,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:28,193,credentials,MainProcess,INFO,Found credentials in shared credentials file: ~/.aws/credentials


Create new client
  Using region: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)


2024-01-04 18:12:28,503,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:28,748,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:28,960,3153552436,MainProcess,INFO,there are 3 pairs in this dataframe of shape (3, 12)
2024-01-04 18:12:28,960,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:29,157,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:29,362,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:29,573,3153552436,MainProcess,INFO,there are 3 pairs in this dataframe of shape (3, 12)
2024-01-04 18:12:29,574,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:29,794,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:30,036,rouge_scorer,MainProcess,INFO,Using default tokenizer.
2024-01-04 18:12:30,318,3153552436,MainProcess,INFO,there are 3 pairs in this dataframe of shape (3, 12)
2024-01-04 18:12:30,319,rouge_scorer,MainProcess,INFO

Unnamed: 0,transcript_id,model_id,experiment,similarity_metrics,rouge_l_f1_score_within_responses_mean,cosine_similarity_within_responses_mean
0,call_center_transcript_0_amazon.titan-text-exp...,amazon.titan-text-express-v1,single-line-reason,"(0.7942, 0.9739638039604354)",0.7942,0.973964
1,call_center_transcript_0_anthropic.claude-inst...,anthropic.claude-instant-v1,single-line-reason,"(0.9286, 0.9978085157422109)",0.9286,0.997809
2,call_center_transcript_0_cohere.command-text-v14,cohere.command-text-v14,single-line-reason,"(0.4117, 0.9544265222292876)",0.4117,0.954427
3,call_center_transcript_1_amazon.titan-text-exp...,amazon.titan-text-express-v1,single-line-reason,"(1.0, 0.9999999999999997)",1.0,1.0
4,call_center_transcript_1_anthropic.claude-inst...,anthropic.claude-instant-v1,single-line-reason,"(1.0, 1.0)",1.0,1.0
5,call_center_transcript_1_cohere.command-text-v14,cohere.command-text-v14,single-line-reason,"(0.40863333333333335, 0.9623230770004018)",0.408633,0.962323
6,call_center_transcript_2_amazon.titan-text-exp...,amazon.titan-text-express-v1,single-line-reason,"(1.0, 0.9999999999999997)",1.0,1.0
7,call_center_transcript_2_anthropic.claude-inst...,anthropic.claude-instant-v1,single-line-reason,"(0.9576666666666668, 0.9954751847106111)",0.957667,0.995475
8,call_center_transcript_2_cohere.command-text-v14,cohere.command-text-v14,single-line-reason,"(0.41923333333333335, 0.9205654164813888)",0.419233,0.920565
9,call_center_transcript_3_amazon.titan-text-exp...,amazon.titan-text-express-v1,single-line-reason,"(0.7423333333333334, 0.9228218774015087)",0.742333,0.922822


In [14]:
df_per_model_completion_similarity.shape


(15, 6)

In [15]:
# Average the per-transcript ROUGE-L means for each (model, experiment) pair.
df_per_model_rouge_l_f1_score = (
    df_per_model_completion_similarity
    .groupby(['model_id', 'experiment'])['rouge_l_f1_score_within_responses_mean']
    .mean()
    .reset_index()
)
df_per_model_rouge_l_f1_score


Unnamed: 0,model_id,experiment,rouge_l_f1_score_within_responses_mean
0,amazon.titan-text-express-v1,single-line-reason,0.81418
1,anthropic.claude-instant-v1,single-line-reason,0.977253
2,cohere.command-text-v14,single-line-reason,0.480867


In [16]:
# Average the per-transcript cosine-similarity means for each (model, experiment) pair.
df_per_model_cosine_similarity = (
    df_per_model_completion_similarity
    .groupby(['model_id', 'experiment'])['cosine_similarity_within_responses_mean']
    .mean()
    .reset_index()
)
df_per_model_cosine_similarity


Unnamed: 0,model_id,experiment,cosine_similarity_within_responses_mean
0,amazon.titan-text-express-v1,single-line-reason,0.963901
1,anthropic.claude-instant-v1,single-line-reason,0.998657
2,cohere.command-text-v14,single-line-reason,0.949246


In [17]:
# Number of completions recorded for each (model, experiment) pair.
df_counts = (
    df[['model_id', 'experiment']]
    .value_counts()
    .reset_index(name='count')
)
df_counts.head()


Unnamed: 0,model_id,experiment,count
0,amazon.titan-text-express-v1,single-line-reason,15
1,anthropic.claude-instant-v1,single-line-reason,15
2,cohere.command-text-v14,single-line-reason,15


In [18]:
# Number of completions per (model, experiment) that exceeded the desired word
# count. Pairs with no such completion do not appear in this frame at all.
df_pct_gt_wc = (
    df[df['wc_gt_than_desired'].eq(True)]
    .value_counts(['model_id', 'experiment'], normalize=False)
    .reset_index(name='count_of_completions_gt_wc')
)
df_pct_gt_wc


Unnamed: 0,model_id,experiment,count_of_completions_gt_wc
0,cohere.command-text-v14,single-line-reason,13
1,amazon.titan-text-express-v1,single-line-reason,8
2,anthropic.claude-instant-v1,single-line-reason,3


In [19]:
# value_counts only reports (model, experiment) pairs that have at least one
# over-length completion. Add explicit zero-count rows for the remaining pairs
# so the later pd.merge (an inner join by default) does not drop them.
counted_pairs = set(zip(df_pct_gt_wc.model_id, df_pct_gt_wc.experiment))
rows_to_add = [
    dict(model_id=model_id, experiment=experiment, count_of_completions_gt_wc=0)
    for model_id, experiment in df[['model_id', 'experiment']].drop_duplicates().itertuples(index=False, name=None)
    if (model_id, experiment) not in counted_pairs
]
# Only concatenate when there is something to add: concatenating an empty frame
# is pointless and, in recent pandas versions, may trigger a FutureWarning
# about empty entries influencing the result dtype.
if rows_to_add:
    df_pct_gt_wc = pd.concat([df_pct_gt_wc, pd.DataFrame(rows_to_add)])
df_pct_gt_wc
    

Unnamed: 0,model_id,experiment,count_of_completions_gt_wc
0,cohere.command-text-v14,single-line-reason,13
1,amazon.titan-text-express-v1,single-line-reason,8
2,anthropic.claude-instant-v1,single-line-reason,3


In [20]:
df.head()


Unnamed: 0,exception,prompt,completion,model_id,time_taken_in_seconds,completion_token_count,prompt_token_count,cost,completion_word_count,experiment,transcript_id,wc_gt_than_desired
0,,You are an AI bot that is good at determining ...,<output>\nHere are the action items I gathered...,amazon.titan-text-express-v1,5.222832,167,793,0.000902,122,single-line-reason,call_center_transcript_2_amazon.titan-text-exp...,True
1,,You are an AI bot that is good at determining ...,<output>\nHere are the action items I gathered...,amazon.titan-text-express-v1,5.17584,167,793,0.000902,122,single-line-reason,call_center_transcript_2_amazon.titan-text-exp...,True
2,,You are an AI bot that is good at determining ...,<output>\nA: \n- Research mobile gaming trends...,cohere.command-text-v14,7.171072,220,788,0.001622,165,single-line-reason,call_center_transcript_2_cohere.command-text-v14,True
3,,Human: Read the conversation between the call ...,<output>\nA: \n- Research recent trends and gr...,anthropic.claude-instant-v1,1.307436,110,775,0.001869,72,single-line-reason,call_center_transcript_2_anthropic.claude-inst...,False
4,,Human: Read the conversation between the call ...,<output>\nA: \n- Research recent trends and gr...,anthropic.claude-instant-v1,1.199874,99,775,0.001809,63,single-line-reason,call_center_transcript_2_anthropic.claude-inst...,False


In [21]:
# Mean within-response similarity per (model, experiment), with the output
# columns named directly through named aggregation.
df_summary_mean = (
    df_per_model_completion_similarity
    .groupby(['model_id', 'experiment'])
    .agg(
        cosine_similarity_mean=('cosine_similarity_within_responses_mean', 'mean'),
        rouge_l_f1_score_mean=('rouge_l_f1_score_within_responses_mean', 'mean'),
    )
    .reset_index()
)

df_summary_mean



Unnamed: 0,model_id,experiment,cosine_similarity_mean,rouge_l_f1_score_mean
0,amazon.titan-text-express-v1,single-line-reason,0.963901,0.81418
1,anthropic.claude-instant-v1,single-line-reason,0.998657,0.977253
2,cohere.command-text-v14,single-line-reason,0.949246,0.480867


In [22]:
# 95th percentile of latency, token counts, word count and cost for every
# (model, experiment) pair. The value columns are selected with a list (double
# brackets) and get a "_q95" suffix before the group keys are turned back into
# ordinary columns.
df_summary_q95 = (
    df.groupby(['model_id', 'experiment'])[
        ['time_taken_in_seconds', 'completion_token_count', 'prompt_token_count', 'completion_word_count', 'cost']
    ]
    .quantile(0.95)
    .add_suffix('_q95')
    .reset_index()
)

df_summary_q95



Unnamed: 0,model_id,experiment,time_taken_in_seconds_q95,completion_token_count_q95,prompt_token_count_q95,completion_word_count_q95,cost_q95
0,amazon.titan-text-express-v1,single-line-reason,7.132016,213.9,976.0,149.2,0.001123
1,anthropic.claude-instant-v1,single-line-reason,2.573874,122.0,950.0,89.0,0.002221
2,cohere.command-text-v14,single-line-reason,8.835256,238.6,963.0,179.1,0.001804


In [23]:
df_pct_gt_wc


Unnamed: 0,model_id,experiment,count_of_completions_gt_wc
0,cohere.command-text-v14,single-line-reason,13
1,amazon.titan-text-express-v1,single-line-reason,8
2,anthropic.claude-instant-v1,single-line-reason,3


In [24]:
import numpy as np

# Sanity check: merge the summary frames one at a time and print the shape after
# each step, to confirm that no rows are lost and one column set is added per merge.
df_summary_all = df_summary_q95
for df_next in (
    df_summary_mean,
    df_counts,
    df_pct_gt_wc,
    df_per_model_cosine_similarity,
    df_per_model_rouge_l_f1_score,
):
    df_summary_all = pd.merge(df_summary_all, df_next)
    print(df_summary_all.shape)

df_summary_all.shape



(3, 9)
(3, 10)
(3, 11)
(3, 12)
(3, 13)


(3, 13)

In [25]:
import numpy as np

# Build the final per-(model, experiment) summary table.
# The join keys are passed explicitly: without `on=`, pd.merge joins on every
# column name the two frames happen to share, so an accidentally shared metric
# column would silently become part of the key.
summary_keys = ['model_id', 'experiment']
df_summary_all = df_summary_q95
for df_part in (
    df_summary_mean,
    df_counts,
    df_pct_gt_wc,
    df_per_model_cosine_similarity,
    df_per_model_rouge_l_f1_score,
):
    df_summary_all = pd.merge(df_summary_all, df_part, on=summary_keys)

# Report latency, token and word-count quantiles as whole numbers; cost keeps 4 decimals.
for q95_column in [
    'time_taken_in_seconds_q95',
    'prompt_token_count_q95',
    'completion_token_count_q95',
    'completion_word_count_q95',
]:
    df_summary_all[q95_column] = np.round(df_summary_all[q95_column]).astype(int)
df_summary_all['cost_q95'] = np.round(df_summary_all['cost_q95'], 4)

# Share of completions longer than the desired word count, as a percentage.
df_summary_all['percent_of_completions_gt_wc'] = np.round(
    (df_summary_all['count_of_completions_gt_wc'] / df_summary_all['count']) * 100, 2
)
# The raw count is only needed for the percentage above; drop it without
# mutating the frame in place so re-running the cell stays predictable.
df_summary_all = df_summary_all.drop(columns=['count_of_completions_gt_wc'])

df_summary_all


Unnamed: 0,model_id,experiment,time_taken_in_seconds_q95,completion_token_count_q95,prompt_token_count_q95,completion_word_count_q95,cost_q95,cosine_similarity_mean,rouge_l_f1_score_mean,count,cosine_similarity_within_responses_mean,rouge_l_f1_score_within_responses_mean,percent_of_completions_gt_wc
0,amazon.titan-text-express-v1,single-line-reason,7,214,976,149,0.0011,0.963901,0.81418,15,0.963901,0.81418,53.33
1,anthropic.claude-instant-v1,single-line-reason,3,122,950,89,0.0022,0.998657,0.977253,15,0.998657,0.977253,20.0
2,cohere.command-text-v14,single-line-reason,9,239,963,179,0.0018,0.949246,0.480867,15,0.949246,0.480867,86.67


In [26]:
# Persist the per-model summary table inside the configured metrics directory.
fpath = os.path.join(config['dir']['metrics'], "model_metrics.csv")
df_summary_all.to_csv(path_or_buf=fpath, index=False)


### Model completions compared with the ground truth for each transcript

In [27]:
import os
import pandas as pd
import json

def read_ground_truths(transcripts_dir):
    """Load the ``transcript name -> ground truth`` mapping from ``labels.txt``.

    Each usable line has the form ``<transcript name>|<ground truth>``. Only the
    first ``|`` is treated as the separator, so a ground truth that itself
    contains ``|`` is kept intact instead of being silently skipped. Lines
    without any ``|`` are ignored. If a transcript name appears more than once,
    the last occurrence wins.

    Args:
        transcripts_dir: Directory expected to contain ``labels.txt``.

    Returns:
        dict mapping the stripped transcript name to the stripped ground truth.
        Empty when ``labels.txt`` does not exist.
    """
    labels_file = os.path.join(transcripts_dir, "labels.txt")
    if not os.path.exists(labels_file):
        return {}

    ground_truths = {}
    with open(labels_file, 'r') as file:
        for line in file:
            transcript_name, separator, ground_truth = line.partition('|')
            if not separator:
                # No "|" on this line (blank or malformed): nothing to record.
                continue
            ground_truths[transcript_name.strip()] = ground_truth.strip()
    return ground_truths

def generate_csv(metrics_dir, completions_dir, transcripts_dir, output_csv, experiment="single-line-reason"):
    """Write a CSV that puts each model's first-repetition completion next to the ground truth.

    For every ``*_rep1.json`` metrics file found in
    ``<metrics_dir>/<experiment>/<subfolder>/`` the function collects the
    transcript text, the ground truth from ``labels.txt`` (via
    ``read_ground_truths``), the ``model_id`` stored in the metrics JSON and the
    completion text from the same-named ``.txt`` file under
    ``<completions_dir>/<experiment>/<subfolder>/``. The rows are then pivoted
    so that every model id becomes its own column.

    Args:
        metrics_dir: Root directory of the metrics files.
        completions_dir: Root directory of the completion text files.
        transcripts_dir: Directory holding ``<base name>.txt`` transcripts and ``labels.txt``.
        output_csv: Path of the CSV file to write.
        experiment: Name of the experiment sub-directory to read. Defaults to
            ``"single-line-reason"``, the value that was previously hard-coded.

    Raises:
        FileNotFoundError: If ``<metrics_dir>/<experiment>`` does not exist
            (raised by ``os.listdir``).
    """
    ground_truths = read_ground_truths(transcripts_dir)
    metrics_experiment_dir = os.path.join(metrics_dir, experiment)
    completions_experiment_dir = os.path.join(completions_dir, experiment)

    data = []
    # Directory listings are sorted so that the traversal order, and therefore
    # the row picked by aggfunc='first' below, is the same on every run.
    for folder in sorted(os.listdir(metrics_experiment_dir)):
        metrics_subfolder = os.path.join(metrics_experiment_dir, folder)
        completions_subfolder = os.path.join(completions_experiment_dir, folder)
        if not (os.path.isdir(metrics_subfolder) and os.path.isdir(completions_subfolder)):
            continue

        for file in sorted(os.listdir(metrics_subfolder)):
            if not file.endswith("_rep1.json"):
                continue

            # Drop the last two "_"-separated segments of the file name
            # (presumably the model name and the "rep1.json" suffix) to get the
            # transcript base name.
            base_name = '_'.join(file.split('_')[0:-2])
            completion_file = file.replace('.json', '.txt')

            # A missing transcript is reported in the output rather than dropping the row.
            try:
                with open(os.path.join(transcripts_dir, base_name + ".txt"), 'r') as f:
                    transcript = f.read().strip()
            except FileNotFoundError:
                transcript = "Transcript not found"

            ground_truth = ground_truths.get(base_name, "Ground truth not found")

            # Deliberate best effort: a row without a readable metrics file or
            # completion file is skipped.
            try:
                with open(os.path.join(metrics_subfolder, file), 'r') as json_file:
                    model_id = json.load(json_file).get("model_id", "")
                with open(os.path.join(completions_subfolder, completion_file), 'r') as f:
                    completion = f.read().strip()
            except FileNotFoundError:
                continue

            data.append([base_name, transcript, model_id, completion, ground_truth])

    df = pd.DataFrame(data, columns=['Transcript Name', 'Transcript', 'Model ID', 'Completion', 'Ground Truth'])

    # One row per transcript, one column per model id holding that model's completion.
    df_pivot = df.pivot_table(
        index=['Transcript Name', 'Transcript', 'Ground Truth'],
        columns='Model ID',
        values='Completion',
        aggfunc='first',
    )
    # Turn the index levels back into regular columns and blank out missing completions.
    df_pivot = df_pivot.reset_index().fillna('')

    df_pivot.to_csv(output_csv, index=False)

# Input locations: metrics JSON files, completion text files and raw transcripts.
metrics_dir = "data/metrics"
completions_dir = "data/completions"
transcripts_dir = "data/raw"

# Destination of the ground-truth comparison table.
output_csv = "model_ground_truth_comparison.csv"

# Build the table and write it to disk.
generate_csv(
    metrics_dir,
    completions_dir,
    transcripts_dir,
    output_csv,
)


## Golden summaries and model completions for transcripts

In [28]:
import os
import pandas as pd
import json

def read_golden_summary(transcripts_dir, base_name):
    """Return the golden summary text for ``base_name``.

    Looks in each immediate sub-directory of ``transcripts_dir`` for a file
    named ``<base_name>_golden_summary.txt`` and returns the stripped contents
    of the first one found. Files directly inside ``transcripts_dir`` are not
    considered.

    Args:
        transcripts_dir: Directory whose sub-directories are searched.
        base_name: Transcript base name used to build the expected file name.

    Returns:
        The summary text, or the string ``"Golden summary not found"`` when no
        sub-directory contains the file.
    """
    wanted_name = f"{base_name}_golden_summary.txt"
    for entry in os.listdir(transcripts_dir):
        candidate_dir = os.path.join(transcripts_dir, entry)
        if not os.path.isdir(candidate_dir):
            continue
        if wanted_name not in os.listdir(candidate_dir):
            continue
        with open(os.path.join(candidate_dir, wanted_name), 'r') as handle:
            return handle.read().strip()
    return "Golden summary not found"

def generate_csv(metrics_dir, completions_dir, transcripts_dir, output_csv, experiment="single-line-reason"):
    """Write a CSV that puts each model's first-repetition completion next to the golden summary.

    NOTE: this definition replaces the ``generate_csv`` defined in the earlier
    ground-truth cell; after this cell runs, only this version is callable.

    For every ``*_rep1.json`` metrics file found in
    ``<metrics_dir>/<experiment>/<subfolder>/`` the function collects the
    transcript text, the golden summary (via ``read_golden_summary``), the
    ``model_id`` stored in the metrics JSON and the completion text from the
    same-named ``.txt`` file under ``<completions_dir>/<experiment>/<subfolder>/``.
    The rows are then pivoted so that every model id becomes its own column.

    Args:
        metrics_dir: Root directory of the metrics files.
        completions_dir: Root directory of the completion text files.
        transcripts_dir: Directory holding ``<base name>.txt`` transcripts; its
            sub-directories are searched for golden summaries.
        output_csv: Path of the CSV file to write.
        experiment: Name of the experiment sub-directory to read. Defaults to
            ``"single-line-reason"``, the value that was previously hard-coded.

    Raises:
        FileNotFoundError: If ``<metrics_dir>/<experiment>`` does not exist
            (raised by ``os.listdir``).
    """
    metrics_experiment_dir = os.path.join(metrics_dir, experiment)
    completions_experiment_dir = os.path.join(completions_dir, experiment)

    data = []
    # Directory listings are sorted so that the traversal order, and therefore
    # the row picked by aggfunc='first' below, is the same on every run.
    for folder in sorted(os.listdir(metrics_experiment_dir)):
        metrics_subfolder = os.path.join(metrics_experiment_dir, folder)
        completions_subfolder = os.path.join(completions_experiment_dir, folder)
        if not (os.path.isdir(metrics_subfolder) and os.path.isdir(completions_subfolder)):
            continue

        for file in sorted(os.listdir(metrics_subfolder)):
            if not file.endswith("_rep1.json"):
                continue

            # Drop the last two "_"-separated segments of the file name
            # (presumably the model name and the "rep1.json" suffix) to get the
            # transcript base name.
            base_name = '_'.join(file.split('_')[0:-2])
            completion_file = file.replace('.json', '.txt')

            # A missing transcript is reported in the output rather than dropping the row.
            try:
                with open(os.path.join(transcripts_dir, base_name + ".txt"), 'r') as f:
                    transcript = f.read().strip()
            except FileNotFoundError:
                transcript = "Transcript not found"

            golden_summary = read_golden_summary(transcripts_dir, base_name)

            # Deliberate best effort: a row without a readable metrics file or
            # completion file is skipped.
            try:
                with open(os.path.join(metrics_subfolder, file), 'r') as json_file:
                    model_id = json.load(json_file).get("model_id", "")
                with open(os.path.join(completions_subfolder, completion_file), 'r') as f:
                    completion = f.read().strip()
            except FileNotFoundError:
                continue

            data.append([base_name, transcript, model_id, completion, golden_summary])

    df = pd.DataFrame(data, columns=['Transcript Name', 'Transcript', 'Model ID', 'Completion', 'Golden Summary'])

    # One row per transcript, one column per model id holding that model's completion.
    df_pivot = df.pivot_table(
        index=['Transcript Name', 'Transcript', 'Golden Summary'],
        columns='Model ID',
        values='Completion',
        aggfunc='first',
    )
    # Turn the index levels back into regular columns and blank out missing completions.
    df_pivot = df_pivot.reset_index().fillna('')

    df_pivot.to_csv(output_csv, index=False)

# Input locations: metrics JSON files, completion text files and raw transcripts.
metrics_dir = "data/metrics"
completions_dir = "data/completions"
transcripts_dir = "data/raw"

# Destination of the golden-summary comparison table.
output_csv = "golden_summary_model_comparison.csv"

# Build the table and write it to disk.
generate_csv(
    metrics_dir,
    completions_dir,
    transcripts_dir,
    output_csv,
)
