In [3]:
import configparser
import importlib
import pandas as pd
from termcolor import colored
import cohere
import time
from tqdm import tqdm
import scripts.run_on_dir as run_on_dir
# Reload the helper module so that edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(run_on_dir)

# Read the Cohere API key from the [cohere] section of ./config.ini.
# NOTE: ConfigParser.read() silently ignores a missing file, so an absent
# config.ini surfaces below as KeyError: 'cohere' rather than a file error.
config = configparser.ConfigParser()
config.read('./config.ini')
cohere_api_key = config['cohere']['api_key']

In [18]:
import os

def embed_transcript_file_func(file_path, ignore_if_exists=True):
    """Embed every ``text`` entry of a transcript CSV with Cohere and save the result.

    The input CSV must provide ``start``, ``end`` and ``text`` columns. The
    result is written next to the input as ``<name>_embedded.csv`` with the
    columns ``start``, ``end``, ``text`` and ``embedding``.

    Args:
        file_path: Path of the transcript file. Paths that do not end in
            ``.csv`` are ignored.
        ignore_if_exists: When true (the default), the file is skipped if it
            is itself an ``*_embedded.csv`` output, or if its
            ``*_embedded.csv`` output is already on disk. When false, the
            file is always (re-)embedded and any existing output is
            overwritten.

    Returns:
        ``[path, status_message]`` for ``.csv`` inputs, where ``path`` is the
        embedded CSV (existing or freshly written) and ``status_message`` is
        a ``termcolor``-colored string; ``None`` for non-CSV paths.

    Raises:
        KeyError: If the CSV lacks a ``start``, ``end`` or ``text`` column.
    """
    if not file_path.endswith('.csv'):
        # Not a CSV: nothing to do. The implicit None of the original
        # implementation is preserved for callers.
        return None

    if file_path.endswith('_embedded.csv') and ignore_if_exists:
        # The file is an output of a previous run, not a source transcript.
        print("File already embedded, skipping.")
        return [file_path, colored(f"Embedding skipped: {file_path}", 'yellow')]

    output_file_path = file_path[:-4] + '_embedded.csv'

    if ignore_if_exists and os.path.exists(output_file_path):
        # The output is already on disk: do not spend API calls re-embedding
        # a transcript that was processed by an earlier run.
        print("File already embedded, skipping.")
        return [output_file_path, colored(f"Embedding skipped: {output_file_path}", 'yellow')]

    df = pd.read_csv(file_path, quotechar='"', escapechar='\\')
    co = cohere.Client(cohere_api_key)

    # Client-side throttle: after `rate_limit_calls` requests, wait out the
    # remainder of a `rate_limit_duration`-second window. The counters are
    # local, so the window restarts for every file.
    rate_limit_calls = 100
    rate_limit_duration = 60
    # Number of texts sent per embed request.
    # NOTE(review): presumably the API's per-request text limit - confirm.
    batch_size = 96

    output_columns = ['start', 'end', 'text', 'embedding']
    records = [
        {'start': start, 'end': end, 'text': text, 'embedding': None}
        for start, end, text in zip(df['start'], df['end'], df['text'])
    ]
    batches = [records[i:i + batch_size] for i in range(0, len(records), batch_size)]

    window_start = time.time()
    calls_in_window = 0

    for batch in tqdm(batches, total=len(batches), desc=f"Embedding {file_path}"):
        response = co.embed(
            texts=[record['text'] for record in batch],
            model='large',
            truncate='END',
        )

        # `batch` holds references to the dicts in `records`, so filling in
        # the embedding here updates `records` as well.
        for record, embedding in zip(batch, response.embeddings):
            record['embedding'] = embedding

        calls_in_window += 1
        if calls_in_window >= rate_limit_calls:
            elapsed = time.time() - window_start
            if elapsed < rate_limit_duration:
                time.sleep(rate_limit_duration - elapsed)
            window_start = time.time()
            calls_in_window = 0

    # Pinning the columns keeps the header row even for an empty transcript.
    pd.DataFrame(records, columns=output_columns).to_csv(output_file_path, index=False)

    return [output_file_path, colored(f"Embedding complete: {output_file_path}", 'green')]

In [None]:
# Root folder holding the transcript CSVs; it is walked recursively and every
# .csv file found is handed to embed_transcript_file_func.
data_folder = "./TRANSCRIBED_DONT_DELETE/"

run_on_dir.run_on_dir(
    data_folder,
    file_func=embed_transcript_file_func,
    recursive=True,
    file_types=['.csv'],
    print_output=True,
)