In [11]:
!pip install pinecone tiktoken



In [12]:
import os
import re
import time
import json
import openai
from google.colab import drive
from pinecone import Pinecone, ServerlessSpec

################################################################################
# 1) Mount Google Drive
################################################################################
# Makes Drive contents available under /content/drive; main() reads its input
# folder from there.
drive.mount('/content/drive')

################################################################################
# 2) Set API Keys
################################################################################
# The values below are placeholders and must be replaced before running.
# NOTE(review): consider loading the keys from environment variables or Colab
# secrets instead of hard-coding them in the notebook.
PINECONE_API_KEY = "YOUR-PINECONE-API-KEY"
PINECONE_REGION  = "us-east-1"
OPENAI_API_KEY   = "YOUR-OPENAI-API-KEY"

# NOTE(review): placeholder; Pinecone index names are presumably restricted to
# lowercase alphanumerics and '-' -- confirm before substituting a real name.
INDEX_NAME = "PINECONE-INDEX-NAME"        # your Pinecone index name
EMBED_MODEL = "text-embedding-3-large"  # using '3-large' with ~8k context

# Module-level key; create_embeddings() calls openai.embeddings.create directly.
openai.api_key = OPENAI_API_KEY

################################################################################
# 3) Initialize Pinecone (New Approach)
################################################################################
# NOTE(review): the region is passed both as `environment=` here and as
# `region=` in ServerlessSpec below -- confirm the installed pinecone client
# still accepts/uses the `environment` keyword.
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_REGION)
# Create the index only if it is not already listed for this account.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=3072,            # same dimension as 'text-embedding-3-large'
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=PINECONE_REGION)
    )
# Handle used by main() for all upserts.
index = pc.Index(INDEX_NAME)

################################################################################
# 4) Utility Functions
################################################################################

def sanitize_vector_id(text: str) -> str:
    """
    Build a safe vector ID by dropping every character outside the ASCII range.

    Characters with code points 0-127 are kept in their original order;
    everything else is removed (not replaced).
    """
    ascii_only = (ch for ch in text if ord(ch) <= 0x7F)
    return "".join(ascii_only)

def chunk_text_by_tokens(text: str, chunk_size: int = 6000, model_name: str = EMBED_MODEL) -> list:
    """
    Break 'text' into consecutive pieces of at most 'chunk_size' tokens.

    Tokenization is done with tiktoken using the encoding registered for
    'model_name'. Each piece is decoded back to a string, and the pieces are
    returned in document order. Empty text yields an empty list.
    """
    import tiktoken

    try:
        encoder = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # tiktoken has no mapping for this model name: fall back to cl100k_base
        encoder = tiktoken.get_encoding("cl100k_base")

    token_ids = encoder.encode(text)
    starts = range(0, len(token_ids), chunk_size)
    return [encoder.decode(token_ids[start:start + chunk_size]) for start in starts]

def call_with_rate_limit_handling(func, *args, max_retries=5, **kwargs):
    """
    Call ``func(*args, **kwargs)``, retrying on failure with rate-limit-aware waits.

    The function is attempted once and then retried up to ``max_retries`` more
    times, regardless of the kind of error. An error whose message contains
    "429" or "rate limit" (case-insensitive) is treated as a rate limit and
    waits with exponential backoff; any other error waits a flat 2 seconds.
    No wait is performed after the final failed attempt.

    Args:
        func: The callable to invoke. It does not need a ``__name__``
            attribute (e.g. ``functools.partial`` objects are fine).
        *args, **kwargs: Arguments forwarded to ``func``.
        max_retries: Maximum number of retries after the first attempt.
            Negative values are treated as 0.

    Returns:
        The result of the first successful call, or None if every attempt
        raised an ``Exception``.
    """
    base_wait = 2  # Seconds; doubled on every consecutive rate-limit failure
    # Not every callable has __name__; never let logging raise inside the handler.
    func_name = getattr(func, "__name__", repr(func))
    total_attempts = max(max_retries, 0) + 1

    for attempt in range(1, total_attempts + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            error_str = str(e)
            is_rate_limit = "429" in error_str or "rate limit" in error_str.lower()

            if not is_rate_limit:
                print(f"Error calling {func_name}: {e}")

            # Out of attempts: give up immediately instead of sleeping first.
            if attempt >= total_attempts:
                print("Max retries reached.")
                return None

            if is_rate_limit:
                # Exponential backoff plus a small deterministic linear offset
                # (4.1s, 8.2s, 16.3s, ...).
                wait_time = base_wait * (2 ** attempt) + (attempt * 0.1)
                print(f"Rate limited. Waiting {wait_time:.1f} seconds before retry {attempt}/{max_retries}...")
                time.sleep(wait_time)
            else:
                print("Retrying in 2 seconds...")
                time.sleep(2)

    return None

def generate_title_from_json(json_text: str) -> str:
    """
    Ask a chat model for a short snake_case title describing the workflow JSON.

    The prompt requests a lowercase, underscore-separated title such as
    "agent_google_sheet_slack". The request goes through
    call_with_rate_limit_handling, so the result is the stripped model output,
    or None when the call ultimately fails.
    """
    prompt = f"""
You are given a JSON representation of an automation workflow.
Generate a concise, descriptive, and uniform title that captures the automation's main functionality.
The title should be in lowercase and use underscores to separate words.
For example, if the automation reads from Google Sheets, calls an LLM, and sends a message to Slack,
you might return: agent_google_sheet_slack.
Only output the title.

JSON Content:
{json_text}
    """

    def _generate_title():
        # Imported lazily so an import failure is handled by the retry wrapper.
        from openai import OpenAI
        chat_messages = [{"role": "user", "content": prompt}]
        completion = OpenAI(api_key=OPENAI_API_KEY).chat.completions.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()

    try:
        title = call_with_rate_limit_handling(_generate_title)
    except Exception as e:
        print(f"Error generating title: {e}")
        title = None
    return title

def generate_tldr_from_json(json_text: str) -> str:
    """
    Ask a chat model for a one-sentence, plain-language TLDR of the workflow JSON.

    The request goes through call_with_rate_limit_handling, so the result is
    the stripped model output, or None when the call ultimately fails.
    """
    prompt = f"""
You are given a JSON representation of an automation workflow.
Generate a concise one-sentence summary (TLDR) of what this automation does.
It should capture the core functionality in plain language.
Only output the summary.

JSON Content:
{json_text}
    """

    def _generate_tldr():
        # Imported lazily so an import failure is handled by the retry wrapper.
        from openai import OpenAI
        chat_messages = [{"role": "user", "content": prompt}]
        completion = OpenAI(api_key=OPENAI_API_KEY).chat.completions.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()

    try:
        summary = call_with_rate_limit_handling(_generate_tldr)
    except Exception as e:
        print(f"Error generating TLDR: {e}")
        summary = None
    return summary

def create_embeddings(text):
    """
    Embed 'text' with EMBED_MODEL, retrying through call_with_rate_limit_handling.

    Returns the embedding of the single input, or None if the call
    ultimately fails.
    """
    def _embed():
        result = openai.embeddings.create(model=EMBED_MODEL, input=[text])
        first_item = result.data[0]
        return first_item.embedding

    embedding = call_with_rate_limit_handling(_embed)
    return embedding

################################################################################
# 5) Process Files with Checkpointing and Batch Processing
################################################################################

def _load_checkpoint(checkpoint_file):
    """Return the file names recorded as processed in the checkpoint ([] if absent or unreadable)."""
    if not os.path.exists(checkpoint_file):
        return []
    try:
        with open(checkpoint_file, 'r') as f:
            checkpoint_data = json.load(f)
        processed_files = list(checkpoint_data.get('processed_files', []))
        current_batch = checkpoint_data.get('current_batch', 0)
        print(f"Resuming from checkpoint: {len(processed_files)} files processed, batch {current_batch}")
        return processed_files
    except Exception as e:
        # A corrupt checkpoint must not block the run; start from scratch instead.
        print(f"Error loading checkpoint: {e}")
        return []


def _save_checkpoint(checkpoint_file, processed_files, current_batch):
    """Write the processed-file list (and an informational batch counter) to the checkpoint."""
    with open(checkpoint_file, 'w') as f:
        checkpoint_data = {
            'processed_files': processed_files,
            'current_batch': current_batch
        }
        json.dump(checkpoint_data, f)


def _process_file(folder_path, file_name):
    """Title, summarize, chunk, embed and upsert one file; return True only if every chunk was stored."""
    full_path = os.path.join(folder_path, file_name)
    with open(full_path, "r", encoding="utf-8") as f:
        file_text = f.read()

    # Generate a descriptive title; it doubles as the base of the vector ID.
    # NOTE(review): two files that receive the same generated title will share
    # vector IDs and overwrite each other in the index.
    generated_title = generate_title_from_json(file_text)
    if generated_title:
        base_vector_id = sanitize_vector_id(generated_title)
        print(f"Generated title: {generated_title} (vector base ID: {base_vector_id})")
    else:
        base_vector_id = sanitize_vector_id(file_name)
        generated_title = file_name
        print(f"Using fallback title: {generated_title} (vector base ID: {base_vector_id})")

    # Generate a TLDR (agent summary)
    generated_tldr = generate_tldr_from_json(file_text)
    if generated_tldr:
        print(f"Generated TLDR: {generated_tldr}")
    else:
        generated_tldr = ""
        print(f"Using empty TLDR.")

    # Token-based chunking
    chunks = chunk_text_by_tokens(file_text, chunk_size=6000, model_name=EMBED_MODEL)
    print(f"Processing {len(chunks)} chunk(s).")

    all_chunks_stored = True
    for idx, chunk in enumerate(chunks):
        # Single-chunk files use the bare ID; multi-chunk files get an index suffix.
        vector_id = base_vector_id if len(chunks) == 1 else f"{base_vector_id}_{idx}"

        embedding = create_embeddings(chunk)
        if embedding is None:
            print(f"Failed to create embedding for chunk {idx}. Skipping.")
            all_chunks_stored = False
            continue

        metadata = {
            "generated_title": generated_title,
            "agent_summary": generated_tldr,
            "chunk_index": idx,
            "json_file": chunk
        }

        # Wrapped in a closure so the retry helper can re-invoke it; it is
        # called immediately, so the loop variables it captures are current.
        def upsert_to_pinecone():
            index.upsert(vectors=[(vector_id, embedding, metadata)])
            return True

        if call_with_rate_limit_handling(upsert_to_pinecone):
            print(f"Upserted chunk {idx} as vector ID '{vector_id}'.")
        else:
            print(f"Failed to upsert chunk {idx}.")
            all_chunks_stored = False

    return all_chunks_stored


def main():
    """
    Embed every .txt/.json workflow file in the Drive folder into the Pinecone index.

    Files are handled in small batches with pauses between them. After each
    fully stored file the checkpoint file in the same folder is rewritten, so
    an interrupted run can be resumed. The list of processed file names is the
    only thing used to decide what is left to do; the 'current_batch' value in
    the checkpoint is informational. A file whose chunks could not all be
    embedded and upserted is NOT recorded as processed, so it is retried on
    the next run.
    """
    folder_path = "/content/drive/MyDrive/n8n Workflows"  # Adjust if needed
    checkpoint_file = os.path.join(folder_path, "processing_checkpoint.json")
    batch_size = 5  # Process files in small batches to avoid rate limits

    processed_files = _load_checkpoint(checkpoint_file)
    already_done = set(processed_files)

    # Sorted so that batch contents are the same from run to run
    # (os.listdir returns entries in arbitrary order).
    all_files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith((".txt", ".json"))
        and f != "processing_checkpoint.json"
    )

    # Skip already processed files
    remaining_files = [f for f in all_files if f not in already_done]
    print(f"Total files: {len(all_files)}, Remaining: {len(remaining_files)}")

    failed_files = []

    try:
        batches = [remaining_files[i:i + batch_size] for i in range(0, len(remaining_files), batch_size)]

        if not batches:  # Handle case when all files are processed
            print("All files have already been processed!")
            return

        # Always start at the first batch: `remaining_files` already excludes
        # everything in the checkpoint, so skipping batches here would skip
        # files that have never been processed.
        for batch_idx, batch in enumerate(batches):
            print(f"\n--- Processing Batch {batch_idx + 1}/{len(batches)} ---")

            for file_name in batch:
                print(f"\nProcessing file: {file_name}")

                try:
                    if _process_file(folder_path, file_name):
                        processed_files.append(file_name)
                        # Update checkpoint after each file
                        _save_checkpoint(checkpoint_file, processed_files, batch_idx)
                    else:
                        failed_files.append(file_name)
                        print(f"File '{file_name}' was not fully stored; it will be retried on the next run.")

                    # Small pause between files to avoid rate limits
                    time.sleep(1)

                except Exception as e:
                    print(f"Error processing file '{file_name}': {e}")
                    failed_files.append(file_name)
                    continue

            # Update batch in checkpoint
            _save_checkpoint(checkpoint_file, processed_files, batch_idx + 1)

            # Pause between batches
            if batch_idx < len(batches) - 1:
                print(f"Batch {batch_idx + 1} complete. Pausing for 10 seconds before next batch...")
                time.sleep(10)

        if failed_files:
            print(f"\nFinished with {len(failed_files)} file(s) not fully processed; re-run to retry them.")
        else:
            print("\nAll files processed successfully!")

    except KeyboardInterrupt:
        print("\nProcess interrupted by user.")
        print(f"Progress saved. You can resume from where you left off.")
    except Exception as e:
        print(f"\nUnexpected error: {e}")
        print(f"Progress saved. You can resume from where you left off.")

# Run the ingestion pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total files: 293, Remaining: 293

--- Processing Batch 1/59 ---

Processing file: Create dynamic Twitter profile banner.txt
Generated title: fetch_twitter_followers_profile_image_processing (vector base ID: fetch_twitter_followers_profile_image_processing)
Generated TLDR: This automation workflow fetches new followers from Twitter, resizes and crops their profile images, merges them into a composite image, and updates the user's profile banner.
Processing 1 chunk(s).
Upserted chunk 0 as vector ID 'fetch_twitter_followers_profile_image_processing'.

Processing file: Send specific PDF attachments from Gmail to Google Drive using OpenAI.txt
Generated title: pdf_gmail_drive_openai_workflow (vector base ID: pdf_gmail_drive_openai_workflow)
Generated TLDR: This automation workflow reads PDF content from Gmail attachments, uses OpenAI to match specified criteria, an