# B"H

## Env Setup

In [1]:
import sys
import os
from pathlib import Path

In [2]:
print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version.split()[0]}")
print(f"Virtual env prefix: {sys.prefix}")

Python executable: /home/izzy/repos/repos-p5/pointfive/ai-tag-intelligence/.venv/bin/python
Python version: 3.12.10
Virtual env prefix: /home/izzy/repos/repos-p5/pointfive/ai-tag-intelligence/.venv


In [3]:
assert sys.executable == '/home/izzy/repos/repos-p5/pointfive/ai-tag-intelligence/.venv/bin/python' 


In [4]:

src_path = Path(os.environ.get('AI_AGENT_DIR')) / "src"
assert src_path.is_dir()

if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

print(str(src_path))

/home/izzy/repos/repos-p5/pointfive/ai-tag-intelligence/src


In [5]:
sys.path

['/home/izzy/repos/repos-p5/pointfive/ai-tag-intelligence/src',
 '/home/izzy/.local/share/mise/installs/python/3.12.10/lib/python312.zip',
 '/home/izzy/.local/share/mise/installs/python/3.12.10/lib/python3.12',
 '/home/izzy/.local/share/mise/installs/python/3.12.10/lib/python3.12/lib-dynload',
 '',
 '/home/izzy/repos/repos-p5/pointfive/ai-tag-intelligence/.venv/lib/python3.12/site-packages',
 '/home/izzy/repos/repos-p5/pointfive/ai-tag-intelligence']

## Constants

In [6]:
from pathlib import Path

LARGE_FILES_DIR = Path('/home/izzy/repos/x-large-files/x-large-files-p5-tag-intelligence')
assert LARGE_FILES_DIR.is_dir()

# -- -----------------------------
ITERATION_NAME = 'deep-research-02'

ITERATION_DIR = LARGE_FILES_DIR / ITERATION_NAME
if not ITERATION_DIR.exists():
    ITERATION_DIR.mkdir()

# -- -----------------------------
COMPONENTS_AGG_INFO_FILE = ITERATION_DIR / 'components-agg-info.json'
assert COMPONENTS_AGG_INFO_FILE.exists()


CONSOLIDATOR_BATCH_ID_FILE = ITERATION_DIR / 'consolidator-batch-id.txt'

# -- -----------------------------
GCS_BUCKET = os.getenv("GCS_BUCKET")
print(GCS_BUCKET)

ai-tag-intelligence


## Data Peek

In [7]:
import pandas as pd


In [8]:
df = pd.read_json(COMPONENTS_AGG_INFO_FILE)

df.head()

Unnamed: 0,category,sample_values
0,Environment,"fdprod (FD Production), xprep (preparation/sta..."
1,Project/System,"sce (SCE product/project), cchh9cfqg3ymhauu8qq..."
2,Service/Domain,"portal-extns (Portal Extensions), warrsup, ifs..."
3,Resource Type,"engine, cluster (ECSCluster), doc (document), ..."
4,Application,"lawson, auth, pges, openedge (OpenEdge applica..."


## GenAI Client

In [9]:
import os
import json
from google import genai
from google.genai.types import HttpOptions

# Initialize Google GenAI client for Vertex AI
client = genai.Client(
    http_options=HttpOptions(api_version="v1"),
    vertexai=True,
    project=os.getenv("ANTHROPIC_VERTEX_PROJECT_ID"),
    location=os.getenv("CLOUD_ML_REGION")
)

## PROMPT_TEMPLATE_TO_CACHE

In [10]:
PROMPT_TEMPLATE_TO_CACHE = '''
You will analyze AWS resource categories to identify potential consolidation opportunities. Your task is to determine whether a specific category should be consolidated with another similar category from a comprehensive list.

Here is the complete list of all AWS resource categories:

<all_categories>
{all_categories}
</all_categories>
'''

## PROMPT_TEMPLATE_PART_2

In [11]:

PROMPT_TEMPLATE_PART_2 = '''

You are an AWS resource categorization analyst tasked with identifying consolidation opportunities between similar resource categories.

Here is the specific category you need to analyze for potential consolidation:

<specific_category>
{specific_category}
</specific_category>

## Your Task

Your goal is to determine whether the specific category should be consolidated with any other category from the all_categories list. Two categories should only be consolidated if they represent essentially the same concept and have very similar sample values.

## Analysis Process

Before making your final recommendation, you must conduct a systematic analysis in <analysis> tags inside your thinking block. It's OK for this section to be quite long. Follow this structured approach:

1. **Record Key Information**: Write out the specific category name and quote its sample values verbatim to keep them clearly in focus.

2. **Systematic Comparison**: Review each category in the all_categories list (excluding exact matches of your specific_category, since a category cannot be consolidated with itself). Create a list of potential consolidation candidates and note why each might be similar.

3. **Detailed Similarity Assessment**: For your top 3-5 most promising candidates, conduct detailed evaluation:
   - **PRIMARY FACTOR**: Compare sample values in detail - do they represent very similar things? Quote specific sample values from both categories and explain similarities/differences.
   - **SECONDARY FACTOR**: Compare category names - do they represent similar concepts? Explain the conceptual relationship.
   - Rate overall similarity for each candidate (high/medium/low).

4. **Identify Most Similar**: Explicitly identify which single category is most similar overall, even if you don't recommend consolidation.

5. **Make Consolidation Decision**: Apply strict criteria - only recommend consolidation when both sample values are very similar AND category names represent related concepts. Be conservative in your recommendations.

## Important Guidelines

- Focus primarily on sample values when making decisions, as these are more precise indicators than category names alone
- Consider semantic meaning and patterns in sample values, not just superficial similarities  
- Only suggest consolidation when there is clear, strong similarity
- The specific_category will appear in the all_categories list - ignore that exact match

## CRITICAL REQUIREMENT: Tool Call

After completing your analysis, you MUST call the process_category tool. This is not optional - every response must end with this tool call.

The tool requires these parameters:
- **category** (required): The name of the specific category being researched
- **most_similar_category** (required): The category from all_categories that is most similar (even if you don't recommend consolidation)
- **should_consolidate** (required): Boolean indicating whether consolidation is recommended
- **rationale** (required): Detailed explanation of your decision
- **new_consolidated_category_name** (optional): Only include if should_consolidate is true

Example tool call:
```
process_category(
    category="Example Category Name",
    most_similar_category="Most Similar Category Name",
    should_consolidate=false,
    rationale="Detailed explanation focusing on sample value differences and category name analysis..."
)
```

Begin your systematic analysis now, then call the process_category tool with your findings. Your final output should consist only of the tool call and should not duplicate or rehash any of the work you did in the thinking block.
'''

## Prepare Batch

In [12]:
from layer_1_util_modules.general_util.serializer import load_json, load_str, dict_to_json_str

In [13]:
prompt_to_cache = PROMPT_TEMPLATE_TO_CACHE.format(
    all_categories=load_str(COMPONENTS_AGG_INFO_FILE)
)

print(prompt_to_cache)


You will analyze AWS resource categories to identify potential consolidation opportunities. Your task is to determine whether a specific category should be consolidated with another similar category from a comprehensive list.

Here is the complete list of all AWS resource categories:

<all_categories>
[
  {
    "category":"Environment",
    "sample_values":"fdprod (FD Production), xprep (preparation\/staging environment), bexarprod, lspeucentb (Farm), prdsing (production-staging environment), develop (development branch), PP2 (pre-production 2), m3cecritfixc (M3CE Critical Fix environment), cismt3devuse1 (development, US-East-1 region), acct (account)"
  },
  {
    "category":"Project\/System",
    "sample_values":"sce (SCE product\/project), cchh9cfqg3ymhauu8qqprd (unique identifier), ssmgf14 (SSMGF product cluster 14), tam93, tam178 (cluster identifier), ezrms (EZ RMS - Revenue Management System), mingleios06 (Mingle product cluster instance), str (product), rdr (RDR product), tam03

In [14]:
categories = load_json(COMPONENTS_AGG_INFO_FILE)

In [15]:
categories[:2]

[{'category': 'Environment',
  'sample_values': 'fdprod (FD Production), xprep (preparation/staging environment), bexarprod, lspeucentb (Farm), prdsing (production-staging environment), develop (development branch), PP2 (pre-production 2), m3cecritfixc (M3CE Critical Fix environment), cismt3devuse1 (development, US-East-1 region), acct (account)'},
 {'category': 'Project/System',
  'sample_values': 'sce (SCE product/project), cchh9cfqg3ymhauu8qqprd (unique identifier), ssmgf14 (SSMGF product cluster 14), tam93, tam178 (cluster identifier), ezrms (EZ RMS - Revenue Management System), mingleios06 (Mingle product cluster instance), str (product), rdr (RDR product), tam03'}]

In [16]:
dict_to_json_str(categories[0])


'{\n  "category": "Environment",\n  "sample_values": "fdprod (FD Production), xprep (preparation/staging environment), bexarprod, lspeucentb (Farm), prdsing (production-staging environment), develop (development branch), PP2 (pre-production 2), m3cecritfixc (M3CE Critical Fix environment), cismt3devuse1 (development, US-East-1 region), acct (account)"\n}'

In [17]:
from layer_1_util_modules.general_util.id_gen import generate_id

In [18]:
# Create batch requests in Vertex AI format
batch_requests = []

for rec in categories:
    # Format the prompt with the data
    prompt_part_2 = PROMPT_TEMPLATE_PART_2.format(specific_category=dict_to_json_str(rec))
    
    # Create the batch request item in Vertex AI format
    batch_request = {
        "custom_id": generate_id(),
        "request": {
            "anthropic_version": "vertex-2023-10-16",
            "max_tokens": 6000,
            "temperature": 0.2,
            "messages": [
                {
                    "role": "user",
                    "content": prompt_to_cache,
                    "cache_control": {"type": "ephemeral"}
                },
                {
                    "role": "user",
                    "content": prompt_part_2
                },
            ],
            "tools": [
                 {
                    "type": "custom",
                    "name": "process_category",
                    "description": "Process Category",
                    "input_schema": {
                        "type": "object",
                        "properties": {
                            "category": {
                                "type": "string",
                                "description": "The specific category being researched"
                            },
                            "most_similar_category": {
                                "type": "string",
                                "description": "The category that is most similar to the specific category being researched"
                            },
                            "should_consolidate": {
                                "type": "boolean",
                                "description": "Whether we should consolidate or not"
                            },
                            "rationale": {
                                "type": "string",
                                "description": "The rationale of whether we should consolidate or not"
                            },
                            "new_consolidated_category_name": {
                                "type": "string",
                                "description": "The new category name of the consolidated categories"
                            }
                        },
                        "required": [
                            "category",
                            "most_similar_category",
                            "should_consolidate",
                            "rationale"
                        ]
                    }
                }
            ]
        }
    }
    
    batch_requests.append(batch_request)

print(f"Created {len(batch_requests)} batch requests")

Created 710 batch requests


In [19]:
batch_requests[0]

{'custom_id': '2025-11-03-18-50-01-128-18180a76',
 'request': {'anthropic_version': 'vertex-2023-10-16',
  'max_tokens': 6000,
  'temperature': 0.2,
  'messages': [{'role': 'user',
    'content': '\nYou will analyze AWS resource categories to identify potential consolidation opportunities. Your task is to determine whether a specific category should be consolidated with another similar category from a comprehensive list.\n\nHere is the complete list of all AWS resource categories:\n\n<all_categories>\n[\n  {\n    "category":"Environment",\n    "sample_values":"fdprod (FD Production), xprep (preparation\\/staging environment), bexarprod, lspeucentb (Farm), prdsing (production-staging environment), develop (development branch), PP2 (pre-production 2), m3cecritfixc (M3CE Critical Fix environment), cismt3devuse1 (development, US-East-1 region), acct (account)"\n  },\n  {\n    "category":"Project\\/System",\n    "sample_values":"sce (SCE product\\/project), cchh9cfqg3ymhauu8qqprd (unique id

## Save Batch to GCS

### Storage Client

In [20]:
from google.cloud import storage
from google.genai.types import CreateBatchJobConfig
from datetime import datetime

In [21]:
storage_client = storage.Client()
bucket = storage_client.bucket(GCS_BUCKET)

### Setup GCS paths

In [22]:

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
input_filename = f'batch_input_{timestamp}.jsonl'
input_gcs_path = f'gs://{GCS_BUCKET}/{ITERATION_NAME}/batch-inputs/{input_filename}'
output_gcs_prefix = f'gs://{GCS_BUCKET}/{ITERATION_NAME}/batch-outputs/{timestamp}/'

print(input_gcs_path)
print(output_gcs_prefix)

gs://ai-tag-intelligence/deep-research-02/batch-inputs/batch_input_20251103_205012.jsonl
gs://ai-tag-intelligence/deep-research-02/batch-outputs/20251103_205012/


### Upload JSONL to GCS

In [23]:
jsonl_content = '\n'.join([json.dumps(req) for req in batch_requests])
blob = bucket.blob(f'{ITERATION_NAME}/batch-inputs/{input_filename}')
blob.upload_from_string(jsonl_content, content_type='application/jsonl')

print(f"Uploaded batch input to: {input_gcs_path}")

Uploaded batch input to: gs://ai-tag-intelligence/deep-research-02/batch-inputs/batch_input_20251103_205012.jsonl


Peek at [GCS Folder](https://console.cloud.google.com/storage/browser/ai-tag-intelligence/deep-research-01/batch-inputs?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&project=pointfive-genai)

## Create batch job using Vertex AI

**NOTE: the Vertex REST API must be used in order to use the caching option!**

In [24]:
import requests
import subprocess
import json

# Get authentication token
token = subprocess.check_output(["gcloud", "auth", "print-access-token"]).decode().strip()

# Get project ID and region from environment
project_id = os.getenv("ANTHROPIC_VERTEX_PROJECT_ID")
region = os.getenv("CLOUD_ML_REGION")

# Construct the REST API endpoint
api_endpoint = f"https://{region}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{region}/batchPredictionJobs"

# Create the request body for batch prediction job
request_body = {
    "displayName": f"genai_batch_job_{timestamp}",
    "model": "publishers/anthropic/models/claude-haiku-4-5@20251001",
    "inputConfig": {
        "instancesFormat": "jsonl",
        "gcsSource": {
            "uris": [input_gcs_path]
        }
    },
    "outputConfig": {
        "predictionsFormat": "jsonl",
        "gcsDestination": {
            "outputUriPrefix": output_gcs_prefix
        }
    }
}

# Make the REST API call to create the batch job
headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}

print(f"Creating batch job via REST API...")
print(f"Endpoint: {api_endpoint}")
print(f"Input: {input_gcs_path}")
print(f"Output: {output_gcs_prefix}")

response = requests.post(api_endpoint, headers=headers, json=request_body)

if response.status_code == 200:
    job_response = response.json()
    job_name = job_response.get("name")
    job_state = job_response.get("state")
    
    print(f"Batch job created successfully!")
    print(f"Job name: {job_name}")
    print(f"Job state: {job_state}")
    print(f"Output location: {output_gcs_prefix}")
    
    # Save job name to file
    with open(CONSOLIDATOR_BATCH_ID_FILE, 'w') as f:
        f.write(job_name)
    
    print(f"Job name saved to {CONSOLIDATOR_BATCH_ID_FILE}")
else:
    print(f"Error creating batch job: {response.status_code}")
    print(f"Response: {response.text}")
    raise Exception(f"Failed to create batch job: {response.text}")

Creating batch job via REST API...
Endpoint: https://us-east5-aiplatform.googleapis.com/v1/projects/pointfive-genai/locations/us-east5/batchPredictionJobs
Input: gs://ai-tag-intelligence/deep-research-02/batch-inputs/batch_input_20251103_205012.jsonl
Output: gs://ai-tag-intelligence/deep-research-02/batch-outputs/20251103_205012/
Batch job created successfully!
Job name: projects/1053914441280/locations/us-east5/batchPredictionJobs/5620802397237411840
Job state: JOB_STATE_PENDING
Output location: gs://ai-tag-intelligence/deep-research-02/batch-outputs/20251103_205012/
Job name saved to /home/izzy/repos/x-large-files/x-large-files-p5-tag-intelligence/deep-research-02/consolidator-batch-id.txt


## Monitor batch job status

In [29]:
import subprocess
import requests

# Load job name from file
with open(CONSOLIDATOR_BATCH_ID_FILE, 'r') as f:
    job_name = f.read().strip()

print(f"Loaded job name: {job_name}")

# Get authentication token
token = subprocess.check_output(["gcloud", "auth", "print-access-token"]).decode().strip()

# Construct the REST API endpoint for getting job status
project_id = os.getenv("ANTHROPIC_VERTEX_PROJECT_ID")
region = os.getenv("CLOUD_ML_REGION")
api_endpoint = f"https://{region}-aiplatform.googleapis.com/v1/{job_name}"

# Make the REST API call to get job status
headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}

response = requests.get(api_endpoint, headers=headers)

if response.status_code == 200:
    job = response.json()
    print(f"Job status retrieved successfully")
    print(f"State: {job.get('state')}")
else:
    print(f"Error getting job status: {response.status_code}")
    print(f"Response: {response.text}")

Loaded job name: projects/1053914441280/locations/us-east5/batchPredictionJobs/5620802397237411840
Job status retrieved successfully
State: JOB_STATE_SUCCEEDED
