In [None]:
import os
import email
from email import policy
import requests
from dotenv import load_dotenv
from azure.identity.aio import ClientSecretCredential
from azure.identity import InteractiveBrowserCredential,DeviceCodeCredential
from bs4 import BeautifulSoup
import json
import pickle

In [13]:
import os
import glob
from dotenv import load_dotenv
from langchain_community.document_loaders import (
    DirectoryLoader, 
    UnstructuredHTMLLoader,
    PyPDFLoader,
    Docx2txtLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_openai import AzureChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import httpx

In [14]:
import asyncio
from msgraph import GraphServiceClient
from pathlib import Path

In [15]:
# Load settings from the local .env file. With override=True, values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

True

In [16]:
# Credentials / identity for the AutoX proxy are read from the environment (.env).
AUTOX_API_KEY  = os.getenv("AUTOX_API_KEY")
NTNET_USERNAME = (os.getenv("NTNET_USERNAME") or "").strip()

# Corporate CA bundle used to verify TLS. Defaults to the location this notebook
# always used; set AUTOX_CA_BUNDLE to point elsewhere without editing the cell.
CA_BUNDLE_PATH = os.getenv("AUTOX_CA_BUNDLE", r"C:\amdcerts.pem")

# Set the proxy bypass BEFORE creating HTTP clients.
# Whatever is already configured (under either spelling) is merged with the hosts
# this notebook needs; blanks and duplicates are dropped so that re-running the
# cell does not keep appending the same entries.
_existing_no_proxy = [
    host.strip()
    for var_name in ("NO_PROXY", "no_proxy")
    for host in os.getenv(var_name, "").split(",")
]
_required_no_proxy = [
    ".autox.corp.amdocs.azr",
    "chat.autox.corp.amdocs.azr",
    "localhost",
    "127.0.0.1",
]
os.environ["NO_PROXY"] = ",".join(
    # dict.fromkeys de-duplicates while preserving first-seen order.
    dict.fromkeys(host for host in _existing_no_proxy + _required_no_proxy if host)
)
os.environ["no_proxy"] = os.environ["NO_PROXY"]

from langchain_openai import AzureChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
import httpx

# Synchronous HTTP client that trusts the corporate certificates.
http_client = httpx.Client(
    verify=CA_BUNDLE_PATH,
    timeout=30.0
)

# Async counterpart with the same TLS and timeout settings.
async_http_client = httpx.AsyncClient(
    verify=CA_BUNDLE_PATH,
    timeout=30.0
)

# Chat model routed through the AutoX proxy, using the custom HTTP clients above.
llm = AzureChatOpenAI(
    azure_endpoint="https://chat.autox.corp.amdocs.azr/api/v1/proxy",
    api_key=AUTOX_API_KEY,
    azure_deployment="gpt-4o-128k",
    model="gpt-4o-128k",
    temperature=0.1,
    openai_api_version="2024-08-01-preview",
    default_headers={"username": NTNET_USERNAME, "application": "testing-proxyapi"},
    http_client=http_client,
    http_async_client=async_http_client
)

In [4]:
# Azure AD app-registration settings, read from the environment (None when unset).
AZURE_TENANT_ID = os.environ.get("AZURE_TENANT_ID")
AZURE_CLIENT_ID = os.environ.get("AZURE_CLIENT_ID")
# Note the environment variable is named AZURE_CLIENT_SECRET_VALUE.
AZURE_CLIENT_SECRET = os.environ.get("AZURE_CLIENT_SECRET_VALUE")

In [5]:
# Scope requested from Microsoft Graph.
scopes = ['https://graph.microsoft.com/.default']

# Device-code sign-in: the user is asked to open a login page in a browser and
# enter a one-time code, so no client secret is passed here.
credential = DeviceCodeCredential(client_id=AZURE_CLIENT_ID, tenant_id=AZURE_TENANT_ID)

# Graph client that authenticates with the credential above.
graph_client = GraphServiceClient(credentials=credential, scopes=scopes)

In [None]:
# try:
#     print("Testing connection...")
    
#     # This will work because you're authenticated as yourself
#     user = await graph_client.me.get()
#     print(f"✓ Connected as: {user.display_name}")
#     print(f"  Email: {user.mail or user.user_principal_name}")
    
#     # Access YOUR OneNote notebooks directly
#     notebooks = await graph_client.me.onenote.notebooks.get()
#     if notebooks.value:
#         print(f"  Found {len(notebooks.value)} notebooks:")
#         for nb in notebooks.value:
#             print(f"    - {nb.display_name}")
#     else:
#         print("  No notebooks found")
        
# except Exception as e:
#     print(f"✗ Connection failed: {e}")
#     raise

Testing connection...
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code L6SQM7MRS to authenticate.


In [27]:
from typing import Dict
def extract_and_describe_images(mht_path: str) -> Dict[str, Dict]:
    """
    Collect every image part of an MHT file together with a text description.

    The file is parsed as a MIME message. Each part whose content type starts
    with "image/" is stored under its Content-ID (angle brackets removed); when
    that header is missing, Content-Location is used, and failing that a
    generated "image_<n>" key. Descriptions come from describe_image() when the
    module-level `llm` is truthy, otherwise a fixed placeholder is stored.

    Args:
        mht_path: Path of the MHT file to read.

    Returns:
        Dict mapping each image key to a dict with 'data' (decoded payload),
        'content_type', 'description' and 'size' (payload length).
    """
    with open(mht_path, 'rb') as mht_file:
        message = email.message_from_binary_file(mht_file, policy=policy.default)

    described: Dict[str, Dict] = {}

    # Lazily pick out the image parts; numbering starts at 1 for the progress log.
    image_parts = (
        candidate for candidate in message.walk()
        if candidate.get_content_type().startswith("image/")
    )
    for position, part in enumerate(image_parts, start=1):
        mime_type = part.get_content_type()

        content_id = part.get('Content-ID', '')
        if not content_id:
            # Use Content-Location as fallback
            key = part.get('Content-Location', f'image_{len(described)}')
        else:
            key = content_id.strip('<>')

        payload = part.get_payload(decode=True)

        if not llm:
            summary = "[Image - description unavailable without Azure API]"
        else:
            print(f"  Analyzing image {position}...")
            summary = describe_image(payload, mime_type)

        described[key] = {
            'data': payload,
            'content_type': mime_type,
            'description': summary,
            'size': len(payload)
        }

    return described


In [30]:
import base64

def describe_image(image_data: bytes, content_type: str) -> str:
    """
    Ask the module-level `llm` for a textual description of one image.

    The image is sent inline as a base64 data URL next to a fixed prompt.

    Args:
        image_data: Raw bytes of the image.
        content_type: MIME type placed in the data URL (e.g. "image/png").

    Returns:
        The `content` of the model response. If anything fails (encoding or
        the model call), the error is printed and a bracketed placeholder
        holding the first 100 characters of the error message is returned.
    """
    prompt_text = (
        "Describe this image in detail for personal note-taking purposes. Include:\n"
        "- Any text visible in the image\n"
        "- Diagrams, charts, or visual elements\n"
        "- Key concepts or information shown\n"
        "- Any annotations or highlights\n"
        "Be concise but thorough."
    )

    try:
        # Embed the image directly in the request as a data URL.
        encoded = base64.b64encode(image_data).decode('utf-8')
        data_url = f"data:{content_type};base64,{encoded}"

        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": {"url": data_url}},
            ],
        }

        reply = llm.invoke([user_message])
        return reply.content

    except Exception as err:
        # Best effort: one failing image must not abort the whole extraction.
        print(f"    ⚠️  Error describing image: {err}")
        return f"[Image - error getting description: {str(err)[:100]}]"

In [31]:
# MHT export to index, given relative to the current working directory.
knowledge_base_path = "knowledge-base/amdocsKnowledgeBase/Company.mht"
absolute_path = os.path.abspath(knowledge_base_path)
# Runs one LLM call per image in the file, so this cell is slow; the result is
# pickled to a cache file further down the notebook.
images = extract_and_describe_images(absolute_path)

  Analyzing image 1...
  Analyzing image 2...
  Analyzing image 3...
  Analyzing image 4...
  Analyzing image 5...
  Analyzing image 6...
  Analyzing image 7...
  Analyzing image 8...
  Analyzing image 9...
  Analyzing image 10...
  Analyzing image 11...
  Analyzing image 12...
  Analyzing image 13...
  Analyzing image 14...
  Analyzing image 15...
  Analyzing image 16...
  Analyzing image 17...
  Analyzing image 18...
  Analyzing image 19...
  Analyzing image 20...
  Analyzing image 21...
  Analyzing image 22...
  Analyzing image 23...
  Analyzing image 24...
  Analyzing image 25...
  Analyzing image 26...
  Analyzing image 27...
  Analyzing image 28...
  Analyzing image 29...
  Analyzing image 30...
  Analyzing image 31...
  Analyzing image 32...
  Analyzing image 33...
  Analyzing image 34...
  Analyzing image 35...
  Analyzing image 36...
  Analyzing image 37...
  Analyzing image 38...
  Analyzing image 39...
  Analyzing image 40...
  Analyzing image 41...
  Analyzing image 42...
 

KeyError: '1'

In [33]:
import pickle
# Persist the extracted images and descriptions so the LLM calls need not be repeated.
# This unconditionally overwrites any existing cache file.
with open('knowledge-base/amdocsKnowledgeBase/images_cache.pkl', 'wb') as f:
    pickle.dump(images, f)
print(f"Saved {len(images)} images to cache")

Saved 150 images to cache


In [None]:

cache_file = 'knowledge-base/amdocsKnowledgeBase/images_cache.pkl'

# Build the image descriptions once and reuse them from the pickle afterwards.
# NOTE(security): unpickling can execute arbitrary code, so only load a cache
# file that this notebook itself produced.
if not os.path.exists(cache_file):
    print("No cache found. Processing images...")
    images = extract_and_describe_images(absolute_path)

    # Store the fresh result for the next run.
    with open(cache_file, 'wb') as f:
        pickle.dump(images, f)
    print(f"Processed and cached {len(images)} images")
else:
    print("Loading images from cache...")
    with open(cache_file, 'rb') as f:
        images = pickle.load(f)
    print(f"Loaded {len(images)} images from cache")

In [36]:
# Spot-check one generated description. The key is the image's location as
# recorded in this particular MHT export, so it will differ for other files.
print(images["file:///C:/A5099239/Company_files/image002.png"]["description"])

### Description of the Image:

#### **Section 1: Tunneling Diagram**
- **Title:** "Tunneling"
- **Visual Elements:**
  - A diagram illustrating the process of port forwarding.
  - Three options at the top: 
    - "Local port forwarding" (selected)
    - "Remote port forwarding"
    - "Dynamic port forwarding (SOCKS proxy)"
  - Left side: "My computer with MobaXterm" connected to "Local clients."
  - Middle: A firewall with a flame icon, showing port 6432.
  - Right side: "Remote server" with a lock icon and SSH server.
  - Arrows indicate the flow of data from local clients to the remote server via port 6432.
- **Text in Diagram:**
  - "svod-nehash-postgre" and "6432" near the remote server.
  - "100.66.122.185" and "22" near the SSH server.
  - "Local clients can access the remote server by connecting to <mycomputer>:6432."
- **Buttons:** "Save" and "Cancel" at the bottom.

#### **Section 2: Configuration Details**
- **Text:**
  - "Port: 6432"
  - "Consul ip: 100.66.122.185"
  - "Remo

In [51]:
from typing import List,Any
def _find_image_key(img_src, images):
    """Return the key of ``images`` that an ``<img src>`` value refers to, or None.

    Candidates are tried from most to least specific: the src itself, the src
    without a ``cid:`` prefix, a key whose last path segment equals the src's
    file name, and finally the plain substring test used originally.
    """
    if not img_src:
        return None
    if img_src in images:
        return img_src
    if img_src.startswith('cid:') and img_src[4:] in images:
        return img_src[4:]

    filename = img_src.split('/')[-1]
    if not filename:
        # An empty name is a substring of every key and would match anything.
        return None

    for key in images:
        if key.split('/')[-1] == filename:
            return key
    # Last resort, kept for backward compatibility with the original matching.
    for key in images:
        if filename in key:
            return key
    return None


def create_enriched_chunks(mht_path: str, images: Dict[str, Dict]) -> List[Dict[str, Any]]:
    """Split the HTML of an MHT file into chunks enriched with image and link info.

    Every <div>, <section> and <article> element with non-empty text, in each
    text/html part of the file, becomes one chunk. The chunk text is the element
    text followed by the descriptions of the images and the links found inside it.

    NOTE(review): find_all also returns nested elements, so text inside nested
    <div>s ends up in more than one chunk; confirm this is intended.

    Args:
        mht_path: Path of the MHT file to read.
        images: Image mapping as produced by extract_and_describe_images();
            each value must provide a 'description' entry.

    Returns:
        A list of dicts with 'text' (enriched text), 'raw_text' (element text
        only) and 'metadata' (source, chunk_id, images, links, has_images,
        has_links). chunk_id is the element's index within its HTML part.
    """
    chunks = []
    with open(mht_path, 'rb') as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    for part in msg.walk():
        if part.get_content_type() != "text/html":
            continue

        soup = BeautifulSoup(part.get_content(), 'html.parser')

        # The HTML element is named 'section'; 'sections' never matched anything.
        sections = soup.find_all(['div', 'section', 'article'])
        for idx, section in enumerate(sections):
            text = section.get_text(separator='\n', strip=True)
            if not text.strip():
                continue

            # Images: resolve each <img> independently so an image without a
            # usable src is skipped instead of inheriting the previous match.
            section_images = []
            for img in section.find_all('img'):
                matching_key = _find_image_key(img.get('src', ''), images)
                if matching_key is None:
                    continue
                section_images.append({
                    'cid': matching_key,
                    'alt': img.get('alt', ''),
                    'description': images[matching_key]['description']
                })

            # Links: keep external targets only, cid: links point at embedded parts.
            section_links = []
            for a_tag in section.find_all('a', href=True):
                link_url = a_tag['href']
                if link_url and not link_url.startswith('cid:'):
                    section_links.append({
                        'text': a_tag.get_text(strip=True),
                        'url': link_url,
                        'title': a_tag.get('title', '')
                    })

            enriched_text = f"Content: {text}\n\n"

            if section_images:
                enriched_text += "Images in this section:\n"
                for img_info in section_images:
                    # Show just the file name ("image002.png"), not the full location.
                    filename = img_info['cid'].split('/')[-1]

                    enriched_text += f"- Image: {filename}\n"
                    enriched_text += f"  Description: {img_info['description']}\n"
                    if img_info['alt']:
                        enriched_text += f"  Alt text: {img_info['alt']}\n"
                enriched_text += "\n"

            if section_links:
                enriched_text += "Links in this section:\n"
                for link_info in section_links:
                    enriched_text += f"- Link: {link_info['text']}\n"
                    enriched_text += f"  URL: {link_info['url']}\n"
                    if link_info['title']:
                        enriched_text += f"  Title: {link_info['title']}\n"

            chunks.append({
                'text': enriched_text,
                'raw_text': text,
                'metadata': {
                    'source': str(mht_path),
                    'chunk_id': idx,
                    'images': section_images,
                    'links': section_links,
                    'has_images': len(section_images) > 0,
                    'has_links': len(section_links) > 0
                }
            })
    return chunks
                    

In [39]:
# Sentence-embedding model loaded from a local directory and run on the CPU.
# normalize_embeddings is explicitly turned off in encode_kwargs.
embeddings = HuggingFaceEmbeddings(
    model_name="C:/AgenticAI/all-MiniLM-L6-v2",  # Local model directory (machine-specific absolute path)
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False}
)

In [54]:
# Build the enriched chunks from the MHT file and preview the first one.
# Indexing [0] raises IndexError when no chunk was produced.
chunks = create_enriched_chunks(absolute_path, images)
print(chunks[0])

{'text': 'Content: GA To GA\nThursday,\nJune 5, 2025\n12:29\nPM\nPlatform release\nversion 25.03:\nPlatform\nRelease 25.03 - Open Network Platform - Confluence AT - Production\nBlueIngressHost:\nportal-ga-vishnuna.oso.corp.amdocs.aws\n\nLinks in this section:\n- Link: Platform\nRelease 25.03 - Open Network Platform - Confluence AT - Production\n  URL: https://confluence/display/ONPONE/Platform+Release+25.03\n', 'raw_text': 'GA To GA\nThursday,\nJune 5, 2025\n12:29\nPM\nPlatform release\nversion 25.03:\nPlatform\nRelease 25.03 - Open Network Platform - Confluence AT - Production\nBlueIngressHost:\nportal-ga-vishnuna.oso.corp.amdocs.aws', 'metadata': {'source': 'c:\\AgenticAI\\llm_engineering\\week5\\knowledge-base\\amdocsKnowledgeBase\\Company.mht', 'chunk_id': 0, 'images': [], 'links': [{'text': 'Platform\nRelease 25.03 - Open Network Platform - Confluence AT - Production', 'url': 'https://confluence/display/ONPONE/Platform+Release+25.03', 'title': ''}], 'has_images': False, 'has_links

In [45]:
# Directory where the Chroma vector store is persisted.
db_name = 'knowledge-base/amdocsKnowledgeBase/knowledge_base_db'
# Start from a clean slate: if a store already exists, drop its collection so
# the documents are not added a second time when the notebook is re-run.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [55]:
from langchain.schema import Document
def add_to_vector_store(chunks: List[Dict[str, Any]]):
    """Turn enriched chunks into ``Document`` objects ready for a vector store.

    Despite its name, this function does not write to any store; it prints a
    progress line and returns the documents for the caller to index.

    Args:
        chunks: Dicts with 'text', 'raw_text' and a 'metadata' dict holding
            'source', 'chunk_id', 'has_images', 'has_links', 'images', 'links'.

    Returns:
        One Document per chunk. The enriched text is the page content; the
        image and link lists are stored in the metadata as JSON strings.
    """
    print(f"Adding {len(chunks)} chunks to vector store...")

    def _to_document(chunk):
        """Build the Document for a single chunk."""
        body = chunk['text']
        meta = chunk['metadata']
        return Document(
            page_content=body,
            metadata={
                'source': meta['source'],
                'chunk_id': meta['chunk_id'],
                'has_images': meta['has_images'],
                'has_links': meta['has_links'],
                'num_images': len(meta['images']),
                'num_links': len(meta['links']),
                # Lists of dicts are serialised to JSON strings.
                'links_json': json.dumps(meta['links']),
                'images_json': json.dumps(meta['images']),
                'raw_text': chunk['raw_text'],
            },
        )

    return [_to_document(chunk) for chunk in chunks]

In [56]:
# Build the Chroma store from the chunk documents using the embedding model
# above, persisting it under db_name.
vectorstore = Chroma.from_documents(
    documents=add_to_vector_store(chunks),
    embedding=embeddings,
    persist_directory=db_name
)

Adding 330 chunks to vector store...


In [57]:
# Sanity check: how many vectors were stored and how long each one is.
# NOTE: _collection is a private attribute of the LangChain Chroma wrapper.
collection = vectorstore._collection
count = collection.count()

# Fetch a single stored embedding just to read its dimensionality.
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 330 vectors with 384 dimensions in the vector store


In [66]:
# Conversation memory for the chain, stored under the 'chat_history' key as message objects.
# Re-run this cell to start a fresh conversation.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

In [74]:
from langchain_core.callbacks import StdOutCallbackHandler
# Retrieve the 40 most similar chunks for every question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 40})

# Putting it together: the conversation chain combines the Azure OpenAI chat model
# configured above (`llm`), the vector store retriever and the conversation memory.
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [69]:
# Smoke test: ask one question through the chain and print the answer.
# The chain was built with `memory`, so this exchange becomes part of the chat history.
query = "What are the steps to setup AWS 100K env"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

To set up an AWS 100K environment, follow these steps based on the provided context:

---

### **Step 1: EKS Cluster Setup**
1. **Create an EKS Cluster:**
   - Use the Jenkins pipeline for EKS setup:
     - [EKS Pipeline](https://jenkins-devops.neo.corp.amdocs.aws/job/DevOps/job/eks-poc-multibranch/job/master/)
   - Ensure the cluster name follows the format `clustername-username`.

2. **Parameters for EKS Cluster:**
   - Set the following parameters:
     - `eks_clustername`: Name of the cluster.
     - `aws_region`: AWS region (e.g., `eu-west-1`).
     - `default_no_of_nodes`: 8 nodes.
     - `kube_version`: 1.29.
     - `node_instance_type`: `m6i.4xlarge`.
     - `spot`: Set to `false` for on-demand nodes.

3. **Verify Auto Scaling Groups:**
   - Ensure the cluster has 3 auto-scaling groups created.

---

### **Step 2: MS Automation**
1. **Deploy Middleware Services:**
   - Use the Jenkins pipeline for MS automation:
     - [MS Automation Pipeline](https://jenkins-devops.neo.corp.am

In [75]:
import gradio as gr
def chat(question, history):
    """Chat callback: answer `question` through the conversation chain.

    `history` is accepted because the chat UI passes it, but it is not used
    here; the chain is invoked with the question only.

    Returns:
        The 'answer' entry of the chain's result.
    """
    chain_output = conversation_chain.invoke({"question": question})
    return chain_output["answer"]

In [76]:
# Start the chat UI and open it in the default browser.
# NOTE(review): `view` holds whatever launch() returns, not the ChatInterface itself — confirm that is intended.
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.
