In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import json
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
import os
import uuid
import openai
import pinecone
from tqdm import tqdm
from typing import List, Dict, Any
from dotenv import load_dotenv


  from pandas.core import (


In [4]:
def extract_links(article_text):
    link_pattern = r'(https?://[^\s]+|www\.[^\s]+\.[^\s]+)'
    links = re.finditer(link_pattern, article_text)
    links_dict = {}
    clean_text = article_text
    offset = 0
    for i, match in enumerate(links):
        link = match.group(0)
        start = match.start() - offset
        end = match.end() - offset
        links_dict[i] = {
            'link': link,
            'original_position': match.start(),
            'length': len(link)
        }
        clean_text = clean_text[:start] + clean_text[end:]
        offset += (end - start)
    
    return links_dict, clean_text

sample_article = """
Check out this awesome tutorial at https://www.example.com/tutorial. 
For more details, visit www.example.org/details and read 
the documentation at https://docs.example.net/guide.
"""

links_dict, clean_text = extract_links(sample_article)

print("Extracted Links:")
for idx, link_info in links_dict.items():
    print(f"{idx}: {link_info['link']} (appeared at position {link_info['original_position']})")

print("\nClean Text:")
print(clean_text)

Extracted Links:
0: https://www.example.com/tutorial. (appeared at position 36)
1: www.example.org/details (appeared at position 95)
2: https://docs.example.net/guide. (appeared at position 150)

Clean Text:

Check out this awesome tutorial at  
For more details, visit  and read 
the documentation at 



In [5]:
def setup_webdriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless") 
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
    
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        return driver
    except Exception as e:
        print(f"Error setting up WebDriver: {e}")
        return None


def scrape_url_content(driver, url):
    if url.startswith('www.'):
        url = 'https://' + url
    
    result = {
        'url': url,
        'title': '',
        'text_content': '',
        'meta_description': '',
        'status': 'failed',
        'error': None
    }
    
    try:
        driver.set_page_load_timeout(30)
        driver.get(url)
        
        # Wait for page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)
        try:
            result['title'] = driver.title
        except:
            result['title'] = 'No title found'
        
        try:
            meta_desc = driver.find_element(By.CSS_SELECTOR, "meta[name='description']")
            result['meta_description'] = meta_desc.get_attribute("content")
        except:
            result['meta_description'] = 'No meta description found'

        main_content = ''
        content_selectors = [
            "article", "main", ".content", "#content", ".post-content", 
            ".article-content", ".entry-content", "#main-content"
        ]
        
        for selector in content_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    for elem in elements:
                        main_content += elem.text + "\n\n"
                    break
            except:
                continue
    
        if not main_content.strip():
            try:
                main_content = driver.find_element(By.TAG_NAME, "body").text
            except:
                main_content = 'Failed to extract content'
        
        result['text_content'] = main_content.strip()
        result['status'] = 'success'
        
        return result
        
    except TimeoutException:
        result['error'] = 'Page load timeout'
        return result
    except WebDriverException as e:
        result['error'] = f'WebDriver error: {str(e)}'
        return result
    except Exception as e:
        result['error'] = f'Unexpected error: {str(e)}'
        return result


def scrape_links_and_save(links_dict, output_file='scraped_content.json'):
    result = {
        'links': links_dict,
        'scraped_content': {}
    }
    
    driver = setup_webdriver()
    if not driver:
        print("Failed to set up WebDriver. Exiting.")
        return result
    
    try:
        for idx, link_info in links_dict.items():
            url = link_info['link']
            print(f"Scraping {url}...")
            scraped_data = scrape_url_content(driver, url)
            result['scraped_content'][idx] = scraped_data
            time.sleep(2)
        
        # Save results to JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        
        print(f"Results saved to {output_file}")
        
    finally:
        if driver:
            driver.quit()
    
    return result


    # Example usage
sample_article = """https://en.wikipedia.org/wiki/Data_science"""

links_dict, clean_text = extract_links(sample_article)
results = scrape_links_and_save(links_dict, 'scraped_content_1.json')

# Print summary
print("\nScraping Summary:")
print(f"Found {len(results['links'])} links in the article")
print(f"Successfully scraped {sum(1 for item in results['scraped_content'].values() if item['status'] == 'success')} links")
print(f"Failed to scrape {sum(1 for item in results['scraped_content'].values() if item['status'] == 'failed')} links")

Error setting up WebDriver: Could not reach host. Are you offline?
Failed to set up WebDriver. Exiting.

Scraping Summary:
Found 1 links in the article
Successfully scraped 0 links
Failed to scrape 0 links


In [6]:

def scrape_url_link(driver, url):
    if url.startswith('www.'):
        url = 'https://' + url
    
    result = {
        'url': url,
        'title': '',
        'text_content': '',
        'meta_description': '',
        'links': [],  # New field to store all links found on the page
        'status': 'failed',
        'error': None
    }
    
    try:
        # Set page load timeout to 30 seconds
        driver.set_page_load_timeout(30)
        driver.get(url)
        
        # Wait for page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)
        try:
            result['title'] = driver.title
        except:
            result['title'] = 'No title found'
        try:
            meta_desc = driver.find_element(By.CSS_SELECTOR, "meta[name='description']")
            result['meta_description'] = meta_desc.get_attribute("content")
        except:
            result['meta_description'] = 'No meta description found'
        try:
            link_elements = driver.find_elements(By.TAG_NAME, "a")
            extracted_links = []
            for link in link_elements:
                href = link.get_attribute("href")
                text = link.text.strip()
                
                if href and href.startswith(('http://', 'https://', 'www.')):
                    extracted_links.append({
                        'url': href,
                        'text': text if text else 'No link text',
                        'title': link.get_attribute("title") or ''
                    })
            
            result['links'] = extracted_links
        except Exception as e:
            print(f"Error extracting links: {e}")
            result['links'] = []
        
        # Extract main content - multiple strategies
        # Strategy 1: Try to find main content element
        main_content = ''
        content_selectors = [
            "article", "main", ".content", "#content", ".post-content", 
            ".article-content", ".entry-content", "#main-content"
        ]
        
        for selector in content_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    for elem in elements:
                        main_content += elem.text + "\n\n"
                    break
            except:
                continue
        
        # Strategy 2: If no main content found, get the body text
        if not main_content.strip():
            try:
                main_content = driver.find_element(By.TAG_NAME, "body").text
            except:
                main_content = 'Failed to extract content'
        
        result['text_content'] = main_content.strip()
        result['status'] = 'success'
        
        return result
        
    except TimeoutException:
        result['error'] = 'Page load timeout'
        return result
    except WebDriverException as e:
        result['error'] = f'WebDriver error: {str(e)}'
        return result
    except Exception as e:
        result['error'] = f'Unexpected error: {str(e)}'
        return result


def scrape_links_and_save(article_text, output_file='scraped_content.json'):
    links_dict, clean_text = extract_links(article_text)
    result = {
        'original_text': article_text,
        'clean_text': clean_text,
        'links': links_dict,
        'scraped_content': {}
    }

    driver = setup_webdriver()
    if not driver:
        print("Failed to set up WebDriver. Exiting.")
        return result
    
    try:
        for idx, link_info in links_dict.items():
            url = link_info['link']
            print(f"Scraping {url}...")
            
            # Scrape the URL
            scraped_data = scrape_url_link(driver, url)
            
            # Add scraped content to results
            result['scraped_content'][idx] = scraped_data
            
            # Add a short delay between requests to be polite
            time.sleep(2)
        
        # Save results to JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        
        print(f"Results saved to {output_file}")
        
    finally:
        # Always close the WebDriver
        if driver:
            driver.quit()
    
    return result



sample_article = """https://en.wikipedia.org/wiki/Data_science"""

# Scrape the links and save results
results = scrape_links_and_save(sample_article, 'scraped_content.json')


Error setting up WebDriver: Could not reach host. Are you offline?
Failed to set up WebDriver. Exiting.


In [7]:
import os
import re
import json
import time
import uuid
import pinecone
import openai
from typing import List
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

load_dotenv()
openai.api_key = ""
PINECONE_API_KEY = ""
PINECONE_ENVIRONMENT = "us-east1-gcp"
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME") or "scraped-content"

# Model parameters
EMBEDDING_MODEL = "text-embedding-3-small"  
EMBEDDING_DIMENSION = 1536  
CHUNK_SIZE = 1000  
CHUNK_OVERLAP = 200  
MAX_CHUNKS_PER_BATCH = 100  



def initialize_pinecone():
    """Initialize Pinecone client and ensure index exists."""
    try:
        # Initialize Pinecone client
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

        # Check if index already exists
        existing_indexes = pc.list_indexes().names()

        if PINECONE_INDEX_NAME not in existing_indexes:
            print(f"Creating Pinecone index: {PINECONE_INDEX_NAME}")
            pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",       # Specify cloud provider (e.g., 'aws' or 'gcp')
                    region="us-east-1" # Specify region (e.g., 'us-east-1' for AWS)
                )
            )
            
            # Poll until index is ready
            max_attempts = 12
            attempt = 0
            while attempt < max_attempts:
                try:
                    index = pc.Index(PINECONE_INDEX_NAME)
                    index.describe_index_stats()  # Test if index is accessible
                    print(f"Index {PINECONE_INDEX_NAME} is ready.")
                    break
                except Exception as e:
                    attempt += 1
                    if attempt == max_attempts:
                        raise Exception(f"Index {PINECONE_INDEX_NAME} not ready after {max_attempts * 5} seconds.")
                    print(f"Waiting for index to be ready... ({attempt}/{max_attempts})")
                    time.sleep(5)
        
        else:
            print(f"Index {PINECONE_INDEX_NAME} already exists.")

        # Return the index object
        return pc.Index(PINECONE_INDEX_NAME)

    except Exception as e:
        raise Exception(f"Failed to initialize Pinecone: {str(e)}")

def clean_and_chunk_text(text: str, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Clean and split text into chunks of specified size with overlap"""
    if not text or not isinstance(text, str):
        return []
    text = re.sub(r'\s+', ' ', text).strip()
    
    chunks = []
    start = 0
    text_length = len(text)
    
    while start < text_length:
        end = start + chunk_size
        if end < text_length:
            if text[end-1] not in ".,!? ":
                last_period = text.rfind('.', start, end)
                last_space = text.rfind(' ', start, end)
                if last_period > start + (chunk_size // 2):
                    end = last_period + 1
                elif last_space > start + (chunk_size // 2):
                    end = last_space + 1
        
        chunk = text[start:end].strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)
        start = end - chunk_overlap
    return chunks

def get_embeddings(chunks: List[str]) -> List[List[float]]:
    """Get embeddings for chunks using OpenAI's embedding API"""
    embeddings = []
    for i in range(0, len(chunks), MAX_CHUNKS_PER_BATCH):
        batch = chunks[i:i + MAX_CHUNKS_PER_BATCH]
        try:
            response = openai.embeddings.create(
                model=EMBEDDING_MODEL,
                input=batch
            )
            batch_embeddings = [item.embedding for item in response.data]
            embeddings.extend(batch_embeddings)
            if i + MAX_CHUNKS_PER_BATCH < len(chunks):
                time.sleep(0.5) 
                
        except Exception as e:
            print(f"Error getting embeddings: {e}")
            embeddings.extend([[0] * EMBEDDING_DIMENSION] * len(batch))
    
    return embeddings


def process_json_file(file_path: str):
    """Process scraped content JSON file and ingest into Pinecone"""
    print(f"Loading scraped content from {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Initialize Pinecone
    index = initialize_pinecone()
    
    # Extract and process original article
    print("Processing original article...")
    article_text = data.get('original_text', '')
    article_chunks = clean_and_chunk_text(article_text)
    
    all_chunks = []
    all_metadata = []
    
    # Process original article chunks
    for i, chunk in enumerate(article_chunks):
        chunk_id = f"original_article_{i}"
        all_chunks.append(chunk)
        all_metadata.append({
            "id": chunk_id,
            "type": "original_article",
            "chunk_index": i,
            "total_chunks": len(article_chunks),
            "source": "original_article"
        })
    
    scraped_content = data.get('scraped_content', {})
    print(f"Processing {len(scraped_content)} scraped pages...")
    
    for page_id, page_data in scraped_content.items():
        if page_data.get('status') != 'success':
            continue
        
        # Get page content
        url = page_data.get('url', '')
        title = page_data.get('title', '')
        text_content = page_data.get('text_content', '')
        
        # Skip if no substantial content
        if len(text_content) < 50:
            continue
        
        # Chunk page content
        page_chunks = clean_and_chunk_text(text_content)
        
        # Process page chunks
        for i, chunk in enumerate(page_chunks):
            chunk_id = f"page_{page_id}_chunk_{i}"
            all_chunks.append(chunk)
            all_metadata.append({
                "id": chunk_id,
                "type": "scraped_page",
                "url": url,
                "title": title,
                "chunk_index": i,
                "total_chunks": len(page_chunks),
                "page_id": page_id
            })
            
         
            if i == 0 and page_data.get('links'):  # Only store links with first chunk
                link_texts = []
                for link in page_data.get('links', []):
                    link_url = link.get('url', '')
                    link_text = link.get('text', '')
                    if link_url and link_text:
                        link_texts.append(f"{link_text}: {link_url}")
                
                if link_texts:
                    links_text = "Links found on page:\n" + "\n".join(link_texts)
                    links_chunks = clean_and_chunk_text(links_text)
                    
                    for j, link_chunk in enumerate(links_chunks):
                        chunk_id = f"page_{page_id}_links_{j}"
                        all_chunks.append(link_chunk)
                        all_metadata.append({
                            "id": chunk_id,
                            "type": "page_links",
                            "url": url,
                            "title": title + " - Links",
                            "chunk_index": j,
                            "total_chunks": len(links_chunks),
                            "page_id": page_id
                        })
    
    # Get embeddings for all chunks
    print(f"Generating embeddings for {len(all_chunks)} chunks...")
    embeddings = get_embeddings(all_chunks)
    
    # Prepare vectors for Pinecone
    vectors = []
    for i, (chunk, embedding, metadata) in enumerate(zip(all_chunks, embeddings, all_metadata)):
        vector_id = str(uuid.uuid4())
        vectors.append({
            "id": vector_id,
            "values": embedding,
            "metadata": {
                **metadata,
                "text": chunk[:1000],  # Store truncated text in metadata
                "timestamp": time.time()
            }
        })
        
        # Upsert in batches of 100
        if len(vectors) >= 100 or i == len(all_chunks) - 1:
            print(f"Upserting batch of {len(vectors)} vectors to Pinecone...")
            index.upsert(vectors=vectors)
            vectors = []
    
    print("Ingestion complete!")
    return len(all_chunks)

def query_similar_content(query_text: str, top_k: int = 5):
    """Query Pinecone for similar content based on the query text"""
    # Initialize Pinecone
    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(PINECONE_INDEX_NAME)
    
    # Get query embedding
    query_embedding = get_embeddings([query_text])[0]
    
    # Query Pinecone
    query_results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    
    # Process and return results
    results = []
    for match in query_results.matches:
        results.append({
            "score": match.score,
            "text": match.metadata.get("text", ""),
            "source": match.metadata.get("url", match.metadata.get("source", "")),
            "title": match.metadata.get("title", "")
        })
    
    return results


        # Ingest content

    

In [11]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.79.0-py3-none-any.whl.metadata (25 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.9.0-cp311-cp311-win_amd64.whl.metadata (5.3 kB)
Downloading openai-1.79.0-py3-none-any.whl (683 kB)
   ---------------------------------------- 0.0/683.3 kB ? eta -:--:--
   ---------------------------------------- 683.3/683.3 kB 5.4 MB/s eta 0:00:00
Using cached jiter-0.9.0-cp311-cp311-win_amd64.whl (210 kB)
Installing collected packages: jiter, openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.1
    Uninstalling openai-0.28.1:
      Successfully uninstalled openai-0.28.1
Successfully installed jiter-0.9.0 openai-1.79.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.1.14 requires langchain-core<0.3,>=0.2.2, but you have langchain-core 0.3.60 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
!conda install -c conda-forge openai

^C


In [8]:
import os
import json
import time
import uuid
import re
from typing import List
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI  # Use the updated OpenAI client

# Load environment variables
load_dotenv()

# Configuration
PINECONE_API_KEY = ""
PINECONE_ENVIRONMENT = "us-east1-gcp"
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME") or "scraped-content"

# Model parameters
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIMENSION = 1536
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
MAX_CHUNKS_PER_BATCH = 100

# Initialize OpenAI client
openai_client = OpenAI(api_key=OPENAI_API_KEY)

def get_embeddings(chunks: List[str]) -> List[List[float]]:
    """Get embeddings for chunks using OpenAI's embedding API."""
    embeddings = []
    for i in range(0, len(chunks), MAX_CHUNKS_PER_BATCH):
        batch = chunks[i:i + MAX_CHUNKS_PER_BATCH]
        try:
            response = openai_client.embeddings.create(
                model=EMBEDDING_MODEL,
                input=batch
            )
            batch_embeddings = [item.embedding for item in response.data]
            # Ensure embeddings are floats
            batch_embeddings = [[float(x) for x in embedding] for embedding in batch_embeddings]
            embeddings.extend(batch_embeddings)
            if i + MAX_CHUNKS_PER_BATCH < len(chunks):
                time.sleep(0.5)  # Avoid rate limits
        except Exception as e:
            print(f"Error getting embeddings: {e}")
            # Return zero embeddings for failed batch to maintain alignment
            embeddings.extend([[0.0] * EMBEDDING_DIMENSION] * len(batch))
    
    return embeddings

def process_json_file(file_path: str):
    """Process scraped content JSON file and ingest into Pinecone."""
    print(f"Loading scraped content from {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        raise Exception(f"Failed to load JSON file: {e}")

    # Initialize Pinecone
    index = initialize_pinecone()
    
    # Extract and process original article
    print("Processing original article...")
    article_text = data.get('original_text', '')
    article_chunks = clean_and_chunk_text(article_text)
    
    all_chunks = []
    all_metadata = []
    
    # Process original article chunks
    for i, chunk in enumerate(article_chunks):
        chunk_id = f"original_article_{i}"
        all_chunks.append(chunk)
        all_metadata.append({
            "id": chunk_id,
            "type": "original_article",
            "chunk_index": i,
            "total_chunks": len(article_chunks),
            "source": "original_article"
        })
    
    scraped_content = data.get('scraped_content', {})
    print(f"Processing {len(scraped_content)} scraped pages...")
    
    for page_id, page_data in scraped_content.items():
        if page_data.get('status') != 'success':
            continue
        
        # Get page content
        url = page_data.get('url', '')
        title = page_data.get('title', '')
        text_content = page_data.get('text_content', '')
        
        # Skip if no substantial content
        if len(text_content) < 50:
            continue
        
        # Chunk page content
        page_chunks = clean_and_chunk_text(text_content)
        
        # Process page chunks
        for i, chunk in enumerate(page_chunks):
            chunk_id = f"page_{page_id}_chunk_{i}"
            all_chunks.append(chunk)
            all_metadata.append({
                "id": chunk_id,
                "type": "scraped_page",
                "url": url,
                "title": title,
                "chunk_index": i,
                "total_chunks": len(page_chunks),
                "page_id": page_id
            })
            
            if i == 0 and page_data.get('links'):  # Only store links with first chunk
                link_texts = []
                for link in page_data.get('links', []):
                    link_url = link.get('url', '')
                    link_text = link.get('text', '')
                    if link_url and link_text:
                        link_texts.append(f"{link_text}: {link_url}")
                
                if link_texts:
                    links_text = "Links found on page:\n" + "\n".join(link_texts)
                    links_chunks = clean_and_chunk_text(links_text)
                    
                    for j, link_chunk in enumerate(links_chunks):
                        chunk_id = f"page_{page_id}_links_{j}"
                        all_chunks.append(link_chunk)
                        all_metadata.append({
                            "id": chunk_id,
                            "type": "page_links",
                            "url": url,
                            "title": title + " - Links",
                            "chunk_index": j,
                            "total_chunks": len(links_chunks),
                            "page_id": page_id
                        })
    
    # Get embeddings for all chunks
    print(f"Generating embeddings for {len(all_chunks)} chunks...")
    embeddings = get_embeddings(all_chunks)
    
    # Prepare vectors for Pinecone
    vectors = []
    for i, (chunk, embedding, metadata) in enumerate(zip(all_chunks, embeddings, all_metadata)):
        vector_id = str(uuid.uuid4())
        vectors.append({
            "id": vector_id,
            "values": embedding,  # Already ensured to be floats
            "metadata": {
                **metadata,
                "text": chunk[:1000],  # Store truncated text in metadata
                "timestamp": time.time()
            }
        })
        
        # Upsert in batches of 100 or at the end
        if len(vectors) >= 100 or i == len(all_chunks) - 1:
            print(f"Upserting batch of {len(vectors)} vectors to Pinecone...")
            try:
                index.upsert(vectors=vectors)
                vectors = []
            except Exception as e:
                print(f"Failed to upsert batch: {e}")
                raise
    
    print("Ingestion complete!")
    return len(all_chunks)

# Other functions (initialize_pinecone, clean_and_chunk_text, query_similar_content) remain unchanged

In [9]:
import os
from pathlib import Path

# Define file path and query directly
file_path = Path("C:/Users/anant/Misogi/scraped_content.json")  # Use forward slashes for cross-platform compatibility
query = "What is data Science?"  # Corrected typo

# Process the JSON file
try:
    chunks_processed = process_json_file(file_path)
    print(f"Successfully processed {chunks_processed} chunks")
except Exception as e:
    print(f"Failed to process file: {e}")
    exit(1)

# Query similar content
try:
    results = query_similar_content(query)
    if not results:
        print(f"No results found for query: {query}")
    else:
        print("\nSearch Results:")
        for i, result in enumerate(results, 1):  # Start enumeration at 1
            title = result.get('title', 'No title')  # Safe access with default
            score = result.get('score', 0.0)
            source = result.get('source', 'No source')
            text = result.get('text', '')[:200]  # Safe truncation
            print(f"\n{i}. {title} (Score: {score:.4f})")
            print(f"Source: {source}")
            print(f"Content: {text}...")
except Exception as e:
    print(f"Error querying content: {e}")

Loading scraped content from C:\Users\anant\Misogi\scraped_content.json
Index scraped-content already exists.
Processing original article...
Processing 1 scraped pages...
Generating embeddings for 74 chunks...
Upserting batch of 74 vectors to Pinecone...
Ingestion complete!
Successfully processed 74 chunks
No results found for query: What is data Science?


In [10]:

import os
import json
import time
import openai
import pinecone
import sounddevice as sd
import soundfile as sf
import numpy as np
import wave
import tempfile
import threading
from pathlib import Path
from typing import List, Dict, Any
from dotenv import load_dotenv
import speech_recognition as sr 
import threading
from pathlib import Path
from typing import List, Dict, Any
from dotenv import load_dotenv


In [11]:
SAMPLE_RATE = 16000
RECORD_SECONDS = 5
OUTPUT_FILENAME = "response.wav"
GPT_MODEL = "gpt-3.5-turbo"


def record_audio(duration=RECORD_SECONDS):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    
    # Record audio
    audio_data = sd.rec(int(SAMPLE_RATE * duration), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
    for _ in range(duration):
        print(".", end="", flush=True)
        time.sleep(1)
    print("\nRecording complete!")
    
    sd.wait()  
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_filename = temp_file.name
    temp_file.close()
    
    with wave.open(temp_filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit audio
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(audio_data.tobytes())
    
    return temp_filename

def speech_to_text(audio_file_path):
    """Convert speech to text using speech recognition"""
    recognizer = sr.Recognizer()
    
    try:
        with sr.AudioFile(audio_file_path) as source:
            audio_data = recognizer.record(source)
            print("Transcribing...")
            
            try:
                text = recognizer.recognize_google(audio_data)
            except:
                # Fall back to OpenAI's Whisper API
                with open(audio_file_path, "rb") as audio_file:
                    response = openai.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file
                    )
                    text = response.text
            
            print(f"Transcription: {text}")
            return text
            
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return ""
    finally:
        # Clean up temporary file
        try:
            os.remove(audio_file_path)
        except:
            pass

def generate_answer(query, context_data):
    """Generate answer using GPT-3.5 with context from RAG"""
    context = ""
    for i, item in enumerate(context_data):
        context += f"\nSOURCE {i+1} ({item['title']}):\n{item['text']}\n"
    
    # Create prompt for GPT
    messages = [
        {"role": "system", "content": (
            "You are a helpful voice assistant that provides informative answers based on retrieved content. "
            "Use the provided context to answer the user's question. "
            "If the context doesn't contain relevant information, say so politely. "
            "Keep answers concise and conversational, suitable for voice responses. "
            "Don't reference 'context' or 'sources' directly in your answer."
        )},
        {"role": "user", "content": f"CONTEXT: {context}\n\nQUESTION: {query}\n\nPlease provide a helpful answer:"}
    ]
    
    try:
        response = openai.chat.completions.create(
            model=GPT_MODEL,
            messages=messages,
            max_tokens=300,
            temperature=0.7
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating answer: {e}")
        return "I'm sorry, I wasn't able to generate an answer. Please try again."

def text_to_speech(text, output_file=OUTPUT_FILENAME):
    """Convert text to speech using OpenAI's TTS API"""
    try:
        # Generate speech using OpenAI's TTS
        response = openai.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text
        )
        
        # Save to file
        response.stream_to_file(output_file)
        print(f"Speech saved to {output_file}")
        
        return output_file
    except Exception as e:
        print(f"Error converting text to speech: {e}")
        return None

def play_audio(file_path):
    """Play audio file"""
    try:
        data, fs = sf.read(file_path)
        sd.play(data, fs)
        status = sd.wait()
    except Exception as e:
        print(f"Error playing audio: {e}")

def voice_assistant():
    """Main voice assistant function"""
    print("Voice RAG Assistant Starting...")
    print("Make sure your Pinecone index has been populated with your scraped content.")
    
    # Initialize Pinecone
    try:
        initialize_pinecone()
        print("Connected to Pinecone successfully!")
    except Exception as e:
        print(f"Error connecting to Pinecone: {e}")
        return
    
    while True:
        print("\n" + "="*50)
        print("Press Enter to ask a question (or type 'exit' to quit)")
        user_input = input()
        
        if user_input.lower() == 'exit':
            print("Thank you for using Voice RAG Assistant. Goodbye!")
            break
        
        try:
            # Record audio
            audio_file = record_audio()
            
            # Convert speech to text
            query = speech_to_text(audio_file)
            
            if not query:
                print("Sorry, I couldn't understand that. Please try again.")
                continue
                
            print(f"Processing question: {query}")
            
            # Query for similar content
            print("Searching knowledge base...")
            search_results = query_similar_content(query, top_k=3)
            
            if not search_results:
                answer = "I couldn't find any relevant information in my knowledge base."
            else:
                # Generate answer using GPT with context
                print("Generating answer...")
                answer = generate_answer(query, search_results)
            
            print(f"Answer: {answer}")
            
            # Convert answer to speech
            print("Converting answer to speech...")
            speech_file = text_to_speech(answer)
            
            if speech_file:
                # Play the response
                print("Playing response...")
                play_audio(speech_file)
                
        except Exception as e:
            print(f"An error occurred: {e}")

if __name__ == "__main__":
    voice_assistant()

Voice RAG Assistant Starting...
Make sure your Pinecone index has been populated with your scraped content.
Index scraped-content already exists.
Connected to Pinecone successfully!

Press Enter to ask a question (or type 'exit' to quit)
Recording for 5 seconds...
.....
Recording complete!
Transcribing...
Transcription: hello hello hello hello hello hello hello hello
Processing question: hello hello hello hello hello hello hello hello
Searching knowledge base...
Generating answer...
Answer: I'm here to assist you with any questions you may have. How can I help you today?
Converting answer to speech...


  response.stream_to_file(output_file)


Speech saved to response.wav
Playing response...

Press Enter to ask a question (or type 'exit' to quit)
Recording for 5 seconds...
.....
Recording complete!
Transcribing...
Transcription: hello hello hello hello hello how are you
Processing question: hello hello hello hello hello how are you
Searching knowledge base...
Generating answer...
Answer: I'm here and ready to assist you! How can I help you today?
Converting answer to speech...
Speech saved to response.wav
Playing response...

Press Enter to ask a question (or type 'exit' to quit)
Recording for 5 seconds...
.....
Recording complete!
Transcribing...
Transcription: Good morning.
Processing question: Good morning.
Searching knowledge base...
Generating answer...
Answer: Good morning! How can I assist you today?
Converting answer to speech...
Speech saved to response.wav
Playing response...

Press Enter to ask a question (or type 'exit' to quit)
Recording for 5 seconds...
.....
Recording complete!
Transcribing...
Transcription: 


KeyboardInterrupt: 

In [None]:
import os
import time
import tempfile
import wave
import sounddevice as sd
import soundfile as sf
import speech_recognition as sr
import requests
from dotenv import load_dotenv
from openai import OpenAI
from pathlib import Path

# Load environment variables
load_dotenv()

# Configuration
CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY") or "sk_car_ZJkw9b4W43vXvtNuEh7sFi"  # Replace with your key or use .env
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "your-openai-api-key-here"  # Replace or use .env
SAMPLE_RATE = 16000
RECORD_SECONDS = 5
OUTPUT_FILENAME = "response.wav"
GPT_MODEL = "gpt-3.5-turbo"
openai_client = OpenAI(api_key=OPENAI_API_KEY)

def text_to_speech(text, output_file=OUTPUT_FILENAME):
    """Convert text to speech using Cartesia's TTS API."""
    try:
        url = "https://api.cartesia.ai/tts/bytes"
        headers = {
            "Cartesia-Version": "2024-06-10",
            "X-API-Key": CARTESIA_API_KEY,
            "Content-Type": "application/json"
        }
        payload = {
            "model_id": "sonic-2",
            "transcript": text,
            "voice": {
                "mode": "id",
                "id": "bf0a246a-8642-498a-9950-80c35e9276b5"
            },
            "output_format": {
                "container": "wav",
                "encoding": "pcm_f32le",
                "sample_rate": 44100
            },
            "language": "en"
        }

        # Send POST request to Cartesia API
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()  # Raise exception for bad status codes

        # Save binary audio response to WAV file
        output_path = Path(output_file)
        with open(output_path, "wb") as f:
            f.write(response.content)

        print(f"Speech saved to {output_file}")
        return str(output_path)

    except Exception as e:
        print(f"Error converting text to speech: {e}")
        return None

# Rest of your code (unchanged functions: record_audio, speech_to_text, generate_answer, play_audio, voice_assistant)
# Placeholder for required functions to make the artifact complete
def record_audio(duration=RECORD_SECONDS):
    """Record audio from microphone"""
    print(f"Recording for {duration} seconds...")
    audio_data = sd.rec(int(SAMPLE_RATE * duration), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
    for _ in range(duration):
        print(".", end="", flush=True)
        time.sleep(1)
    print("\nRecording complete!")
    sd.wait()
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_filename = temp_file.name
    temp_file.close()
    with wave.open(temp_filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(audio_data.tobytes())
    return temp_filename

def speech_to_text(audio_file_path):
    """Convert speech to text using speech recognition"""
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file_path) as source:
            audio_data = recognizer.record(source)
            print("Transcribing...")
            try:
                text = recognizer.recognize_google(audio_data)
            except:
                with open(audio_file_path, "rb") as audio_file:
                    response = openai_client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file
                    )
                    text = response.text
            print(f"Transcription: {text}")
            return text
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return ""
    finally:
        try:
            os.remove(audio_file_path)
        except:
            pass

def generate_answer(query, context_data):
    """Generate answer using GPT-3.5 with context from RAG"""
    context = ""
    for i, item in enumerate(context_data):
        context += f"\nSOURCE {i+1} ({item['title']}):\n{item['text']}\n"
    messages = [
        {"role": "system", "content": (
            "You are a helpful voice assistant that provides informative answers based on retrieved content. "
            "Use the provided context to answer the user's question. "
            "If the context doesn't contain relevant information, say so politely. "
            "Keep answers concise and conversational, suitable for voice responses. "
            "Don't reference 'context' or 'sources' directly in your answer."
        )},
        {"role": "user", "content": f"CONTEXT: {context}\n\nQUESTION: {query}\n\nPlease provide a helpful answer:"}
    ]
    try:
        response = openai_client.chat.completions.create(
            model=GPT_MODEL,
            messages=messages,
            max_tokens=300,
            temperature=0.7
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating answer: {e}")
        return "I'm sorry, I wasn't able to generate an answer. Please try again."

def play_audio(file_path):
    """Play audio file"""
    try:
        data, fs = sf.read(file_path)
        sd.play(data, fs)
        sd.wait()
    except Exception as e:
        print(f"Error playing audio: {e}")

def initialize_pinecone():
    """Placeholder for Pinecone initialization"""
    pass

def query_similar_content(query, top_k):
    """Placeholder for querying similar content"""
    return [{"title": "Sample", "text": "This is a sample response"}]

def voice_assistant():
    """Main voice assistant function"""
    print("Voice RAG Assistant Starting...")
    print("Make sure your Pinecone index has been populated with your scraped content.")
    try:
        initialize_pinecone()
        print("Connected to Pinecone successfully!")
    except Exception as e:
        print(f"Error connecting to Pinecone: {e}")
        return
    while True:
        print("\n" + "="*50)
        print("Press Enter to ask a question (or type 'exit' to quit)")
        user_input = input()
        if user_input.lower() == 'exit':
            print("Thank you for using Voice RAG Assistant. Goodbye!")
            break
        try:
            audio_file = record_audio()
            query = speech_to_text(audio_file)
            if not query:
                print("Sorry, I couldn't understand that. Please try again.")
                continue
            print(f"Processing question: {query}")
            print("Searching knowledge base...")
            search_results = query_similar_content(query, top_k=3)
            if not search_results:
                answer = "I couldn't find any relevant information in my knowledge base."
            else:
                print("Generating answer...")
                answer = generate_answer(query, search_results)
            print(f"Answer: {answer}")
            print("Converting answer to speech...")
            speech_file = text_to_speech(answer)
            if speech_file:
                print("Playing response...")
                play_audio(speech_file)
        except Exception as e:
            print(f"An error occurred: {e}")

if __name__ == "__main__":
    voice_assistant()