In [None]:
!pip install -r requirements.txt #download from github and upload it here

# CUDA Docs Crawling

In [None]:
# Download the spaCy pipeline that the chunking cell loads via spacy.load("en_core_web_trf").
!python -m spacy download en_core_web_trf

In [None]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

visited_pages = set()  # URLs already claimed by a crawl task (shared by all tasks)
gathered_content = []  # One formatted text record per successfully fetched page
semaphore = asyncio.Semaphore(10)  # Limit concurrent requests

async def fetch_page_content(session, target_url, level, max_depth=5):
    """Recursively crawl ``target_url`` and the same-host pages it links to.

    The first 750 characters of each fetched page's text are appended to the
    module-level ``gathered_content`` list; every URL that is attempted is
    recorded in ``visited_pages`` so it is fetched at most once.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async context
            manager exposing ``status`` and ``text()`` (an aiohttp session).
        target_url: Absolute URL of the page to fetch.
        level: Depth of ``target_url`` relative to the start page (start = 0).
        max_depth: Deepest level that is still fetched. Lower it (e.g. to 1)
            for a quick run; the default of 5 can reach a very large number
            of pages.

    Returns:
        None. Failures are reported with ``print`` and the page is skipped.
    """
    # Function-scope import: the file-level import only provides urljoin/urlparse.
    from urllib.parse import urldefrag

    if level > max_depth or target_url in visited_pages:
        return
    visited_pages.add(target_url)

    child_urls = {}  # dict used as an insertion-ordered set of links to follow
    try:
        # Hold a semaphore slot only for the HTTP round trip. Awaiting the
        # child crawls while still holding the slot deadlocks as soon as ten
        # pages are each waiting on children that can no longer get a slot.
        async with semaphore:
            async with session.get(target_url, timeout=10) as response:
                if response.status != 200:
                    print(f"Failed to access {target_url}: HTTP {response.status}")
                    return

                content = await response.text()

        page_soup = BeautifulSoup(content, 'lxml')
        print(f"Processing: {target_url}")

        page_text = page_soup.get_text(separator="\n", strip=True)
        gathered_content.append(f"Source: {target_url}\n\n{page_text[:750]}\n{'='*50}\n")

        # Links of pages at the depth limit would be rejected anyway.
        if level < max_depth:
            own_host = urlparse(target_url).netloc
            for anchor in page_soup.find_all('a', href=True):
                # Drop "#fragment" so in-page anchors are not crawled as
                # if they were distinct pages.
                child_url, _fragment = urldefrag(urljoin(target_url, anchor['href']))
                parsed_child = urlparse(child_url)
                if (parsed_child.netloc == own_host and
                        parsed_child.scheme in ("http", "https") and
                        child_url not in visited_pages):
                    child_urls[child_url] = None

    except asyncio.TimeoutError:
        print(f"Timeout while accessing {target_url}")
    except Exception as general_err:
        # Best-effort crawl: report the problem and keep going with other pages.
        print(f"Unexpected error while processing {target_url}: {general_err}")

    if child_urls:
        await asyncio.gather(*(
            fetch_page_content(session, child_url, level + 1, max_depth)
            for child_url in child_urls
        ))

async def main(initial_url="https://docs.nvidia.com/cuda/", output_file='gathered_web_content.txt'):
    """Crawl the documentation site and write the gathered text to a file.

    Args:
        initial_url: Page at which the crawl starts (depth 0).
        output_file: Path of the UTF-8 text file that receives one record per
            crawled page; it is overwritten.
    """
    # The crawl state lives in module-level containers that survive between
    # calls in the same kernel. Without a reset, a second call would find the
    # start URL already visited and crawl nothing.
    visited_pages.clear()
    gathered_content.clear()

    async with aiohttp.ClientSession() as session:
        await fetch_page_content(session, initial_url, 0)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(gathered_content)
    print(f'Content gathering completed. Data saved to {output_file}')

def run_async_scraper():
    """Run the scraper, whether or not an event loop is already running.

    Returns:
        None when no loop was running: the crawl is executed to completion in
        a new event loop. Otherwise the ``asyncio.Task`` that was scheduled on
        the running loop (the situation inside Jupyter, where
        ``asyncio.run()`` would raise ``RuntimeError``); the caller can
        ``await`` it.
    """
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # Raised by get_running_loop() when called outside of a running loop.
        running_loop = None

    if running_loop is None:
        asyncio.run(main())
        return None
    return running_loop.create_task(main())

# Notebook entry point for the crawler. The bare top-level `await` is only
# accepted by front ends that support it (e.g. IPython/Jupyter); in a plain
# Python script it is a SyntaxError and run_async_scraper() is the alternative.
if __name__ == "__main__":
  await main()


# Data Chunking

In [None]:
import spacy
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Initialize NLP components
# text_processor performs the sentence detection in segment_content();
# embedding_model supplies the vectors compared there.
print("Loading NLP models...")
text_processor = spacy.load("en_core_web_trf")
# Maximum number of characters spaCy accepts per document; longer input has
# to be split before it is passed to text_processor.
text_processor.max_length = 1000000
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("NLP models loaded successfully.")

def import_content(filepath):
    """Read the crawler output file and return one record per page.

    Records in the file are separated by a line of 50 ``=`` characters. The
    first line of each record carries the page URL, everything after the first
    newline is the page text.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        list[dict]: ``{'source': <first line without "Source: ">, 'body': <rest>}``
        for every record that is not blank.
    """
    print(f"Importing content from {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as handle:
        full_text = handle.read()

    record_delimiter = "\n" + "="*50 + "\n"
    records = []
    for chunk in full_text.split(record_delimiter):
        if not chunk.strip():
            continue
        # partition() yields an empty remainder when the record has one line only.
        header_line, _newline, remainder = chunk.partition("\n")
        records.append({'source': header_line.replace("Source: ", ""), 'body': remainder})

    print(f"Imported {len(records)} content blocks.")
    return records

def split_large_text(text, chunk_size=1500000):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Returns:
        list[str]: The pieces in order; an empty list for empty ``text``.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces

def segment_content(raw_data, similarity_threshold=0.75):
    """Group the sentences of every content item into semantically related segments.

    Sentences are detected with ``text_processor``. A sentence joins the
    segment being built when the cosine similarity between the embedding of
    that segment and the embedding of the sentence reaches
    ``similarity_threshold``; otherwise the segment is closed and the sentence
    starts a new one.

    Args:
        raw_data: Iterable of dicts with an optional ``'body'`` text. Each dict
            is modified in place: a ``'segments'`` list is stored on it.
        similarity_threshold: Minimum cosine similarity for a sentence to stay
            in the current segment.

    Returns:
        list[dict]: The same dict objects, in input order.
    """
    print("Segmenting content...")
    segmented_data = []

    # Never hand the sentence detector more text than it accepts: a portion
    # larger than text_processor.max_length would be rejected.
    portion_limit = min(1500000, text_processor.max_length)

    for idx, content_item in enumerate(raw_data, 1):
        segments = []
        body = content_item.get('body', '')

        # Handle large texts by splitting them
        for portion in split_large_text(body, chunk_size=portion_limit):
            # Process text using spaCy for sentence detection
            doc = text_processor(portion)

            # Texts of the sentences collected for the segment being built
            current_segment = []
            for sentence in doc.sents:
                sentence_text = sentence.text
                if current_segment:
                    # Evaluate semantic relatedness between current segment and new sentence
                    segment_vector = embedding_model.encode(" ".join(current_segment))
                    sentence_vector = embedding_model.encode(sentence_text)
                    relatedness = cosine_similarity([segment_vector], [sentence_vector])[0][0]

                    if relatedness < similarity_threshold:
                        segments.append(" ".join(current_segment))
                        current_segment = []

                current_segment.append(sentence_text)

            # Flush the trailing segment of this portion
            if current_segment:
                segments.append(" ".join(current_segment))

        content_item['segments'] = segments
        segmented_data.append(content_item)
        print(f"Processed item {idx}/{len(raw_data)}: {len(segments)} segments created.")

    return segmented_data

def export_segmented_content(segmented_data, output_path):
    """Write every item's source line and segments to ``output_path``.

    Per item the file receives a ``Source:`` line, each segment followed by a
    blank line, and a closing line of 50 ``=`` characters followed by a blank
    line. The file is overwritten.

    Args:
        segmented_data: Iterable of dicts with ``'source'`` and ``'segments'``.
        output_path: Destination path (UTF-8).
    """
    print(f"Exporting segmented content to {output_path}...")
    item_footer = "="*50 + '\n\n'
    with open(output_path, 'w', encoding='utf-8') as sink:
        for entry in segmented_data:
            sink.write(f"Source: {entry['source']}\n\n")
            sink.writelines(segment + '\n\n' for segment in entry['segments'])
            sink.write(item_footer)
    print("Export completed.")

# Chunking pipeline: read the crawler output, segment every page, write the
# segments to a text file. The guard is true when this cell is executed in
# the notebook, so the pipeline runs immediately.
if __name__ == '__main__':
    source_file = 'gathered_web_content.txt'
    output_file = 'segmented_content.txt'

    # Import gathered content
    raw_data = import_content(source_file)

    # Segment the content based on semantic relatedness
    # (adds a 'segments' list to each dict of raw_data)
    segmented_data = segment_content(raw_data)

    # Export segmented content to text file
    export_segmented_content(segmented_data, output_file)

    print(f'Content Chunking completed. Results saved to {output_file}')

# Chunks to Embeddings

In [None]:
import gc
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import re
from tqdm import tqdm

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
compute_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Utilizing computational device: {compute_device}")
# Embedding model used by analyze_and_persist_blocks(), moved to the chosen device.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(compute_device)

def text_block_iterator(input_file, block_size=1000):
    """Yield the non-blank lines of ``input_file`` in batches.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping are skipped: the segmented file separates its entries with
    blank lines, and passing those on would produce empty text blocks that
    are embedded and written out like real content.

    Args:
        input_file: Path of the UTF-8 text file to read.
        block_size: Number of lines per yielded batch; the last batch may be
            shorter.

    Yields:
        list[str]: The next batch of stripped, non-empty lines.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        text_blocks = []
        for line in file:
            stripped_line = line.strip()
            if not stripped_line:
                continue
            text_blocks.append(stripped_line)
            if len(text_blocks) == block_size:
                yield text_blocks
                text_blocks = []
        if text_blocks:
            yield text_blocks

def clean_and_tokenize(text):
    """Lower-case ``text``, drop punctuation and return its non-stopword tokens.

    Characters that are neither word characters nor whitespace are removed,
    the remainder is split on whitespace, and tokens found in ``STOPWORDS``
    are discarded.

    Returns:
        list[str]: The remaining tokens in their original order.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    kept_tokens = []
    for word in without_punctuation.split():
        if word in STOPWORDS:
            continue
        kept_tokens.append(word)
    return kept_tokens

def build_topic_model(text_iterator, topic_count=10):
    """Fit an LDA topic model on all text blocks produced by ``text_iterator``.

    Args:
        text_iterator: Iterable of batches, each a list of text blocks.
        topic_count: Number of topics requested from the LDA model.

    Returns:
        tuple: ``(lda_model, word_dict)`` - the fitted model and the
        dictionary that maps tokens to ids.
    """
    vocabulary = corpora.Dictionary()
    bow_corpus = []
    for batch in text_iterator:
        tokenized_batch = list(map(clean_and_tokenize, batch))
        # Register the batch's tokens before converting it to bag-of-words.
        vocabulary.add_documents(tokenized_batch)
        for tokens in tokenized_batch:
            bow_corpus.append(vocabulary.doc2bow(tokens))
    topic_model = LdaModel(corpus=bow_corpus, id2word=vocabulary, num_topics=topic_count, random_state=42)
    return topic_model, vocabulary

def extract_dominant_topic(text_block, lda_model, word_dict):
    """Return the id of the topic with the highest weight for ``text_block``.

    Args:
        text_block: Text to classify.
        lda_model: Object whose ``get_document_topics(bow)`` returns
            ``(topic_id, weight)`` pairs.
        word_dict: Object whose ``doc2bow(tokens)`` builds the bag-of-words.

    Returns:
        The first element of the highest-weighted pair, or ``None`` when the
        model reports no topics for the block.
    """
    tokens = clean_and_tokenize(text_block)
    topic_weights = lda_model.get_document_topics(word_dict.doc2bow(tokens))
    if not topic_weights:
        return None
    heaviest = max(topic_weights, key=lambda pair: pair[1])
    return heaviest[0]

@torch.no_grad()
def analyze_and_persist_blocks(text_blocks, lda_model, word_dict, output_file, batch_size=64):
    """Embed the text blocks, tag each with its dominant topic and append them to a file.

    One line per block is appended to ``output_file`` with three tab-separated
    fields: the block text, the embedding as space-separated numbers, and the
    dominant topic id (``None`` when there is none).

    Args:
        text_blocks: List of strings to process.
        lda_model: Topic model passed on to ``extract_dominant_topic``.
        word_dict: Dictionary passed on to ``extract_dominant_topic``.
        output_file: Path of the UTF-8 file that is appended to.
        batch_size: Number of blocks encoded per call of the embedding model.

    A batch that raises is reported with ``print`` and skipped; none of its
    rows are written.
    """
    with open(output_file, 'a', encoding='utf-8') as out_file:
        for i in tqdm(range(0, len(text_blocks), batch_size), desc="Analyzing batches"):
            current_batch = text_blocks[i:i + batch_size]
            try:
                embeddings = embedding_model.encode(current_batch, convert_to_tensor=True, device=compute_device)
                # One device-to-host transfer per batch instead of one per row.
                embedding_rows = embeddings.cpu().numpy().tolist()
                topics = [extract_dominant_topic(block, lda_model, word_dict) for block in current_batch]

                rows = []
                for block, embedding_row, topic in zip(current_batch, embedding_rows, topics):
                    embedding_str = ' '.join(map(str, embedding_row))
                    # A tab inside the text would add a field to the
                    # tab-separated row and break every reader of the file.
                    safe_block = block.replace('\t', ' ')
                    rows.append(f"{safe_block}\t{embedding_str}\t{topic}\n")
                # Written only after the whole batch was formatted, so a
                # failing batch never leaves a partial set of rows behind.
                out_file.writelines(rows)
            except Exception as e:
                print(f"Error in batch {i//batch_size}: {e}")

            if compute_device.type == "cuda":
                torch.cuda.empty_cache()

# Embedding pipeline: fit the topic model on the segmented text, then embed
# every text block and persist it together with its dominant topic.
if __name__ == '__main__':
    input_file = 'segmented_content.txt'
    result_file = 'analyzed_text_data.txt'

    try:
        print("Constructing topic model...")
        lda_model, word_dict = build_topic_model(text_block_iterator(input_file))

        # analyze_and_persist_blocks() opens the result file in append mode.
        # Start from an empty file, otherwise every re-run of this cell adds
        # a second copy of all rows.
        with open(result_file, 'w', encoding='utf-8'):
            pass

        print("Analyzing text blocks and persisting results...")
        for text_blocks in text_block_iterator(input_file):
            analyze_and_persist_blocks(text_blocks, lda_model, word_dict, result_file, batch_size=64)
            gc.collect()

        print(f'Text analysis and embedding generation completed. Results saved to {result_file}')
    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import wordnet
import re

# Ensure necessary NLTK data is available
# (wordnet is used by enrich_query(), nltk.pos_tag by determine_pos()).
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Path of the segmented text file. NOTE(review): create_and_save_index() reads
# doc["id"] and doc["title"] from each element of its `documents` argument, so
# this string is presumably not meant to be passed to it as-is - confirm.
documents="segmented_content.txt"

def create_and_save_index(embedding_model, documents, index_variant="flat"):
    """Build a flat L2 FAISS index from an embedding matrix and save it with metadata.

    Writes ``cuda_<variant>.index`` and ``cuda_<variant>_metadata.pkl`` to the
    working directory, where ``<variant>`` is ``index_variant`` in lower case.

    Args:
        embedding_model: Despite the name, the 2-D matrix of embeddings (one
            row per document); it is added to the index as data.
        documents: Iterable of mappings providing ``"id"`` and ``"title"``.
        index_variant: Only used to name the output files; the index that is
            built is always an ``IndexFlatL2``.

    Raises:
        ValueError: If the embeddings are not a 2-D matrix.

    NOTE(review): the metadata saved here is a dict ``{id: {"title": ...}}``,
    whereas perform_hybrid_search() unpacks ``document_data[idx]`` into
    (content, category, source) - confirm which schema is intended.
    """
    # FAISS expects a C-contiguous float32 matrix; e.g. float64 input is rejected.
    embedding_matrix = np.ascontiguousarray(embedding_model, dtype=np.float32)
    if embedding_matrix.ndim != 2:
        raise ValueError(
            f"expected a 2-D embedding matrix, got {embedding_matrix.ndim} dimension(s)"
        )

    # Create the FAISS index
    index = faiss.IndexFlatL2(embedding_matrix.shape[1])

    # Add embeddings to the index
    index.add(embedding_matrix)

    # Save the FAISS index to a file
    faiss.write_index(index, f"cuda_{index_variant.lower()}.index")

    # Create metadata dictionary
    document_data = {doc["id"]: {"title": doc["title"]} for doc in documents}

    # Save metadata to a pickle file
    with open(f"cuda_{index_variant.lower()}_metadata.pkl", 'wb') as f:
        pickle.dump(document_data, f)


def load_search_components(index_variant):
    """Load the FAISS index and the pickled metadata of ``index_variant``.

    Args:
        index_variant: Variant name; its lower-case form selects the files
            ``cuda_<variant>.index`` and ``cuda_<variant>_metadata.pkl``.

    Returns:
        tuple: ``(vector_index, document_data)``.
    """
    index_path = f"cuda_{index_variant.lower()}.index"
    metadata_path = f"cuda_{index_variant.lower()}_metadata.pkl"

    loaded_index = faiss.read_index(index_path)
    # NOTE: unpickling executes code embedded in the file - only load
    # metadata files that were produced by a trusted source.
    with open(metadata_path, 'rb') as metadata_file:
        loaded_metadata = pickle.load(metadata_file)
    return loaded_index, loaded_metadata

def initialize_lexical_index(document_data):
    """Build a BM25 index over the whitespace tokens of each document's first field.

    Args:
        document_data: Iterable of records whose element ``[0]`` is the text.

    Returns:
        The ``BM25Okapi`` index built from the tokenized texts.
    """
    tokenized_docs = []
    for record in document_data:
        tokenized_docs.append(record[0].split())
    return BM25Okapi(tokenized_docs)

def sanitize_input(text):
    """Return ``text`` in lower case with everything but ASCII letters and whitespace removed."""
    # Digits and punctuation are dropped as well; whitespace is left untouched.
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_and_spaces.lower()

def determine_pos(term):
    """Map the part-of-speech tag of ``term`` to the matching WordNet constant.

    The first letter of the tag that ``nltk.pos_tag`` assigns to the single
    term decides: J -> adjective, N -> noun, V -> verb, R -> adverb. Any
    other tag falls back to noun.
    """
    tagged_terms = nltk.pos_tag([term])
    tag_initial = tagged_terms[0][1][0].upper()
    pos_mapping = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    if tag_initial in pos_mapping:
        return pos_mapping[tag_initial]
    return wordnet.NOUN

def enrich_query(original_query, synonym_count=3):
    """Append WordNet synonyms of every query term to the query.

    For each whitespace-separated term, up to ``synonym_count`` distinct
    synonyms are taken from the term's synsets in the order WordNet lists
    them. Multi-word lemmas are stored by WordNet with underscores; these are
    turned into spaces so the synonyms read as normal text. A lemma that
    equals the term itself (ignoring case) is not a synonym and is skipped.

    Args:
        original_query: Query text.
        synonym_count: Maximum number of synonyms per term; values below 1
            disable the enrichment.

    Returns:
        str: ``original_query`` followed by the synonyms, separated by single
        spaces; ``original_query`` unchanged when no synonym was found.
    """
    additional_terms = []
    if synonym_count > 0:
        for term in original_query.split():
            seen = {term.lower()}  # lower-cased forms already used for this term
            term_synonyms = []
            for synset in wordnet.synsets(term):
                for lemma in synset.lemmas():
                    synonym = lemma.name().replace('_', ' ')
                    synonym_key = synonym.lower()
                    if synonym_key in seen:
                        continue
                    seen.add(synonym_key)
                    term_synonyms.append(synonym)
                    if len(term_synonyms) >= synonym_count:
                        break
                if len(term_synonyms) >= synonym_count:
                    break
            additional_terms.extend(term_synonyms)

    if not additional_terms:
        # Avoids returning the query with a dangling trailing space.
        return original_query
    return original_query + ' ' + ' '.join(additional_terms)

def apply_relevance_feedback(query_embedding, vector_index, document_data, top_k=5, feedback_weight=0.3):
    """Shift the query embedding towards the dominant term of its top results.

    The ``top_k`` nearest documents are retrieved, a TF-IDF matrix is built
    over their texts, and the embedding of the term with the highest mean
    TF-IDF weight is added to the query embedding, scaled by
    ``feedback_weight``.

    Args:
        query_embedding: 1-D embedding of the query.
        vector_index: Index whose ``search(matrix, k)`` returns
            ``(distances, indices)``.
        document_data: Indexable collection; element ``[i][0]`` is the text of
            document ``i``.
        top_k: Number of neighbours used as feedback.
        feedback_weight: Scale applied to the feedback term's embedding.

    Returns:
        The expanded embedding, or ``query_embedding`` unchanged when there is
        nothing usable to learn from (no neighbours, or no indexable terms).
    """
    # Initial search
    _, top_indices = vector_index.search(query_embedding.reshape(1, -1), top_k)

    # Extract top documents. Slots the index could not fill are reported as
    # -1, which would otherwise silently select the last document.
    top_documents = [document_data[i][0] for i in top_indices[0] if i >= 0]
    if not top_documents:
        return query_embedding

    # Create TF-IDF representation
    tfidf_vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(top_documents)
    except ValueError:
        # Raised when the documents contain no term that can be indexed.
        return query_embedding

    # Compute centroid
    centroid = tfidf_matrix.mean(axis=0)

    # Expand query embedding
    dominant_term = str(tfidf_vectorizer.get_feature_names_out()[centroid.argmax()])
    expanded_embedding = query_embedding + feedback_weight * embedding_model.encode(dominant_term)

    return expanded_embedding

def perform_hybrid_search(vector_index, lexical_index, document_data, user_query, result_count=5,
                          vector_weight=0.5, use_query_enrichment=True, use_relevance_feedback=True):
    """Rank documents by a weighted mix of vector similarity and BM25 score.

    Args:
        vector_index: Index whose ``search(matrix, k)`` returns
            ``(distances, indices)``.
        lexical_index: Object whose ``get_scores(tokens)`` returns one score
            per document.
        document_data: Indexable collection of ``(content, category, source)``.
        user_query: Raw query text.
        result_count: Number of results to return.
        vector_weight: Weight of the vector part; the lexical part gets
            ``1 - vector_weight``.
        use_query_enrichment: Add synonyms to the query before searching.
        use_relevance_feedback: Expand the query embedding using the top hits.

    Returns:
        list[dict]: Up to ``result_count`` dicts with ``content``,
        ``category``, ``source`` and ``relevance``, best first.
    """
    # Preprocess and optionally enrich the query
    clean_query = sanitize_input(user_query)
    enriched_query = enrich_query(clean_query) if use_query_enrichment else clean_query

    # Vector-based search
    query_embedding = embedding_model.encode(enriched_query)
    if use_relevance_feedback:
        query_embedding = apply_relevance_feedback(query_embedding, vector_index, document_data)
    vector_distances, vector_indices = vector_index.search(query_embedding.reshape(1, -1), result_count*2)

    # Lexical search
    lexical_scores = lexical_index.get_scores(enriched_query.split())
    lexical_top_indices = np.argsort(lexical_scores)[::-1][:result_count*2]

    # Merge results
    combined_relevance = {}
    for distance, idx in zip(vector_distances[0], vector_indices[0]):
        if idx < 0:
            # Unfilled result slot (fewer documents than requested); using it
            # as an index would select the last document.
            continue
        # Convert distance to similarity.
        # NOTE(review): this assumes distances on a 0..1 scale - confirm for
        # the index type in use.
        combined_relevance[int(idx)] = float(vector_weight * (1 - distance))

    # Normalise by the best lexical score, computed once. When no document
    # matches any query term the best score is 0; dividing by it would turn
    # every relevance into NaN, so the lexical part then contributes nothing.
    best_lexical_score = max(lexical_scores) if len(lexical_scores) else 0
    for idx in lexical_top_indices:
        if best_lexical_score > 0:
            lexical_part = float((1 - vector_weight) * (lexical_scores[idx] / best_lexical_score))
        else:
            lexical_part = 0.0
        combined_relevance[int(idx)] = combined_relevance.get(int(idx), 0.0) + lexical_part

    # Select top results
    top_indices = sorted(combined_relevance, key=combined_relevance.get, reverse=True)[:result_count]

    search_results = []
    for idx in top_indices:
        content, category, source = document_data[idx]
        search_results.append({
            "content": content,
            "category": category,
            "source": source,
            "relevance": combined_relevance[idx]
        })
    return search_results

def main():
    """Console loop: read a query, run the hybrid search, print the results.

    The loop ends when the user enters ``exit`` (in any letter case).
    """
    index_variant = "FLAT"  # Alternative: "IVF"
    vector_index, document_data = load_search_components(index_variant)
    lexical_index = initialize_lexical_index(document_data)

    user_query = input("Enter your search query (or 'exit' to quit): ")
    while user_query.lower() != 'exit':
        hits = perform_hybrid_search(vector_index, lexical_index, document_data, user_query)

        print("\nSearch Results:")
        for rank, hit in enumerate(hits, start=1):
            # Only the first 100 characters of the content are shown.
            print(f"{rank}. Content: {hit['content'][:100]}...")
            print(f"   Category: {hit['category']}")
            print(f"   Source: {hit['source']}")
            print(f"   Relevance: {hit['relevance']:.4f}")
            print("---")

        user_query = input("Enter your search query (or 'exit' to quit): ")

if __name__ == "__main__":
    main()

In [None]:
from openai import OpenAI
import os
from typing import List, Dict
import numpy as np
from sentence_transformers import SentenceTransformer

# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret never has to be stored in the notebook. The placeholder is only the
# fallback for an unset variable and must be replaced before any request works.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

# Initialize OpenAI client
openai_client = OpenAI(api_key=OPENAI_API_KEY)

# Initialize SentenceTransformer model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def build_context(search_results: List[Dict], user_query: str) -> str:
    """Build the context for the LLM from search results, ranking by relevance to the query.

    Args:
        search_results: Result dicts holding the passage text under
            ``'chunk'`` or, as produced by perform_hybrid_search(), under
            ``'content'``.
        user_query: The question the passages are ranked against.

    Returns:
        A header line followed by the passages, numbered from 1 in order of
        decreasing cosine similarity to the query. Only the header when
        ``search_results`` is empty.
    """
    context_text = "Here are some relevant passages from the CUDA documentation, ordered by relevance:\n\n"
    if not search_results:
        # Nothing to rank; encoding an empty list would fail further down.
        return context_text

    passages = [result['chunk'] if 'chunk' in result else result['content'] for result in search_results]

    query_embedding = np.asarray(embedding_model.encode(user_query))
    result_embeddings = np.asarray(embedding_model.encode(passages))

    norm_products = np.linalg.norm(result_embeddings, axis=1) * np.linalg.norm(query_embedding)
    # A zero vector has a zero dot product as well; dividing by 1 instead of
    # 0 yields similarity 0 rather than NaN.
    norm_products = np.where(norm_products == 0, 1.0, norm_products)
    similarities = np.dot(result_embeddings, query_embedding) / norm_products

    # Stable sort: passages with equal similarity keep their incoming order.
    ranking = np.argsort(-similarities, kind="stable")

    for idx, position in enumerate(ranking, 1):
        context_text += f"{idx}. {passages[position]}\n\n"
    return context_text

def generate_answer(question: str, search_results: List[Dict]) -> str:
    """Ask the chat model to answer ``question`` from the retrieved passages.

    The passages are turned into a ranked context with ``build_context`` and
    sent, together with the question and a fixed system prompt, to the
    ``gpt-3.5-turbo`` chat completion endpoint.

    Returns:
        The text of the first completion choice with surrounding whitespace
        removed.
    """
    context = build_context(search_results, question)

    system_message = {"role": "system", "content": """You are a helpful assistant that answers questions about CUDA based on the provided context.
        Follow these guidelines:
        1. Always base your answers on the information provided in the context.
        2. If the answer cannot be found in the context, clearly state that you don't have enough information to answer accurately.
        3. If the context contains conflicting information, mention this and explain the different viewpoints.
        4. Use technical terms correctly and explain them if they're complex.
        5. If appropriate, structure your answer with bullet points or numbered lists for clarity.
        6. Cite the relevant passage numbers from the context to support your answer.
        7. If the user's question is unclear, ask for clarification before attempting to answer."""}
    user_message = {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"}

    request_options = {
        "model": "gpt-3.5-turbo",
        "messages": [system_message, user_message],
        "max_tokens": 1000,
        "n": 1,
        "stop": None,
        "temperature": 0.3,
    }
    completion = openai_client.chat.completions.create(**request_options)

    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def main():
    """Console loop: retrieve passages for a CUDA question and print the generated answer.

    The loop ends when the user enters ``quit`` (in any letter case).
    """
    idx_type = "FLAT"  # or "IVF"
    index, metadata = load_search_components(idx_type)
    lex_index = initialize_lexical_index(metadata)

    while True:
        query = input("Enter your question about CUDA (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break

        # The search function defined in this notebook is
        # perform_hybrid_search(); these are its keyword names.
        search_results = perform_hybrid_search(
            index, lex_index, metadata, query,
            result_count=5, vector_weight=0.5,
            use_query_enrichment=True, use_relevance_feedback=True,
        )
        # The search returns the passage text under 'content'; expose it under
        # 'chunk' as well, the key used when passages are displayed and ranked.
        for result in search_results:
            result.setdefault('chunk', result['content'])

        print("\nRetrieved Passages:")
        for idx, result in enumerate(search_results, 1):
            print(f"{idx}. {result['chunk'][:100]}...")

        answer = generate_answer(query, search_results)
        print("\nAnswer:")
        print(answer)
        print("---")

if __name__ == "__main__":
    main()


In [None]:
import gradio as gr
from ChatGPT import answer
from Vector_Retrieval_Reranking import hybrid_search
def main():
    """Load the search components and serve the question-answering pipeline through Gradio."""
    idx_type = "FLAT"  # or "IVF"
    index, metadata = load_search_components(idx_type)
    lex_index = initialize_lexical_index(metadata)

    def inference(query):
        """Retrieve passages for ``query`` and return the generated answer text."""
        # index, lex_index and metadata are only read here, so the closure
        # needs no nonlocal declaration.
        retrieved = hybrid_search(index, lex_index, metadata, query, k=5, alpha=0.5, use_query_expansion=True, use_prf=True)
        return generate_answer(query, retrieved)

    interface_options = dict(
        fn=inference,
        inputs="text",
        outputs="text",
        title="CUDA Documentation Assistant",
        description="Ask a question about CUDA documentation.",
        theme="huggingface",
        examples=[["How to use CUDA with Python?"]],
    )
    iface = gr.Interface(**interface_options)
    # NOTE: share=True asks Gradio for a public link to this app.
    iface.launch(share=True)

if __name__ == "__main__":
    main()








