In [7]:
import os
import pickle
import time
import requests
from bs4 import BeautifulSoup
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

# Initialize LLM (Groq model).
# The API key is read from the environment rather than embedded in the source:
# a key committed to a notebook or repository is effectively public and must
# be revoked. Export GROQ_API_KEY before running this cell.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it before running this script."
    )

llm = ChatGroq(
    temperature=0,  # deterministic answers for retrieval-based Q&A
    groq_api_key=_groq_api_key,
    model_name="llama-3.1-70b-versatile",
)

# File path for the pickled FAISS vector store
file_path = "faiss_store_openai.pkl"

# Function to scrape content from a website
def scrape_website(url, timeout=10):
    """Download *url* and return the text of its structural container elements.

    Text is collected from ``<article>``, ``<section>``, ``<div>``,
    ``<header>`` and ``<footer>`` elements. Only the outermost matching
    elements are used: ``get_text()`` already includes the text of every
    descendant, so also visiting nested matches would repeat the same text
    once per nesting level.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The extracted text with surrounding whitespace removed, or ``""``
        when the page cannot be fetched or parsed. Failures are reported on
        stdout and never raised, so one bad URL does not abort a batch.
    """
    content_tags = ['article', 'section', 'div', 'header', 'footer']

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve {url}")
            return ""

        soup = BeautifulSoup(response.content, 'html.parser')

        # Keep a matched element only if none of its ancestors matched too.
        outermost = [
            element
            for element in soup.find_all(content_tags)
            if element.find_parent(content_tags) is None
        ]
        text = ' '.join(element.get_text() for element in outermost)

        return text.strip()
    except Exception as e:
        # Deliberately broad: scraping is best-effort and the caller only
        # needs to know that nothing was extracted.
        print(f"Error scraping {url}: {e}")
        return ""

# Function to process website content and save embeddings to FAISS
def process_websites(urls):
    """Scrape *urls*, embed the combined text and persist a FAISS index.

    Each URL is stripped of surrounding whitespace and blank entries are
    skipped. The scraped text is split into overlapping chunks, embedded
    with a HuggingFace sentence-transformer model, and the resulting FAISS
    vector store is pickled to ``file_path``.

    Args:
        urls: Iterable of URL strings.

    Raises:
        ValueError: If no text could be extracted from any URL. Building an
            index from zero chunks would otherwise fail further down with an
            unrelated-looking error.
    """
    print("Starting to scrape websites...")
    scraped_texts = []
    for raw_url in urls:
        url = raw_url.strip()
        if not url:
            continue
        print(f"Scraping: {url}")
        scraped_texts.append(scrape_website(url))

    all_text = "\n".join(scraped_texts)
    if not all_text.strip():
        raise ValueError(
            "No text could be extracted from the given URLs; nothing to index."
        )

    # Split text into smaller chunks
    print("Splitting text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    text_chunks = text_splitter.split_text(all_text)

    # Create embeddings using HuggingFace model
    print("Generating embeddings for text chunks...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(text_chunks, embeddings)

    # Save the FAISS index to a pickle file.
    # NOTE(review): the pickle format is kept because handle_query() reads it
    # back with pickle.load. Pickles are only safe to load from trusted
    # locations; consider migrating both sides to FAISS.save_local /
    # FAISS.load_local.
    print("Saving the FAISS index...")
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore, f)
    print("FAISS index saved.")

# Function to query the processed data
def handle_query():
    """Run an interactive question/answer loop over the saved FAISS index.

    Prompts on stdin until the user types ``exit`` (case-insensitive) or
    stdin reaches end-of-file. The pickled vector store at ``file_path`` is
    loaded and the retrieval chain built on the first real question, then
    reused for every later one, so an index rewritten while the loop is
    running is not picked up. Errors while loading or answering are printed
    and the loop continues.
    """
    chain = None  # built lazily on the first query, then reused

    while True:
        try:
            query = input("\nAsk a Question (or type 'exit' to quit): ").strip()
        except EOFError:
            # stdin closed (piped input, Ctrl-D): leave cleanly instead of crashing.
            print("\nExiting the query system.")
            break

        if query.lower() == 'exit':
            print("Exiting the query system.")
            break

        if not query:
            print("Please enter a valid query.")
            continue

        try:
            if chain is None:
                if not os.path.exists(file_path):
                    print("FAISS index not found. Please process the websites first.")
                    continue

                # NOTE(review): pickle.load can execute arbitrary code; only
                # load index files written by this program.
                with open(file_path, "rb") as f:
                    vectorstore = pickle.load(f)

                # Initialize the retrieval chain
                chain = RetrievalQA.from_llm(llm=llm, retriever=vectorstore.as_retriever())

            # Get response from the chain
            result = chain.run(query)

            # Display the response
            print("\nAnswer:")
            print(result)
        except Exception as e:
            # Top-level boundary of the interactive loop: report and keep going.
            print(f"Error occurred: {e}")

# Function to handle website URL input and initiate processing
def main():
    """Prompt for website URLs, build the index, then start the Q&A loop.

    The comma-separated input is normalised before use: whitespace around
    each URL is removed and empty entries (for example from a trailing comma
    or an empty line) are dropped. If nothing usable remains, a message is
    printed and the function returns without scraping.
    """
    raw_input_line = input("Enter the website URLs (comma-separated): ")
    urls = [part.strip() for part in raw_input_line.split(',') if part.strip()]

    if not urls:
        print("No URLs provided.")
        return

    # Process websites after URL input
    process_websites(urls)

    # Start querying in a loop
    handle_query()

if __name__ == "__main__":
    main()


Enter the website URLs (comma-separated): https://www.icc-cricket.com/tournaments/t20cricketworldcup
Starting to scrape websites...
Scraping: https://www.icc-cricket.com/tournaments/t20cricketworldcup
Splitting text into chunks...
Generating embeddings for text chunks...
Saving the FAISS index...
FAISS index saved.

Ask a Question (or type 'exit' to quit): mention some indian cricket players

Answer:
Based on the provided context, some Indian cricket players mentioned are:

1. Rohit Sharma
2. Virat Kohli
3. Jasprit Bumrah
4. Hardik Pandya
5. Suryakumar Yadav
6. Axar Patel

Ask a Question (or type 'exit' to quit): classify their roles in cricket

Answer:
Based on the provided context, the roles of the mentioned individuals in cricket are:

1. Rohit Sharma - Batsman and former captain of the Indian cricket team.
2. Virat Kohli - Batsman.
3. Axar Patel - All-rounder (batsman and bowler).
4. Jasprit Bumrah - Bowler.
5. Hardik Pandya - All-rounder (batsman and bowler).
6. Gautam Gambhir - F