In [1]:
import PyPDF2
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import json
import os

# Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to generate keywords using TF-IDF
def generate_keywords_tfidf(content, top_n=5):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=top_n)
    X = vectorizer.fit_transform([content])
    keywords = vectorizer.get_feature_names_out()
    return list(keywords)

# Function to scrape a web page
def scrape_web_page(url, topic="General"):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    documents = []

    # Find all <h2> tags
    titles = soup.find_all('h2')  # Adjust this tag based on the page structure
    for title in titles:
        question = title.text.strip()
        content = question + " "  # Initialize content with the question text
        sibling = title.find_next_sibling()

        # Collect content until the next <h2> or the end of the section
        while sibling and sibling.name != 'h2':
            if sibling.name in ['p', 'ul', 'ol', 'div']:  # Include other relevant tags as needed
                content += sibling.get_text(separator=" ", strip=True) + " "
            sibling = sibling.find_next_sibling()

        # Generate a relevant part of the URL as the source
        section_id = title.get('id')  # Extract the 'id' attribute if available
        if section_id:
            section_url = f"{url}#{section_id}"  # Append the ID as a fragment
        else:
            section_url = url  # Use the main URL if no ID is found

        # Generate keywords using TF-IDF
        keywords = generate_keywords_tfidf(content)

        documents.append({
            "title": question,
            "content": content.strip(),
            "topic": topic,
            "keywords": keywords,
            "source": section_url  # Use the section URL for this title
        })

    return documents


# Extract text from PDFs
pdf_paths = [
    "C:/Users/omidm/Downloads/RCP#0032 Intake 10 Student Internship Summary reports (share with all).pdf",
    "C:/Users/omidm/Downloads/Research Computing Platform Student Internship Handbook.pdf",
    "C:/Users/omidm/Downloads/RCP0026 Welcome Students Semeter 2 Intake 10.pdf"
]
pdf_data = []

for pdf_path in pdf_paths:
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_keywords = generate_keywords_tfidf(pdf_text)
    
    # Extract the file name (without extension) from the file path
    file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    
    pdf_data.append({
        "title": f"Content from {file_name}",
        "content": pdf_text,
        "topic": file_name,  # Use the file name as the topic
        "keywords": pdf_keywords,
        "source": None  # Do not save any source for PDFs
    })


# Scrape web pages
links = [
    {"url": "https://wehi-researchcomputing.github.io/students", "topic": "Unpaid Student Internship Program"},
    {"url": "https://wehi-researchcomputing.github.io/complex-projects", "topic": "complex ambiguous projects"},
    {"url": "https://wehi-researchcomputing.github.io/software_maturity_model", "topic": "software maturity model"},
    {"url": "https://wehi-researchcomputing.github.io/explanation_about_ohs", "topic": "explanation about ohs"},
    {"url": "https://wehi-researchcomputing.github.io/top-5-mistakes", "topic": "top 5 mistakes"},
    {"url": "https://wehi-researchcomputing.github.io/project-wikis", "topic": "project wikis"},
    {"url": "https://wehi-researchcomputing.github.io/student-loxcoder", "topic": "loxcoder"},
    {"url": "https://wehi-researchcomputing.github.io/student-data-commons", "topic": "student data commons"},
    {"url": "https://wehi-researchcomputing.github.io/student-cryoem", "topic": "cryoem"},
    {"url": "https://wehi-researchcomputing.github.io/student-genomics-qc", "topic": "genomics quantum computing"},
    {"url": "https://wehi-researchcomputing.github.io/student-schex", "topic": "schex"},
    {"url": "https://wehi-researchcomputing.github.io/student-mixOmics.html", "topic": "mixOmics"},
    {"url": "https://wehi-researchcomputing.github.io/student-capacity-planning.html", "topic": "capacity planning"},
    {"url": "https://wehi-researchcomputing.github.io/student-haemosphere", "topic": "haemosphere"},
    {"url": "https://wehi-researchcomputing.github.io/student-imaging", "topic": "imaging"},
    {"url": "https://wehi-researchcomputing.github.io/student-quantum", "topic": "quantum"},
    {"url": "https://wehi-researchcomputing.github.io/student-genomics-metadata.html", "topic": "genomics metadata"},
    {"url": "https://wehi-researchcomputing.github.io/student-aive", "topic": "aive"},
    {"url": "https://wehi-researchcomputing.github.io/student-bionix", "topic": "bionix"},
    {"url": "https://wehi-researchcomputing.github.io/student-clinical-dashboards", "topic": "clinical dashboards"},
    {"url": "https://wehi-researchcomputing.github.io/email_acknowledgement", "topic": "email acknowledgement"},
    {"url": "https://wehi-researchcomputing.github.io/code-of-conduct", "topic": "code of conduct"},
    {"url": "https://wehi-researchcomputing.github.io/faq", "topic": "FAQ"},
    {"url": "https://wehi-researchcomputing.github.io/intake_dates", "topic": "Intake Date"}     


]
web_data = []
for link in links:
    data = scrape_web_page(link["url"], topic=link["topic"])
    if data:
        web_data.extend(data)

# Combine PDF data and web-scraped data
all_data = pdf_data + web_data

# Print and save the aggregated data
for doc in all_data:
    print(f"Title: {doc['title']}")
    print(f"Content: {doc['content']}")  # Truncate for readability
    print(f"Topic: {doc['topic']}")
    print(f"Keywords: {', '.join(doc['keywords'])}")
    print(f"Source: {doc['source']}")  # Print source if available
    print("-" * 40)

with open("aggregated_data.json", "w") as f:
    json.dump(all_data, f, indent=4)


Title: Content from RCP#0032 Intake 10 Student Internship Summary reports (share with all)
Content: RCP#0032
Intake
10
Student 
Internship
Summary
reports
Table
of
Contents
Link
to
Intake
9
Summary
Report
RCP#0016
Intake
9
Student
Internship
Summary
reports.pdf
that
can
be
used
as
an
example.AIVE
Student
Project
Interns:
Si
Yang
(Sean)
Chen,
Chun-Tung
(Chloe)
Tsai,
Jiawen
Deng
High-Level
Domain
work
During
the
first
4-5
weeks,
we
learned
about
and
tried
to
understand
the
AIVE
workflow
for
converting
2D
cell
image
stacks
into
3D
models
and
familiarised
ourselves
with
software
tools
used
in
the
workflow
such
as
ImageJ,
MIB
and
WEKA.
Our
in-depth
understanding
was
presented
in
the
whiteboard
presentation,
which
detailed
key
stages
in
AIVE’s
workflow.
The
team
also
developed
high-level
flowcharts
showing
the
entire
process
and
the
interconnected
stages.
Subsequently ,
we
decided
to
work
on
the
organelle
segmentation
stage.
We
converted 
two
ImageJ
macros
(macros
1
and
1b)
from
ImageJ
Macro

In [None]:
import PyPDF2
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import json
import os

# Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to generate keywords using TF-IDF
def generate_keywords_tfidf(content, top_n=5):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=top_n)
    X = vectorizer.fit_transform([content])
    keywords = vectorizer.get_feature_names_out()
    return list(keywords)

def scrape_web_page(url, topic="General"):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    documents = []

    # Find all <h2> tags
    titles = soup.find_all('h2')  # Adjust this tag based on the page structure
    for title in titles:
        question = title.text.strip()
        content = question + " "  # Initialize content with the question text
        sibling = title.find_next_sibling()

        # Collect content until the next <h2> or the end of the section
        while sibling and sibling.name != 'h2':
            if sibling.name in ['p', 'ul', 'ol', 'div']:  # Include other relevant tags as needed
                content += sibling.get_text(separator=" ", strip=True) + " "
                
                # Extract links from anchor tags within this section
                for link in sibling.find_all('a'):
                    link_text = link.get_text(strip=True)
                    link_url = link['href']
                    content += f" [{link_text}]({link_url}) "  # Add links to content

            sibling = sibling.find_next_sibling()

        # Generate a relevant part of the URL as the source
        section_id = title.get('id')  # Extract the 'id' attribute if available
        if section_id:
            section_url = f"{url}#{section_id}"  # Append the ID as a fragment
        else:
            section_url = url  # Use the main URL if no ID is found

        # Generate keywords using TF-IDF
        keywords = generate_keywords_tfidf(content)

        documents.append({
            "title": question,
            "content": content.strip(),
            "topic": topic,
            "keywords": keywords,
            "source": section_url  # Use the section URL for this title
        })

    return documents

# Extract text from PDFs
pdf_paths = [
    "C:/Users/omidm/Downloads/RCP#0032 Intake 10 Student Internship Summary reports (share with all).pdf",
    "C:/Users/omidm/Downloads/Research Computing Platform Student Internship Handbook.pdf",
    "C:/Users/omidm/Downloads/RCP0026 Welcome Students Semeter 2 Intake 10.pdf"
]
pdf_data = []

for pdf_path in pdf_paths:
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_keywords = generate_keywords_tfidf(pdf_text)
    
    # Extract the file name (without extension) from the file path
    file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    
    pdf_data.append({
        "title": f"Content from {file_name}",
        "content": pdf_text,
        "topic": file_name,  # Use the file name as the topic
        "keywords": pdf_keywords,
        "source": None  # Do not save any source for PDFs
    })


# Scrape web pages
links = [
    {"url": "https://wehi-researchcomputing.github.io/students", "topic": "Unpaid Student Internship Program"},
    {"url": "https://wehi-researchcomputing.github.io/complex-projects", "topic": "complex ambiguous projects"},
    {"url": "https://wehi-researchcomputing.github.io/software_maturity_model", "topic": "software maturity model"},
    {"url": "https://wehi-researchcomputing.github.io/explanation_about_ohs", "topic": "explanation about ohs"},
    {"url": "https://wehi-researchcomputing.github.io/top-5-mistakes", "topic": "top 5 mistakes"},
    {"url": "https://wehi-researchcomputing.github.io/project-wikis", "topic": "project wikis"},
    {"url": "https://wehi-researchcomputing.github.io/student-loxcoder", "topic": "loxcoder"},
    {"url": "https://wehi-researchcomputing.github.io/student-data-commons", "topic": "student data commons"},
    {"url": "https://wehi-researchcomputing.github.io/student-cryoem", "topic": "cryoem"},
    {"url": "https://wehi-researchcomputing.github.io/student-genomics-qc", "topic": "genomics quantum computing"},
    {"url": "https://wehi-researchcomputing.github.io/student-schex", "topic": "schex"},
    {"url": "https://wehi-researchcomputing.github.io/student-mixOmics.html", "topic": "mixOmics"},
    {"url": "https://wehi-researchcomputing.github.io/student-capacity-planning.html", "topic": "capacity planning"},
    {"url": "https://wehi-researchcomputing.github.io/student-haemosphere", "topic": "haemosphere"},
    {"url": "https://wehi-researchcomputing.github.io/student-imaging", "topic": "imaging"},
    {"url": "https://wehi-researchcomputing.github.io/student-quantum", "topic": "quantum"},
    {"url": "https://wehi-researchcomputing.github.io/student-genomics-metadata.html", "topic": "genomics metadata"},
    {"url": "https://wehi-researchcomputing.github.io/student-aive", "topic": "aive"},
    {"url": "https://wehi-researchcomputing.github.io/student-bionix", "topic": "bionix"},
    {"url": "https://wehi-researchcomputing.github.io/student-clinical-dashboards", "topic": "clinical dashboards"},
    {"url": "https://wehi-researchcomputing.github.io/email_acknowledgement", "topic": "email acknowledgement"},
    {"url": "https://wehi-researchcomputing.github.io/code-of-conduct", "topic": "code of conduct"},
    {"url": "https://wehi-researchcomputing.github.io/faq", "topic": "FAQ"},
    {"url": "https://wehi-researchcomputing.github.io/intake_dates", "topic": "Intake Date"}     


]
web_data = []
for link in links:
    data = scrape_web_page(link["url"], topic=link["topic"])
    if data:
        web_data.extend(data)

# Combine PDF data and web-scraped data
all_data = pdf_data + web_data

# Print and save the aggregated data
for doc in all_data:
    print(f"Title: {doc['title']}")
    print(f"Content: {doc['content']}")  # Truncate for readability
    print(f"Topic: {doc['topic']}")
    print(f"Keywords: {', '.join(doc['keywords'])}")
    print(f"Source: {doc['source']}")  # Print source if available
    print("-" * 40)

with open("aggregated_data.json", "w") as f:
    json.dump(all_data, f, indent=4)


In [None]:
from flask import Flask, request, jsonify, render_template_string
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import os

# HTML template as a string
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html>
<head>
    <title>Question Answering System</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        .container {
            margin-top: 20px;
        }
        #question {
            width: 100%;
            padding: 10px;
            margin-bottom: 10px;
        }
        #response {
            margin-top: 20px;
            white-space: pre-wrap;
        }
        .sources {
            margin-top: 10px;
            font-size: 0.9em;
        }
        .content-link {
            color: blue; /* Set text color to blue */
            text-decoration: underline; /* Underline for links */
            cursor: pointer; /* Change cursor to pointer for links */
        }
    </style>
</head>
<body>
    <h1>Ask a Question</h1>
    <div class="container">
        <textarea id="question" rows="4" placeholder="Enter your question here..."></textarea>
        <button onclick="askQuestion()">Submit</button>
        <div id="response"></div>
        <div id="sources" class="sources"></div>
    </div>

    <script>
        async function askQuestion() {
            const question = document.getElementById('question').value;
            const responseDiv = document.getElementById('response');
            const sourcesDiv = document.getElementById('sources');
            
            responseDiv.innerHTML = 'Loading...';
            sourcesDiv.innerHTML = '';

            try {
                const response = await fetch('/ask', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({ question: question }),
                });

                const data = await response.json();
                responseDiv.innerHTML = data.answer;
                
                if (data.sources && data.sources.length > 0) {
                    sourcesDiv.innerHTML = '<h3>Sources:</h3>' + 
                        data.sources.map(source => 
                            `<p><a href="${source.url}" target="_blank" class="content-link">${source.title}</a></p>`
                        ).join('');
                }
            } catch (error) {
                responseDiv.innerHTML = 'Error: ' + error.message;
            }
        }
    </script>
</body>
</html>
'''

# Configure Gemini directly with API key
genai.configure(api_key="")
model = genai.GenerativeModel('gemini-pro')

app = Flask(__name__)

try:
    # Load the data
    with open('aggregated_data.json', 'r', encoding='utf-8') as f:
        knowledge_base = json.load(f)
except FileNotFoundError:
    raise FileNotFoundError("aggregated_data.json not found. Please make sure it exists in the same directory as app.py")

# Prepare documents for TF-IDF
documents = [doc['content'] for doc in knowledge_base]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

def get_relevant_documents(query, top_k=3):
    # Transform the query
    query_vector = vectorizer.transform([query])
    
    # Calculate similarity
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    
    # Get top k most similar documents
    top_indices = similarities.argsort()[-top_k:][::-1]
    
    relevant_docs = []
    for idx in top_indices:
        doc = knowledge_base[idx]
        relevant_docs.append({
            'content': doc['content'],
            'source': doc['source'],
            'title': doc['title']
        })
    
    return relevant_docs

@app.route('/')
def home():
    return render_template_string(HTML_TEMPLATE)

@app.route('/ask', methods=['POST'])
def ask():
    try:
        data = request.json
        question = data['question']
        
        # Get relevant documents
        relevant_docs = get_relevant_documents(question)
        
        # Prepare context for Gemini
        context = "Based on the following information:\n\n"
        for doc in relevant_docs:
            # Format content to include clickable links
            content_with_links = doc['content'].replace("http", "<a href='http").replace(" ", "' target='_blank'> </a>")
            context += f"<div class='content-link'>{content_with_links}</div>\n\n"
        
        prompt = f"""{context}
        
        Question: {question}
        
        Please provide a clear and concise answer based only on the information provided above. 
        If the information is not sufficient to answer the question, please say so."""
        
        # Generate response using Gemini
        response = model.generate_content(prompt)
        
        # Prepare sources information
        sources = []
        for doc in relevant_docs:
            if doc['source']:  # Only include if source URL exists
                sources.append({
                    'url': doc['source'],
                    'title': doc['title']
                })
        
        return jsonify({
            'answer': response.text,
            'sources': sources
        })
        
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [31/Jan/2025 11:33:46] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:34:12] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:34:36] "POST /ask HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:35:44] "POST /ask HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:36:14] "POST /ask HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:36:43] "POST /ask HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:38:17] "POST /ask HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:40:25] "POST /ask HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:42:06] "POST /ask HTTP/1.1" 200 -
127.0.0.1 - - [31/Jan/2025 11:43:00] "POST /ask HTTP/1.1" 200 -


In [3]:
from flask import Flask, request, jsonify, render_template_string
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import os

# HTML template as a string
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html>
<head>
    <title>Question Answering System</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        .container {
            margin-top: 20px;
        }
        #question {
            width: 100%;
            padding: 10px;
            margin-bottom: 10px;
        }
        #response {
            margin-top: 20px;
            white-space: pre-wrap;
        }
        .sources {
            margin-top: 10px;
            font-size: 0.9em;
        }
    </style>
</head>
<body>
    <h1>Ask a Question</h1>
    <div class="container">
        <textarea id="question" rows="4" placeholder="Enter your question here..."></textarea>
        <button onclick="askQuestion()">Submit</button>
        <div id="response"></div>
        <div id="sources" class="sources"></div>
    </div>

    <script>
        async function askQuestion() {
            const question = document.getElementById('question').value;
            const responseDiv = document.getElementById('response');
            const sourcesDiv = document.getElementById('sources');
            
            responseDiv.innerHTML = 'Loading...';
            sourcesDiv.innerHTML = '';

            try {
                const response = await fetch('/ask', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({ question: question }),
                });

                const data = await response.json();
                responseDiv.innerHTML = data.answer;
                
                if (data.sources && data.sources.length > 0) {
                    sourcesDiv.innerHTML = '<h3>Sources:</h3>' + 
                        data.sources.map(source => 
                            `<p><a href="${source.url}" target="_blank">${source.title}</a></p>`
                        ).join('');
                }
            } catch (error) {
                responseDiv.innerHTML = 'Error: ' + error.message;
            }
        }
    </script>
</body>
</html>
'''

# Configure Gemini directly with API key
genai.configure(api_key="")
model = genai.GenerativeModel('gemini-pro')

app = Flask(__name__)

try:
    # Load the data
    with open('aggregated_data.json', 'r', encoding='utf-8') as f:
        knowledge_base = json.load(f)
except FileNotFoundError:
    raise FileNotFoundError("aggregated_data.json not found. Please make sure it exists in the same directory as app.py")

# Prepare documents for TF-IDF
documents = [doc['content'] for doc in knowledge_base]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

def get_relevant_documents(query, top_k=3):
    # Transform the query
    query_vector = vectorizer.transform([query])
    
    # Calculate similarity
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    
    # Get top k most similar documents
    top_indices = similarities.argsort()[-top_k:][::-1]
    
    relevant_docs = []
    for idx in top_indices:
        doc = knowledge_base[idx]
        relevant_docs.append({
            'content': doc['content'],
            'source': doc['source'],
            'title': doc['title']
        })
    
    return relevant_docs

@app.route('/')
def home():
    return render_template_string(HTML_TEMPLATE)

@app.route('/ask', methods=['POST'])
def ask():
    try:
        data = request.json
        question = data['question']
        
        # Get relevant documents
        relevant_docs = get_relevant_documents(question)
        
        # Prepare context for Gemini
        context = "Based on the following information:\n\n"
        for doc in relevant_docs:
            context += f"Document: {doc['content']}\n\n"
        
        prompt = f"""{context}
        
        Question: {question}
        
        Please provide a clear and concise answer based only on the information provided above. 
        If the information is not sufficient to answer the question, please say so."""
        
        # Generate response using Gemini
        response = model.generate_content(prompt)
        
        # Prepare sources information
        sources = []
        for doc in relevant_docs:
            if doc['source']:  # Only include if source URL exists
                sources.append({
                    'url': doc['source'],
                    'title': doc['title']
                })
        
        return jsonify({
            'answer': response.text,
            'sources': sources
        })
        
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [30/Jan/2025 12:34:32] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [30/Jan/2025 12:34:44] "POST /ask HTTP/1.1" 200 -
127.0.0.1 - - [30/Jan/2025 12:35:02] "POST /ask HTTP/1.1" 200 -


In [None]:
############Improved Version

In [1]:
import gradio as gr
from flask import Flask
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

# Configure Gemini with API key
genai.configure(api_key="")
model = genai.GenerativeModel('gemini-pro')

# Flask app for other purposes if needed
app = Flask(__name__)

# Load knowledge base
with open('aggregated_data.json', 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)

# Prepare documents for TF-IDF
documents = [doc['content'] for doc in knowledge_base]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

def get_relevant_documents(query, top_k=3):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    top_indices = similarities.argsort()[-top_k:][::-1]
    relevant_docs = [knowledge_base[idx] for idx in top_indices]
    return relevant_docs

def answer_question(question):
    # Get relevant documents
    relevant_docs = get_relevant_documents(question)
    
    # Prepare context for Gemini
    context = "Based on the following information:\n\n"
    for doc in relevant_docs:
        context += f"Document: {doc['content']}\n\n"
    
    prompt = f"""{context}
    
    Question: {question}
    
    Please provide a clear and concise answer based only on the information provided above. 
    If the information is not sufficient to answer the question, please say so."""
    
    # Generate response using Gemini
    response = model.generate_content(prompt)
    
    # Format sources
    sources = [
        f"{doc['title']}: {doc['source']}" 
        for doc in relevant_docs if 'source' in doc and doc['source']
    ]
    
    return response.text, "\n".join(sources)

# Define the Gradio interface
def gradio_interface(question):
    answer, sources = answer_question(question)
    return answer, sources

# Create Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Enter your question"),
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Sources"),
    ],
    title="Question Answering System",
    description="Ask a question, and get answers with references to relevant documents."
)

# Run the app
if __name__ == '__main__':
    iface.launch(server_name="0.0.0.0", server_port=7860)



* Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.


In [1]:
import gradio as gr
from flask import Flask, request, jsonify
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

# Configure Gemini directly with API key
genai.configure(api_key="")
model = genai.GenerativeModel('gemini-pro')

# Load the data
try:
    with open('aggregated_data.json', 'r', encoding='utf-8') as f:
        knowledge_base = json.load(f)
except FileNotFoundError:
    raise FileNotFoundError("aggregated_data.json not found. Please make sure it exists in the same directory as this script")

# Prepare documents for TF-IDF
documents = [doc['content'] for doc in knowledge_base]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

def get_relevant_documents(query, top_k=3):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    top_indices = similarities.argsort()[-top_k:][::-1]
    relevant_docs = [{'content': knowledge_base[idx]['content'], 
                      'source': knowledge_base[idx].get('source'), 
                      'title': knowledge_base[idx].get('title')} 
                     for idx in top_indices]
    return relevant_docs

def ask_question(question):
    try:
        relevant_docs = get_relevant_documents(question)
        
        context = "Based on the following information:\n\n"
        for doc in relevant_docs:
            context += f"Document: {doc['content']}\n\n"
        
        prompt = f"""{context}
        
        Question: {question}
        
        Please provide a clear and concise answer based only on the information provided above. 
        If the information is not sufficient to answer the question, please say so."""
        
        response = model.generate_content(prompt)
        
        sources = [{'url': doc['source'], 'title': doc['title']} 
                   for doc in relevant_docs if doc.get('source')]
        
        return response.text, sources
    except Exception as e:
        return f"Error: {str(e)}", []

# Gradio interface
def gradio_interface(question):
    answer, sources = ask_question(question)
    sources_html = "<br>".join([f"<a href='{src['url']}' target='_blank'>{src['title']}</a>" for src in sources])
    return answer, sources_html

interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(lines=4, placeholder="Enter your question here..."),
    outputs=[gr.Textbox(label="Answer"), gr.HTML(label="Sources")],
    live=True
)

if __name__ == "__main__":
    interface.launch(share=True)
import gradio as gr
from flask import Flask, request, jsonify
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

# Configure Gemini directly with API key
genai.configure(api_key="")
model = genai.GenerativeModel('gemini-pro')

# Load the data
try:
    with open('aggregated_data.json', 'r', encoding='utf-8') as f:
        knowledge_base = json.load(f)
except FileNotFoundError:
    raise FileNotFoundError("aggregated_data.json not found. Please make sure it exists in the same directory as this script")

# Prepare documents for TF-IDF
documents = [doc['content'] for doc in knowledge_base]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

def get_relevant_documents(query, top_k=3):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    top_indices = similarities.argsort()[-top_k:][::-1]
    relevant_docs = [{'content': knowledge_base[idx]['content'], 
                      'source': knowledge_base[idx].get('source'), 
                      'title': knowledge_base[idx].get('title')} 
                     for idx in top_indices]
    return relevant_docs

def ask_question(question):
    try:
        relevant_docs = get_relevant_documents(question)
        
        context = "Based on the following information:\n\n"
        for doc in relevant_docs:
            context += f"Document: {doc['content']}\n\n"
        
        prompt = f"""{context}
        
        Question: {question}
        
        Please provide a clear and concise answer based only on the information provided above. 
        If the information is not sufficient to answer the question, please say so."""
        
        response = model.generate_content(prompt)
        
        sources = [{'url': doc['source'], 'title': doc['title']} 
                   for doc in relevant_docs if doc.get('source')]
        
        return response.text, sources
    except Exception as e:
        return f"Error: {str(e)}", []

# Gradio interface
def gradio_interface(question):
    answer, sources = ask_question(question)
    sources_html = "<br>".join([f"<a href='{src['url']}' target='_blank'>{src['title']}</a>" for src in sources])
    return answer, sources_html

interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(lines=4, placeholder="Enter your question here..."),
    outputs=[gr.Textbox(label="Answer"), gr.HTML(label="Sources")],
    live=True
)

if __name__ == "__main__":
    interface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://38aca9a2014da9a821.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://0d453080f2f480e0d1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import gradio as gr
from flask import Flask, request, jsonify
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import os

# Configure Gemini directly with API key
genai.configure(api_key="")
model = genai.GenerativeModel('gemini-pro')

# Load the data
try:
    with open('aggregated_data.json', 'r', encoding='utf-8') as f:
        knowledge_base = json.load(f)
except FileNotFoundError:
    raise FileNotFoundError("aggregated_data.json not found. Please make sure it exists in the same directory as this script")

# Prepare documents for TF-IDF
documents = [doc['content'] for doc in knowledge_base]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

def get_relevant_documents(query, top_k=3):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    top_indices = similarities.argsort()[-top_k:][::-1]
    relevant_docs = [{'content': knowledge_base[idx]['content'], 
                      'source': knowledge_base[idx].get('source'), 
                      'title': knowledge_base[idx].get('title')} 
                     for idx in top_indices]
    return relevant_docs

def ask_question(question):
    try:
        relevant_docs = get_relevant_documents(question)
        
        context = "Based on the following information:\n\n"
        for doc in relevant_docs:
            context += f"Document: {doc['content']}\n\n"
        
        prompt = f"""{context}
        
        Question: {question}
        
        Please provide a clear and concise answer based only on the information provided above. 
        If the information is not sufficient to answer the question, please say so."""
        
        response = model.generate_content(prompt)
        
        sources = [{'url': doc['source'], 'title': doc['title']} 
                   for doc in relevant_docs if doc.get('source')]
        
        return response.text, sources
    except Exception as e:
        return f"Error: {str(e)}", []

# Gradio interface wrapped in Flask
app = Flask(__name__)

@app.route('/ask', methods=['POST'])
def ask():
    question = request.json.get('question')
    answer, sources = ask_question(question)
    return jsonify({'answer': answer, 'sources': sources})

# Gradio interface
def gradio_interface(question):
    answer, sources = ask_question(question)
    sources_html = "<br>".join([f"<a href='{src['url']}' target='_blank'>{src['title']}</a>" for src in sources])
    return answer, sources_html

# Launch Gradio interface
interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(lines=4, placeholder="Enter your question here..."),
    outputs=[gr.Textbox(label="Answer"), gr.HTML(label="Sources")],
    live=True
)

if __name__ == "__main__":
    # Use the port specified by Cloud Run
    port = int(os.environ.get("PORT", 8080))
    app.run(host='0.0.0.0', port=port)

# In[ ]:






 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8080
 * Running on http://192.168.1.129:8080
Press CTRL+C to quit


In [None]:
from flask import Flask, request, jsonify, render_template_string
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import os

# HTML template as a string
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html>
<head>
    <title>Question Answering System</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        .container {
            margin-top: 20px;
        }
        #question {
            width: 100%;
            padding: 10px;
            margin-bottom: 10px;
        }
        #response {
            margin-top: 20px;
            white-space: pre-wrap;
        }
        .sources {
            margin-top: 10px;
            font-size: 0.9em;
        }
    </style>
</head>
<body>
    <h1>Ask a Question</h1>
    <div class="container">
        <textarea id="question" rows="4" placeholder="Enter your question here..."></textarea>
        <button onclick="askQuestion()">Submit</button>
        <div id="response"></div>
        <div id="sources" class="sources"></div>
    </div>

    <script>
        async function askQuestion() {
            const question = document.getElementById('question').value;
            const responseDiv = document.getElementById('response');
            const sourcesDiv = document.getElementById('sources');
            
            responseDiv.innerHTML = 'Loading...';
            sourcesDiv.innerHTML = '';

            try {
                const response = await fetch('/ask', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({ question: question }),
                });

                const data = await response.json();
                responseDiv.innerHTML = data.answer;
                
                if (data.sources && data.sources.length > 0) {
                    sourcesDiv.innerHTML = '<h3>Sources:</h3>' + 
                        data.sources.map(source => 
                            `<p><a href="${source.url}" target="_blank">${source.title}</a></p>`
                        ).join('');
                }
            } catch (error) {
                responseDiv.innerHTML = 'Error: ' + error.message;
            }
        }
    </script>
</body>
</html>
'''

# Configure Gemini directly with API key
genai.configure(api_key="")
model = genai.GenerativeModel('gemini-pro')

app = Flask(__name__)

try:
    # Load the data
    with open('aggregated_data.json', 'r', encoding='utf-8') as f:
        knowledge_base = json.load(f)
except FileNotFoundError:
    raise FileNotFoundError("aggregated_data.json not found. Please make sure it exists in the same directory as app.py")

# Prepare documents for TF-IDF
documents = [doc['content'] for doc in knowledge_base]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

def get_relevant_documents(query, top_k=3):
    # Transform the query
    query_vector = vectorizer.transform([query])
    
    # Calculate similarity
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    
    # Get top k most similar documents
    top_indices = similarities.argsort()[-top_k:][::-1]
    
    relevant_docs = []
    for idx in top_indices:
        doc = knowledge_base[idx]
        relevant_docs.append({
            'content': doc['content'],
            'source': doc['source'],
            'title': doc['title']
        })
    
    return relevant_docs

@app.route('/')
def home():
    return render_template_string(HTML_TEMPLATE)

@app.route('/ask', methods=['POST'])
def ask():
    try:
        data = request.json
        question = data['question']
        
        # Get relevant documents
        relevant_docs = get_relevant_documents(question)
        
        # Prepare context for Gemini
        context = "Based on the following information:\n\n"
        for doc in relevant_docs:
            context += f"Document: {doc['content']}\n\n"
        
        prompt = f"""{context}
        
        Question: {question}
        
        Please provide a clear and concise answer based only on the information provided above. 
        If the information is not sufficient to answer the question, please say so."""
        
        # Generate response using Gemini
        response = model.generate_content(prompt)
        
        # Prepare sources information
        sources = []
        for doc in relevant_docs:
            if doc['source']:  # Only include if source URL exists
                sources.append({
                    'url': doc['source'],
                    'title': doc['title']
                })
        
        return jsonify({
            'answer': response.text,
            'sources': sources
        })
        
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    # Use the port specified by Cloud Run
    port = int(os.environ.get("PORT", 8080))
    app.run(host='0.0.0.0', port=port)

  from pandas.core import (


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8080
 * Running on http://192.168.1.129:8080
Press CTRL+C to quit
192.168.1.129 - - [18/Feb/2025 12:51:25] "GET / HTTP/1.1" 200 -
192.168.1.129 - - [18/Feb/2025 12:51:25] "GET /favicon.ico HTTP/1.1" 404 -
192.168.1.129 - - [18/Feb/2025 12:51:40] "POST /ask HTTP/1.1" 200 -
192.168.1.129 - - [18/Feb/2025 12:52:23] "POST /ask HTTP/1.1" 200 -
