In [7]:
import PyPDF2
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import json
import os

# Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to generate keywords using TF-IDF
def generate_keywords_tfidf(content, top_n=5):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=top_n)
    X = vectorizer.fit_transform([content])
    keywords = vectorizer.get_feature_names_out()
    return list(keywords)

# Function to scrape a web page
def scrape_web_page(url, topic="General"):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch page: {response.status_code}")
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    documents = []
    titles = soup.find_all('h2')  # Adjust this tag based on page structure
    for title in titles:
        question = title.text.strip()
        answer = title.find_next('p').text.strip() if title.find_next('p') else "No answer found"
        content = question + " " + answer
        keywords = generate_keywords_tfidf(content)
        documents.append({
            "title": question,
            "content": answer,
            "topic": topic,
            "keywords": keywords
        })
    return documents

# Extract text from PDFs
pdf_paths = [
    "C:/Users/omidm/Downloads/RCP#0032 Intake 10 Student Internship Summary reports (share with all).pdf",
    "C:/Users/omidm/Downloads/Research Computing Platform Student Internship Handbook.pdf",
    "C:/Users/omidm/Downloads/RCP0026 Welcome Students Semeter 2 Intake 10.pdf"
]
pdf_data = []

for pdf_path in pdf_paths:
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_keywords = generate_keywords_tfidf(pdf_text)
    
    # Extract the file name (without extension) from the file path
    file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    
    pdf_data.append({
        "title": f"Content from {file_name}",
        "content": pdf_text,
        "topic": file_name,  # Use the file name as the topic
        "keywords": pdf_keywords
    })


# Scrape web pages
links = [
    {"url": "https://wehi-researchcomputing.github.io/students", "topic": "Unpaid Student Internship Program"},
    {"url": "https://wehi-researchcomputing.github.io/complex-projects", "topic": "complex ambiguous projects"},
    {"url": "https://wehi-researchcomputing.github.io/software_maturity_model", "topic": "software maturity model"},
    {"url": "https://wehi-researchcomputing.github.io/explanation_about_ohs", "topic": "explanation about ohs"},
    {"url": "https://wehi-researchcomputing.github.io/top-5-mistakes", "topic": "top 5 mistakes"},
    {"url": "https://wehi-researchcomputing.github.io/project-wikis", "topic": "project wikis"},
    {"url": "https://wehi-researchcomputing.github.io/student-loxcoder", "topic": "loxcoder"},
    {"url": "https://wehi-researchcomputing.github.io/student-data-commons", "topic": "student data commons"},
    {"url": "https://wehi-researchcomputing.github.io/student-cryoem", "topic": "cryoem"},
    {"url": "https://wehi-researchcomputing.github.io/student-genomics-qc", "topic": "genomics quantum computing"},
    {"url": "https://wehi-researchcomputing.github.io/student-schex", "topic": "schex"},
    {"url": "https://wehi-researchcomputing.github.io/student-mixOmics.html", "topic": "mixOmics"},
    {"url": "https://wehi-researchcomputing.github.io/student-capacity-planning.html", "topic": "capacity planning"},
    {"url": "https://wehi-researchcomputing.github.io/student-haemosphere", "topic": "haemosphere"},
    {"url": "https://wehi-researchcomputing.github.io/student-imaging", "topic": "imaging"},
    {"url": "https://wehi-researchcomputing.github.io/student-quantum", "topic": "quantum"},
    {"url": "https://wehi-researchcomputing.github.io/student-genomics-metadata.html", "topic": "genomics metadata"},
    {"url": "https://wehi-researchcomputing.github.io/student-aive", "topic": "aive"},
    {"url": "https://wehi-researchcomputing.github.io/student-bionix", "topic": "bionix"},
    {"url": "https://wehi-researchcomputing.github.io/student-clinical-dashboards", "topic": "clinical dashboards"},
    {"url": "https://wehi-researchcomputing.github.io/email_acknowledgement", "topic": "email acknowledgement"},
    {"url": "https://wehi-researchcomputing.github.io/code-of-conduct", "topic": "code of conduct"},
    {"url": "https://wehi-researchcomputing.github.io/faq", "topic": "FAQ"}       
]
web_data = []
for link in links:
    data = scrape_web_page(link["url"], topic=link["topic"])
    if data:
        web_data.extend(data)

# Combine PDF data and web-scraped data
all_data = pdf_data + web_data

# Print and save the aggregated data
for doc in all_data:
    print(f"Title: {doc['title']}")
    print(f"Content: {doc['content'][:500]}...")  # Truncate for readability
    print(f"Topic: {doc['topic']}")
    print(f"Keywords: {', '.join(doc['keywords'])}")
    print("-" * 40)

with open("aggregated_data.json", "w") as f:
    json.dump(all_data, f, indent=4)


Title: Content from RCP#0032 Intake 10 Student Internship Summary reports (share with all)
Content: RCP#0032
Intake
10
Student 
Internship
Summary
reports
Table
of
Contents
Link
to
Intake
9
Summary
Report
RCP#0016
Intake
9
Student
Internship
Summary
reports.pdf
that
can
be
used
as
an
example.AIVE
Student
Project
Interns:
Si
Yang
(Sean)
Chen,
Chun-Tung
(Chloe)
Tsai,
Jiawen
Deng
High-Level
Domain
work
During
the
first
4-5
weeks,
we
learned
about
and
tried
to
understand
the
AIVE
workflow
for
converting
2D
cell
image
stacks
into
3D
models
and
familiarised
ourselves
with
software
tools
used
in
the
...
Topic: RCP#0032 Intake 10 Student Internship Summary reports (share with all)
Keywords: data, github, project, technical, work
----------------------------------------
Title: Content from Research Computing Platform Student Internship Handbook
Content: Research
Computing
Platform
Student Handbook3
4
5
6
7
8Introduction
Philosophy
Benefits for Students
Numbers behind the program
Code of Conduct

In [None]:
    {"url": "https://www.patreon.com/posts/64545194", "topic": "Help your students help you teach"},
    {"url": "https://wehi-researchcomputing.github.io/key-milestones", "topic": "Key Milestones"},
    {"url": "https://wehi-researchcomputing.github.io/student-project-outlines", "topic": "Student Project Outlines"}




    

In [8]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
from neo4j import GraphDatabase

# Connect to Neo4j
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "12345678"))


In [10]:
# Function to insert data into Neo4j
def insert_into_neo4j(documents):
    with driver.session() as session:
        for doc in documents:
            # Create Document node
            session.run(
                """
                MERGE (d:Document {title: $title, content: $content, topic: $topic})
                """,
                title=doc['title'], content=doc['content'], topic=doc['topic']
            )
            
            # Create Keyword nodes and relationships
            for keyword in doc['keywords']:
                session.run(
                    """
                    MERGE (k:Keyword {name: $keyword})
                    MERGE (d:Document {title: $title})
                    MERGE (d)-[:HAS_KEYWORD]->(k)
                    """,
                    keyword=keyword, title=doc['title']
                )

# Insert data into Neo4j
insert_into_neo4j(all_data)

# Close the Neo4j
driver.close()


In [11]:
from neo4j import GraphDatabase
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

# Set up Neo4j connection
class Neo4jQueryTool:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def query(self, cypher_query, params=None):
        with self.driver.session() as session:
            result = session.run(cypher_query, params)
            return list(result)

    def close(self):
        self.driver.close()

# Initialize Neo4j Tool
neo4j_tool = Neo4jQueryTool("bolt://localhost:7687", "neo4j", "12345678")

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


# Example usage
text = "Hello, how can I assist you today?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


  warn(
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Hello, how can I assist you today?

I'm sorry, but I'm not sure how to help you.

I'm sorry, but I'm not sure how to help you.

I'm sorry, but I'm


In [18]:
from neo4j import GraphDatabase
import os
import google.generativeai as genai
from IPython.display import Markdown

# Set up Neo4j connection
class Neo4jQueryTool:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def query(self, cypher_query, params=None):
        with self.driver.session() as session:
            result = session.run(cypher_query, params)
            # Explicitly fetch all records into a list before processing
            return list(result)

    def close(self):
        self.driver.close()

# Initialize Neo4j Tool
neo4j_tool = Neo4jQueryTool("bolt://localhost:7687", "neo4j", "12345678")

# Set up Google Gemini LLM API key
os.environ['GOOGLE_API_KEY'] = "AIzaSyDpTvl6KQLte4DksHn83iYDbhZlARYig8Y"  # Replace with your actual API key
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Instantiate the GenerativeModel with the 'gemini-pro' model
model = genai.GenerativeModel('gemini-pro')



In [17]:
def fetch_and_generate_content(limit=6, max_input_length=1024, max_new_tokens=200):
    try:
        # Query Neo4j to retrieve document titles and contents
        result = neo4j_tool.query("""
            MATCH (d:Document)
            RETURN d.title AS title, d.content AS content
            ORDER BY d.createdAt DESC
            LIMIT $limit
        """, {"limit": limit})
        
        # Check if any results were returned
        if not result:
            return "No documents were found in the database."
        
        # Format the result into a string suitable for the LLM
        document_info = "\n".join(
            [f"Title: {record['title']}\nContent: {record['content'][:500]}"  # Truncate content for safety
             for record in result]
        )
        
        # Construct the prompt for LLaMA
        prompt = (
            f"Here are the details of some documents:\n{document_info}\n"
            "Can you summarize the key points and provide advice about these documents?"
        )
        
        # Tokenize the input with truncation
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
        
        # Validate input_ids length
        if inputs.input_ids.shape[1] > max_input_length:
            raise ValueError(f"Input exceeds max_input_length: {inputs.input_ids.shape[1]} tokens")
        
        # Generate content using LLaMA
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
        
        # Decode the output
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    
    except Exception as e:
        return f"An error occurred: {str(e)}"
    finally:
        # Ensure the Neo4j connection is closed
        neo4j_tool.close()

# Example usage: fetch data and generate content
response_text = fetch_and_generate_content(limit=6)
print(response_text)


  with self.driver.session() as session:
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Here are the details of some documents:
Title: Content from Research Computing Platform Student Internship Handbook
Content: Research
Computing
Platform
Student Handbook3
4
5
6
7
8Introduction
Philosophy
Benefits for Students
Numbers behind the program
Code of Conduct
Want to know more?Table of ContentsIntroduction
The Research Computing Platform (RCP) is a  
collaborative, multi-disciplinary lab that
supports and advocates for researchers and
their computational research needs at WEHI.
RCP has established a 100% remote, unpaid
student internship program with subjects
provided at the University of Melbourne. We
did th
Title: Content from RCP0026 Welcome Students Semeter 2 Intake 10
Content: Walter and Eliza Hall Institute of Medical ResearchRCP Student Internship Welcome 
Summer 2024 2025
Week of 25th November 2024Acknowledgement of Country
I acknowledge the Wurundjeri and Boon Wurrung people, on
whose unceded lands some of us live and work here in
Naarm (Melbourne). I respectfully ack

In [38]:
from neo4j import GraphDatabase 
import os
import google.generativeai as genai
from IPython.display import Markdown

# Set up Neo4j connection
class Neo4jQueryTool:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def query(self, cypher_query, params=None):
        with self.driver.session() as session:
            result = session.run(cypher_query, params)
            return list(result)

    def close(self):
        self.driver.close()

# Initialize Neo4j Tool
neo4j_tool = Neo4jQueryTool("bolt://localhost:7687", "neo4j", "12345678")

# Set up Google Gemini LLM API key
os.environ['GOOGLE_API_KEY'] = "AIzaSyDpTvl6KQLte4DksHn83iYDbhZlARYig8Y"  # Replace with your actual API key
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Instantiate the GenerativeModel with the 'gemini-pro' model
model = genai.GenerativeModel('gemini-pro')

# Function to fetch all data and generate content in chunks
def fetch_and_generate_all_content(chunk_size=5000):
    # Query Neo4j for documents mentioning 'Easter'
    result = neo4j_tool.query("""
        MATCH (d:Document)
        WHERE toLower(d.title) CONTAINS 'easter' OR toLower(d.content) CONTAINS 'easter'
        RETURN d.title AS title, d.content AS content
        """)
    print(result)

    if not result:
        return "No documents mentioning Easter found in the database."

    responses = []
    for record in result:
        title = record['title']
        content = record['content']

        # Chunk the document content
        chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
        for idx, chunk in enumerate(chunks):
            prompt = f"The document titled '{title}' includes 'Easter' in its title but does not mention it in the content. Summarize the document and hypothesize the significance of Easter based on the title and content:\n{chunk}"

            try:
                response = model.generate_content(prompt)
                responses.append(f"Document '{title}':\n{response.text}")
            except Exception as e:
                responses.append(f"Error in chunk {idx + 1} for document '{title}': {str(e)}")

    return "\n\n".join(responses)


# Example usage: fetch and generate content for all data
response_text = fetch_and_generate_all_content(chunk_size=200)  # Adjust chunk size as needed
print(response_text)

# Close the Neo4j tool after use
neo4j_tool.close()


[<Record title='What happens over Christmas / New Year / Easter period?' content='We take a break between Christmas and New Year, as well as the first week of January as well. During the time away there is no need to write and send the weekly update email during the break. We do encourage you to write down questions and what you have done in your technical diary so that you can ask questions when we are back in the second week of January.'>]
Document 'What happens over Christmas / New Year / Easter period?':
**Summary of Document:**
The document outlines that there will be a break from sending weekly update emails between Christmas and New Year, and for the first week of January.

**Hypothesis on Significance of Easter:**
Despite being mentioned in the title, the document does not provide any details on Easter. However, the inclusion of Easter in the title suggests that it may also be a period where weekly updates are not sent. This hypothesis aligns with the common practice of many or

In [46]:
from flask import Flask, request, jsonify
from neo4j import GraphDatabase
import os
import google.generativeai as genai

# Flask App
app = Flask(__name__)

# Set up Neo4j connection
class Neo4jQueryTool:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def query(self, cypher_query, params=None):
        with self.driver.session() as session:
            result = session.run(cypher_query, params)
            return list(result)

    def close(self):
        self.driver.close()

# Initialize Neo4j Tool
neo4j_tool = Neo4jQueryTool("bolt://localhost:7687", "neo4j", "12345678")

# Set up Google Gemini LLM API key
os.environ['GOOGLE_API_KEY'] = "YOUR_ACTUAL_GOOGLE_API_KEY"  # Replace with your actual API key
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Instantiate the GenerativeModel with the 'gemini-pro' model
model = genai.GenerativeModel('gemini-pro')

# Function to handle queries and generate responses
def handle_user_question(question, chunk_size=5000):
    cypher_query = """
        MATCH (d:Document)
        WHERE toLower(d.title) CONTAINS $keyword OR toLower(d.content) CONTAINS $keyword
        RETURN d.title AS title, d.content AS content
    """
    params = {"keyword": question.lower()}
    result = neo4j_tool.query(cypher_query, params)

    if not result:
        return f"No documents mentioning '{question}' found in the database."

    responses = []
    for record in result:
        title = record['title']
        content = record['content']

        # Chunk the document content
        chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
        for idx, chunk in enumerate(chunks):
            prompt = (
                f"The document titled '{title}' is related to your query '{question}'. "
                f"Summarize its main points and discuss any relevant references:\n{chunk}"
            )
            try:
                response = model.generate_content(prompt)
                responses.append(f"Document '{title}':\n{response.text}")
            except Exception as e:
                responses.append(f"Error in chunk {idx + 1} for document '{title}': {str(e)}")

    return "\n\n".join(responses)

# Flask endpoint for chatbot
@app.route("/chat", methods=["POST"])
def chat():
    user_input = request.json.get("question", "")
    if not user_input:
        return jsonify({"error": "Please provide a valid question."}), 400

    try:
        response_text = handle_user_question(user_input, chunk_size=200)
        return jsonify({"response": response_text})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Close Neo4j tool gracefully on shutdown
@app.teardown_appcontext
def close_neo4j_connection(exception):
    neo4j_tool.close()

if __name__ == "__main__":
    app.run(debug=True, port=5003, use_reloader=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5003
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

In [51]:
import streamlit as st
from neo4j import GraphDatabase
import os
import google.generativeai as genai

# Set up Neo4j connection
class Neo4jQueryTool:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def query(self, cypher_query, params=None):
        with self.driver.session() as session:
            result = session.run(cypher_query, params)
            return list(result)

    def close(self):
        self.driver.close()

# Initialize Neo4j Tool
neo4j_tool = Neo4jQueryTool("bolt://localhost:7687", "neo4j", "12345678")

# Set up Google Gemini LLM API key
os.environ['GOOGLE_API_KEY'] = "YOUR_ACTUAL_GOOGLE_API_KEY"  # Replace with your actual API key
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Instantiate the GenerativeModel with the 'gemini-pro' model
model = genai.GenerativeModel('gemini-pro')

# Function to handle queries and generate responses
def handle_user_question(question, chunk_size=5000):
    cypher_query = """
        MATCH (d:Document)
        WHERE toLower(d.title) CONTAINS $keyword OR toLower(d.content) CONTAINS $keyword
        RETURN d.title AS title, d.content AS content
    """
    params = {"keyword": question.lower()}
    result = neo4j_tool.query(cypher_query, params)

    if not result:
        return f"No documents mentioning '{question}' found in the database."

    responses = []
    for record in result:
        title = record['title']
        content = record['content']

        # Chunk the document content
        chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
        for idx, chunk in enumerate(chunks):
            prompt = (
                f"The document titled '{title}' is related to your query '{question}'. "
                f"Summarize its main points and discuss any relevant references:\n{chunk}"
            )
            try:
                response = model.generate_content(prompt)
                responses.append(f"Document '{title}':\n{response.text}")
            except Exception as e:
                responses.append(f"Error in chunk {idx + 1} for document '{title}': {str(e)}")

    return "\n\n".join(responses)

# Streamlit App
st.title("Document Query and Chatbot")

# Input field for user question
user_input = st.text_input("Ask a question about the documents:", "")

if user_input:
    try:
        response_text = handle_user_question(user_input, chunk_size=200)
        st.write("Response:")
        st.text(response_text)
    except Exception as e:
        st.error(f"Error: {str(e)}")




In [52]:
from fastapi import FastAPI, HTTPException
from neo4j import GraphDatabase
import os
import google.generativeai as genai

# FastAPI App
app = FastAPI()

# Set up Neo4j connection
class Neo4jQueryTool:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def query(self, cypher_query, params=None):
        with self.driver.session() as session:
            result = session.run(cypher_query, params)
            return list(result)

    def close(self):
        self.driver.close()

# Initialize Neo4j Tool
neo4j_tool = Neo4jQueryTool("bolt://localhost:7687", "neo4j", "12345678")

# Set up Google Gemini LLM API key
os.environ['GOOGLE_API_KEY'] = "YOUR_ACTUAL_GOOGLE_API_KEY"  # Replace with your actual API key
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Instantiate the GenerativeModel with the 'gemini-pro' model
model = genai.GenerativeModel('gemini-pro')

# Function to handle queries and generate responses
def handle_user_question(question, chunk_size=5000):
    cypher_query = """
        MATCH (d:Document)
        WHERE toLower(d.title) CONTAINS $keyword OR toLower(d.content) CONTAINS $keyword
        RETURN d.title AS title, d.content AS content
    """
    params = {"keyword": question.lower()}
    result = neo4j_tool.query(cypher_query, params)

    if not result:
        return f"No documents mentioning '{question}' found in the database."

    responses = []
    for record in result:
        title = record['title']
        content = record['content']

        # Chunk the document content
        chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
        for idx, chunk in enumerate(chunks):
            prompt = (
                f"The document titled '{title}' is related to your query '{question}'. "
                f"Summarize its main points and discuss any relevant references:\n{chunk}"
            )
            try:
                response = model.generate_content(prompt)
                responses.append(f"Document '{title}':\n{response.text}")
            except Exception as e:
                responses.append(f"Error in chunk {idx + 1} for document '{title}': {str(e)}")

    return "\n\n".join(responses)

# FastAPI endpoint for chatbot
@app.post("/chat")
async def chat(user_input: str):
    if not user_input:
        raise HTTPException(status_code=400, detail="Please provide a valid question.")
    
    try:
        response_text = handle_user_question(user_input, chunk_size=200)
        return {"response": response_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Close Neo4j tool gracefully on shutdown
@app.on_event("shutdown")
def close_neo4j_connection():
    neo4j_tool.close()

# To run this, use `uvicorn main:app --reload` in the terminal


        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("shutdown")


In [53]:
!uvicorn app:app --reload

^C


In [None]:
!uvicorn app:app --reload --host 0.0.0.0 --port 8000