## Imports for web scraping

In [18]:
!pip install requests
!pip install bs4
!pip install lxml



In [11]:
import requests
from bs4 import BeautifulSoup as soup
from bs4 import Tag
from tqdm import tqdm

In [None]:
API_URL : str = "https://en.wikipedia.org/w/rest.php/v1/"
session : requests.Session = requests.Session()

visited : list = []

html_data : list = []

def get_html_document(title: str,depth : int, max_depth : int = 4, session: requests.Session = session) -> str:
  if title in visited or depth > max_depth:
    return None
  visited.append(title)

  used_url = f"{API_URL}page/{title}/html"
  response = session.get(used_url)
  return response.text

def get_titles(html : str) -> list[str]:
  parsed_document = soup(html,"lxml")

  a_tags : list[Tag] = parsed_document.find_all("a")

  related : list = []

  for a_tag in a_tags:
    if a_tag.has_attr("title"):
      related.append(a_tag["href"][2:])

  return related

queue : list = [("law",1)]

progress_bar = tqdm(total = 0, dynamic_ncols=True)

while queue:
  title,depth = queue.pop(0)
  html_document = get_html_document(title,depth)
  visited.append(title)
  if html_document:
    html_data.append(html_document)
    total = len(html_data)
    progress_bar.update(1)
    for related_title in get_titles(html_document):
      if related_title not in visited and related_title not in queue:
        queue.append((related_title,depth+1))

    progress_bar.set_description(f"Docs: {len(html_data)} | Queue: {len(queue)}")

progress_bar.close()

In [20]:
import pickle

with open('data.pkl','wb') as f:
  pickle.dump(html_data,f)

In [22]:
!du -h data.pkl

71M	data.pkl


In [61]:
import re

def parse_document(html : str) -> str:
  parsed_document = soup(html, 'lxml')
  text_elements : list[Tag] = parsed_document.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

  document : str = ""

  for element in text_elements:
    document += re.sub(r'\[\d+\]|\n|\t', '', element.getText())

  return document

In [62]:
text_only_documents : list[str] = []

for html_document in html_data:
  text_only_documents.append(parse_document(html_document))

In [63]:
text_only_documents[20]

'ErrorToo many requestsIf you report this error to the Wikimedia System Administrators, please include the details below.Request served via cp5021 cp5021, Varnish XID 473743757Upstream caches: cp5021 intError: 429, Too many requests at Sun, 27 Apr 2025 09:40:44 GMTSensitive client informationIP address: 35.229.178.106'

In [64]:
non_error_docs : list[str] = []

for i in text_only_documents:
  if "errortoo many requests".lower() not in i.lower():
    non_error_docs.append(i)

In [65]:
len(non_error_docs)

152

In [66]:
with open('text_data.pkl','wb') as f:
  pickle.dump(non_error_docs,f)

In [67]:
!du -h text_data.pkl

5.1M	text_data.pkl


In [6]:
def generate_chunks(text : str, chunk_size : int = 512) -> list[str]:
  chunks : list[str] = []

  for i in range(0,len(text),chunk_size):
    chunks.append(text[i:i+chunk_size])

  return chunks

In [69]:
all_chunks : list[str] = []

for doc in non_error_docs:
  all_chunks.extend(generate_chunks(doc))

In [70]:
len(all_chunks)

10398

In [71]:
all_chunks[0]

"Law is a set of rules that are created and are enforceable by social or governmental institutions to regulate behavior, with its precise definition a matter of longstanding debate. It has been variously described as a science and as the art of justice. State-enforced laws can be made by a legislature, resulting in statutes; by the executive through decrees and regulations; or by judges' decisions, which form precedent in common law jurisdictions. An autocrat may exercise those functions within their realm. "

In [1]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [7]:
import pickle

with open("text_data (1).pkl",'rb') as f:
  all_docs = pickle.load(f)

all_chunks : list[str] = []

for doc in all_docs:
  all_chunks.extend(generate_chunks(doc))

In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/msmarco-bert-base-dot-v5")


embeddings = model.encode(all_chunks[0])
embeddings.shape
# similarities = model.similarity(embeddings, embeddings)
# print(similarities.shape)
# # [4, 4]

(768,)

In [12]:
knowledge_base : dict = {}

for index, chunk in tqdm(enumerate(all_chunks)):

  knowledge_base[index] = model.encode(chunk)

10398it [02:14, 77.48it/s]


In [14]:
!pip install google-genai



In [20]:
with open('knowledge_base.pkl','wb') as f:
  pickle.dump(knowledge_base,f)

In [22]:
with open('chunks.pkl','wb') as f:
  pickle.dump(all_chunks,f)

In [24]:
!du -h chunks.pkl

5.2M	chunks.pkl


In [21]:
!du -h knowledge_base.pkl

31M	knowledge_base.pkl


In [15]:
type(knowledge_base[0])

numpy.ndarray

In [31]:
from google import genai
from google.genai import types
import numpy as np

client = genai.Client(api_key="")

def get_most_similarity(request: str) -> list[str]:
  request_embedding = model.encode(request)

  similarity_to_all : list[tuple] = []

  for i,j in knowledge_base.items():
    similarity : float = np.dot(request_embedding, j) / (np.linalg.norm(j)* np.linalg.norm(request_embedding))
    similarity_to_all.append((i,similarity))

  similarity_to_all.sort(key=lambda x: x[1], reverse=True)

  return similarity_to_all[:5]

def get_response(request : str, chat : list[str]) -> str:
  similar_docs = get_most_similarity(request)
  print(similar_docs)


  for doc in similar_docs:
    chat.append(all_chunks[doc[0]])

  chat.append(request)



  response = client.models.generate_content(
        model="gemini-2.0-flash", contents= " ".join(chat)
    )

  return response.text, chat

chat : list[str] = ["Using the provided documents answer the requested question"]

while True:
  response, chat = get_response(input("Enter your question: "),chat)

  print(response)
  chat.append(response)


Enter your question: laws in ancient egypt
[(1308, np.float32(0.92534757)), (508, np.float32(0.9235945)), (408, np.float32(0.9235416)), (358, np.float32(0.9207855)), (356, np.float32(0.9163487))]
Based on the provided documents, here's a summary of the laws in ancient Egypt:

*   **Pharaoh as Head of Legal System:** The pharaoh was officially the head of the legal system, responsible for enacting laws, delivering justice, and maintaining law and order (Ma'at).

*   **No Surviving Legal Codes:** No legal codes from ancient Egypt have survived.

*   **Common-Sense Approach:** Egyptian law was based on a common-sense view of right and wrong.

*   **Emphasis on Agreements:** Egyptian law emphasized reaching agreements and resolving disputes through reconciliation.

*   **Influence on Greek Law:** Egyptian law, particularly regarding women's rights (allowing them to own property and act independently), influenced the more restrictive conventions of the Greeks.

*   **Later Influence of Roma

KeyboardInterrupt: Interrupted by user