In [None]:
#Goal: Create PerformRetrieval Function w/ Wikipedia as External Database

!pip install -q wikipedia
!pip install -q wikipedia-api

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone


In [None]:
import wikipediaapi

# Module-level wikipedia-api client used by get_category_members below.
# Configured for the English wiki with the WIKI extract format; the
# user_agent string carries the project name/version and a contact e-mail.
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent='semantic-spillway-research-retrieval-mechanism/1.0 (techsuryas@gmail.com)'
)

# Seed Wikipedia categories; each one is expanded into article titles below.
categories = [
    "Science",
    "Technology",
    "Medicine",
    "Politics",
    "History",
    "Current events",
    "Mathematics",
    "Philosophy",
    "Ethics",
    "Computer science",
    "Artificial intelligence",
    "Physics",
    "Biology",
    "Law",
    "Economics",
    "Chemistry",
    "Music",
]

# Function to collect page titles from a category
def get_category_members(category_name, max_depth=1, max_pages=20):
    """Collect article titles from a Wikipedia category and its sub-categories.

    The category tree is walked depth-first in the order the members are
    listed. Titles of main-namespace pages are collected; sub-categories are
    descended into while their depth does not exceed ``max_depth``.

    Args:
        category_name: Category name without the ``"Category:"`` prefix.
        max_depth: Deepest sub-category level to expand (0 = only the
            category itself).
        max_pages: Maximum number of titles to return. ``None`` means no
            limit; zero or a negative value yields an empty list.

    Returns:
        A list of at most ``max_pages`` page titles, in traversal order.
        The same title may appear more than once if it is reachable through
        several sub-categories.
    """
    if max_pages is not None and max_pages <= 0:
        return []

    cat = wiki_wiki.page("Category:" + category_name)
    pages = []

    def is_full():
        return max_pages is not None and len(pages) >= max_pages

    def add_members(category, depth):
        if depth > max_depth:
            return
        for page in category.categorymembers.values():
            # Stop as soon as the budget is filled: every further
            # sub-category we would open costs another API request whose
            # results would be discarded anyway.
            if is_full():
                return
            if page.ns == wikipediaapi.Namespace.MAIN and not page.title.startswith("Category:"):
                pages.append(page.title)
            elif page.ns == wikipediaapi.Namespace.CATEGORY:
                add_members(page, depth + 1)

    add_members(cat, 0)
    return pages

# Collect all unique page titles from Wikipedia
# Union of the titles found under every seed category (duplicates collapse).
all_topics = set()
for category in categories:
    print(f"Fetching topics from: {category}")
    topics = get_category_members(category, max_pages=25, max_depth=1)
    all_topics |= set(topics)

print(f"Total topics collected: {len(all_topics)}")

Fetching topics from: Science
Fetching topics from: Technology
Fetching topics from: Medicine
Fetching topics from: Politics
Fetching topics from: History
Fetching topics from: Current events
Fetching topics from: Mathematics
Fetching topics from: Philosophy
Fetching topics from: Ethics
Fetching topics from: Computer science
Fetching topics from: Artificial intelligence
Fetching topics from: Physics
Fetching topics from: Biology
Fetching topics from: Law
Fetching topics from: Economics
Fetching topics from: Chemistry
Total topics collected: 392


In [None]:
import wikipedia

# Download each collected article and write it to the corpus file.
# Failed look-ups are reported below instead of being written into the file:
# the corpus is embedded later, so error text would become retrievable
# "knowledge".
skipped_topics = []

with open("common_knowledge.txt", "w", encoding="utf-8") as f:
    # sorted() gives a reproducible article order; iterating the set directly
    # can produce a different order on every kernel start.
    for topic in sorted(all_topics):
        try:
            # The titles are exact (they come from category listings), so
            # disable auto_suggest, which may swap them for another article.
            content = wikipedia.page(topic, auto_suggest=False).content
        except wikipedia.exceptions.DisambiguationError as e:
            skipped_topics.append((topic, f"Disambiguation: {e.options[:5]}"))
            continue
        except Exception as e:
            # Best effort: one bad page must not abort the whole download.
            skipped_topics.append((topic, f"Error: {str(e)}"))
            continue
        f.write(f"== {topic} ==\n")
        f.write(content + "\n\n")

print(f"Wrote {len(all_topics) - len(skipped_topics)} articles, skipped {len(skipped_topics)}")
for topic, reason in skipped_topics:
    print(f"  skipped {topic!r}: {reason}")

In [None]:
!pip install faiss-cpu openai tiktoken

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
import os
import re
import tiktoken

# Load corpus
# Read the whole corpus file written by the download cell into memory.
with open("common_knowledge.txt", "r", encoding="utf-8") as f:
    corpus_text = f.read()

# Tokenizer for length control (used as a global by the paragraph-based
# chunk_text defined in this cell).
tokenizer = tiktoken.get_encoding("cl100k_base")

# Simple paragraph-based chunking: a chunk is closed once it reaches max_tokens tokens (it may overshoot the limit)
def chunk_text(text, max_tokens=300):
    """Split ``text`` into chunks of whole lines of roughly ``max_tokens`` tokens.

    Lines (split on ``"\\n"``) are accumulated until the accumulated text
    encodes to at least ``max_tokens`` tokens with the module-level
    ``tokenizer``; the chunk is then closed. Lines are never split, so a
    chunk can exceed ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Token count at which the current chunk is closed.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings. Whitespace-only
        remainders (e.g. trailing blank lines) are dropped rather than
        emitted as empty chunks.

    Note:
        A later cell redefines ``chunk_text`` with a word-based variant.
    """
    paragraphs = text.split("\n")
    chunks = []
    current_chunk = ""

    def flush(candidate):
        # Never emit an empty chunk: it carries no content to embed.
        stripped = candidate.strip()
        if stripped:
            chunks.append(stripped)

    for para in paragraphs:
        current_chunk += para + "\n"
        if len(tokenizer.encode(current_chunk)) >= max_tokens:
            flush(current_chunk)
            current_chunk = ""
    flush(current_chunk)
    return chunks

# Chunk the corpus with the paragraph-based chunk_text defined above.
# Note: `chunks` is assigned again further down, after chunk_text is redefined.
chunks = chunk_text(corpus_text, max_tokens=300)
print(f"Total chunks: {len(chunks)}")

Total chunks: 2714


In [None]:
from openai import OpenAI
import os

# Read the key from the OPENAI_API_KEY environment variable when it is set;
# otherwise prompt with getpass so the secret is not echoed into the saved
# notebook output (input() would display it).
from getpass import getpass

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("Enter your API Key Here"))

In [None]:
import faiss
import numpy as np
import tiktoken
import pickle

def count_tokens(text, model="text-embedding-3-small"):
    """Return the number of tokens in ``text`` under tiktoken's encoding for ``model``."""
    encoding = tiktoken.encoding_for_model(model)
    token_ids = encoding.encode(text)
    return len(token_ids)

def chunk_text(text, max_tokens=300, model="text-embedding-3-small"):
    """Split ``text`` into whitespace-normalised chunks of about ``max_tokens`` tokens.

    Words (split on any whitespace) are appended one at a time; as soon as the
    space-joined words encode to at least ``max_tokens`` tokens the chunk is
    closed. Original line breaks are not preserved.

    Args:
        text: The text to split.
        max_tokens: Token count at which the current chunk is closed.
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Note:
        This replaces the paragraph-based ``chunk_text`` defined in an
        earlier cell.
    """
    # Resolve the encoding once instead of once per word.
    enc = tiktoken.encoding_for_model(model)
    chunks = []
    current_chunk = []

    for word in text.split():
        current_chunk.append(word)
        # Measure the joined text itself so the boundary reflects the exact
        # string that will be stored as the chunk.
        candidate = " ".join(current_chunk)
        if len(enc.encode(candidate)) >= max_tokens:
            chunks.append(candidate)
            current_chunk = []

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
def embed_chunks(chunks, model="text-embedding-3-small", batch_size=100):
    """Embed every chunk with the module-level OpenAI ``client``.

    Chunks are sent in batches so that embedding a large corpus needs one
    request per ``batch_size`` chunks rather than one request per chunk.

    Args:
        chunks: Sequence of strings to embed.
        model: Embedding model name passed to the API.
        batch_size: Number of chunks per request; must be at least 1.

    Returns:
        A float32 array with one row per chunk, in the same order as
        ``chunks``. For an empty input the result has shape ``(0, 0)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If the API returns a different number of embeddings
            than inputs for a batch.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    chunks = list(chunks)
    if not chunks:
        # Keep the result 2-D so callers can still read .shape[1].
        return np.empty((0, 0), dtype='float32')

    embeddings = []
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        response = client.embeddings.create(
            input=batch,
            model=model
        )
        # Each returned item carries the position of its input; sort on it so
        # rows line up with `chunks` regardless of response ordering.
        ordered = sorted(response.data, key=lambda item: item.index)
        if len(ordered) != len(batch):
            raise RuntimeError(
                f"expected {len(batch)} embeddings, received {len(ordered)}"
            )
        embeddings.extend(item.embedding for item in ordered)
    return np.array(embeddings, dtype='float32')

In [None]:
# Path of the corpus file produced by the download cell.
corpus_path = "common_knowledge.txt"

with open(corpus_path, "r", encoding="utf-8") as f:
    full_text = f.read()

# Uses the most recently defined chunk_text with its default arguments;
# this overwrites the `chunks` list produced by the earlier chunking cell.
chunks = chunk_text(full_text)
print(f"Generated {len(chunks)} chunks...")

# Calls the embeddings API for every chunk via embed_chunks.
embeddings = embed_chunks(chunks)

Generated 3005 chunks...


In [None]:
# Flat L2 index whose dimensionality is taken from the embedding matrix
# (embeddings.shape[1]); row i of the index corresponds to chunks[i].
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Save index and chunks for reuse
faiss.write_index(index, "knowledge_index.faiss")
# NOTE(review): pickle files should only be loaded back from a trusted source.
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

In [None]:
def retrieve_relevant_chunks(query, k=5, model="text-embedding-3-small"):
    """Return the stored chunks closest to ``query`` in embedding space.

    The query is embedded with the module-level OpenAI ``client`` and looked
    up in the module-level FAISS ``index``; hits are mapped back to text via
    the module-level ``chunks`` list.

    Args:
        query: The search text.
        k: Maximum number of chunks to return.
        model: Embedding model name; should be the model the index was
            built with.

    Returns:
        Up to ``k`` chunk strings in the order the index returns them. Fewer
        are returned when the index holds fewer than ``k`` vectors; an empty
        list when the index is empty or ``k`` is not positive.
    """
    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    response = client.embeddings.create(
        input=[query],
        model=model
    )
    query_vector = np.array(response.data[0].embedding, dtype='float32').reshape(1, -1)

    _, neighbor_ids = index.search(query_vector, k)
    # FAISS marks missing results with -1; without this filter chunks[-1]
    # would silently return the last chunk.
    return [chunks[i] for i in neighbor_ids[0] if i >= 0]

In [None]:
# Smoke test: retrieve the default number of chunks (k=5) for a sample question.
query = "Who is Kendrick Lamar?"
top_chunks = retrieve_relevant_chunks(query)

# Print each hit with a 1-based rank.
for i, chunk in enumerate(top_chunks):
    print(f"\nChunk {i+1}:\n{chunk}")



Chunk 1:
K-pop fans are cited as participating in everything from human rights campaigns to education programs throughout the years, often in tribute or honor of the idols they love. In 2020, large subsets of the K-pop community began a movement to disrupt a rally being held by Donald Trump as part of his reelection campaign. The rally was held at Tulsa, Oklahoma's BOK Center, with a 19,000-seat capacity. The Trump campaign in 2016 reported receiving one million ticket requests for the event. Despite the capacity and requests, the Tulsa Fire Department reported that the fire marshal counted 6,200 scanned tickets of attendees. Many believe this to be the work of the K-pop fans who began requesting tickets in large quantities following a tweet by the Trump campaign inviting supporters to register for free tickets. They did this knowing they would not attend and shared the instruction for others to do so on their social media platforms. The social media posts, especially on TikTok, garne