In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

%cd /content/drive/MyDrive/projects_AI/SmartWiki

Mounted at /content/drive
/content/drive/MyDrive/projects_AI/SmartWiki


In [2]:
%%writefile utils/scraper.py
# load excel file and scraping the wiki articles and storing them as txt file in data folder for each wiki
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import os

def load_excel(excel_file):
  df = pd.read_excel(excel_file)
  if "url" not in df.columns:
    raise ValueError("Excel file must have a column named 'url'")
  else:
    urls = df['url'].tolist()
  return urls

def scrape_wiki(urls):
  for url in urls:
    r = requests.get(url, headers = {"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "lxml")

    content_div = soup.find("div", {"id": "mw-content-text"})
    if content_div:
      paragraphs = content_div.find_all("p")
    else:
      return

    clean_text = []
    for p in paragraphs:
      text = p.get_text()
      text = re.sub(r"\[\d+\]"," ",text)
      clean_text.append(text.strip())
    article_text = "\n".join(clean_text)

    title = soup.find("title")
    file_name = title.get_text().replace(" - Wikipedia","")+".txt"
    output_file = os.path.join("data/raw/", file_name)
    with open(output_file, "w", encoding="utf-8") as file:
      file.write(article_text)
    print(f"Saved: {file_name}")



Overwriting utils/scraper.py


In [3]:
%%writefile utils/chunks.py
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt_tab", quiet = True)
import json
import os

def chunk_text(chunk_size):
  all_chunks = []
  for file_name in os.listdir("data/raw"):
    if file_name.endswith(".txt"):
      doc_id = file_name.replace(".txt", "")
      input_file = os.path.join("data/raw", file_name)
      with open(input_file, "r", encoding="utf-8") as file:
        text = file.read()
      sentences = sent_tokenize(text)
      chunks = []
      current_chunk = ""
      for s in sentences:
        if len(current_chunk)+ len(s)<=chunk_size:
          current_chunk += s+" "
        else:
          if current_chunk.strip():
            chunks.append(current_chunk.strip())
            current_chunk = s+" "

      if current_chunk.strip():
        chunks.append(current_chunk.strip())
      for i, chunk in enumerate(chunks):
              all_chunks.append({
                "doc_id": doc_id,
                "chunk_id": i,
                "text": chunk
              })
  output_file = os.path.join("artifacts/chunks", "final_chunks.json")
  with open(output_file, "w", encoding="utf-8") as file:
    json.dump(all_chunks, file, indent=2, ensure_ascii = False)

# def save_chunks(all_chunks, output_file):
#   with open(output_file, "w", encoding="utf-8") as file:
#     json.dump(all_chunks, file, indent=2, ensure_ascii = False)

# all_chunks = []
# for file_name in os.listdir("data/raw"):
#   if file_name.endswith(".txt"):
#     doc_id = file_name.replace(".txt", "")
#     input_file = os.path.join("data/raw", file_name)
#     with open(input_file, "r", encoding="utf-8") as file:
#       text = file.read()
#     chunks = chunk_text(text, 500)
#     for i, chunk in enumerate(chunks):
#       all_chunks.append({
#         "doc_id": doc_id,
#         "chunk_id": i,
#         "text": chunk
#       })
# output_file = os.path.join("artifacts/chunks", "final_chunks.json")
# save_chunks(all_chunks, output_file)

Overwriting utils/chunks.py


In [4]:
!pip install faiss-cpu



In [5]:
%%writefile utils/embedder.py
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import os
import json

emb_model = SentenceTransformer("all-MiniLM-L6-v2")

def build_faiss_index():
   with open("artifacts/chunks/final_chunks.json", "r", encoding="utf-8") as file:
    chunks = json.load(file)
   embeddings = []
   for chunk in chunks:
      vector = emb_model.encode(chunk["text"])
      embeddings.append({
          "doc_id": chunk["doc_id"],
          "chunk_id": chunk["chunk_id"],
          "embedding": vector.tolist(),
          "text": chunk["text"]
      })
   output_file = os.path.join("artifacts/embeddings", "final_embeddings.json")
   with open(output_file, "w", encoding="utf-8") as file:
      json.dump(embeddings, file, indent=2, ensure_ascii=False)
   vectors = [np.array(e["embedding"], dtype ="float32") for e in embeddings]
   metadata = [{"doc_id":e["doc_id"], "chunk_id": e["chunk_id"], "text":e["text"]} for e in embeddings]

   vectors = np.vstack(vectors)
   dim = vectors.shape[1]
   index = faiss.IndexFlatL2(dim)
   index.add(vectors)
   faiss.write_index(index,"artifacts/faiss/my_index.faiss")
   with open("artifacts/faiss/metadata.json", "w", encoding="utf-8") as file:
    json.dump(metadata, file, indent=2, ensure_ascii=False)
   print("Faiss index built successfully")

def load_faiss_index():
  index = faiss.read_index("artifacts/faiss/my_index.faiss")
  with open("artifacts/faiss/metadata.json", "r", encoding="utf-8") as file:
    metadata = json.load(file)
  return index, metadata






Overwriting utils/embedder.py


In [6]:
%%writefile utils/retriever.py
import numpy as np
from sentence_transformers import SentenceTransformer
from utils.embedder import load_faiss_index
import json

emb_model = SentenceTransformer("all-MiniLM-L6-v2")
def retrieve_docs(query, index, metadata, top_k = 3):
  query_embeddings = emb_model.encode([query]).astype("float32")
  scores,indices = index.search(query_embeddings,top_k)
  context_chunks = []

  for idx in indices[0]:
    context_chunks.append(metadata[idx]["text"])
  return context_chunks



Overwriting utils/retriever.py


In [7]:
%%writefile utils/llm.py
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# microsoft/phi-2
# mistralai/Mistral-7B-Instruct-v0.2
model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map ="auto", offload_folder = "offload", torch_dtype = torch.float16)

llm = pipeline("text-generation", model = model, tokenizer = tokenizer)

def generate_answer(query, context_chunks):
    prompt = f"""
  you are a helpful assistant. Use the provided context to answer the question. If answer is not in the context, say "I don't know".
  Context:{context_chunks[0]}
  Question:{query}
  Answer:
  """

    #using opensource LLM from hugging face

    response = llm(prompt,max_new_tokens = 200, temperature = 0.7, do_sample = True)
    answer = response[0]["generated_text"]
    return answer



Overwriting utils/llm.py


In [8]:
pip install --upgrade streamlit



In [9]:

%%writefile app.py


import streamlit as st
from utils.scraper import scrape_wiki, load_excel
from utils.chunks import chunk_text
from utils.embedder import build_faiss_index, load_faiss_index
from utils.retriever import retrieve_docs
from utils.llm import generate_answer
import pandas as pd


st.set_page_config(page_title = "SmartWiki", layout = "wide" )
st.title("SmartWiki - AI powered wiki assistant")
st.write("Upload links, scrape articles, chunk, embed and query using LLM's!")


#**************************************************************************************

st.subheader("Welcome to SmartWiki")
st.subheader("Step 1: Upload and Scrape Articles")
excel_file = st.file_uploader("Upload Excel File", type = ["xlsx"])


# excel_file = pd.read_excel("data/wiki_excel.xlsx")
# df = pd.read_excel(excel_file)
# if "url" not in df.columns:
#   raise ValueError("Excel file must have a column named 'url'")
# else:
#   urls = df['url'].tolist()

if excel_file is not None:
  if st.button("Scrape Links"):
    with st.spinner("Scraping in progress..."):
      urls = load_excel(excel_file)
      scrape_wiki(urls)
    st.success("Scraping completed!")
st.subheader("Step 2: Chunking articles")
if st.button("Run CHUNKING"):
  with st.spinner("Chunking in progress..."):
    chunk_text(500)
  st.success("Chunking completed!")
st.subheader("Step 3: Generate Embeddings and Store in FAISS")
if st.button("Run EMBEDDING"):
  with st.spinner("Embedding in progress..."):
    build_faiss_index()
  st.success("Embedding completed!")
st.subheader("Step 4: Ask AI from your knowledge base")
query = st.text_input("Enter your question:")
if st.button("Get Answer"):
  if query.strip():
    with st.spinner("Generating answer..."):
      index, metadata = load_faiss_index()
      context_chunks = retrieve_docs(query, index, metadata, top_k = 3)
      answer = generate_answer(query, context_chunks)
      st.write(answer)
  else:
    st.warning("Please enter a question")



#**************************************************************************************

# #sidebar navigation
# st.sidebar.title("Navigation")
# menu = st.radio("Go to:", ("Home", "Scraping", "Chunking"," Embedding", "Ask AI"))

# #Home
# if menu == "Home":
#   st.subheader("Welcome to SmartWiki")
#   st.markdown("""
#   -**Step 1: **Upload an Excel file with acolum 'urls'
#   -**Step 2:** Scrape articles
#   -**Step 3:** Chunk articles into smaller pieces
#   -**Step 4:** Generate embeddings + store in FAISS
#   -**Step 5:** Ask questions with AI!
#   """)


# #Scraping
# elif menu == "Scraping":
#   st.subheader("Step 1: Upload and Scrape Articles")
#   excel_file = st.file_uploader("Upload Excel File", type = "xlsx")
#   if excel_file is not None:
#     if st.button("Scrape Links"):
#       with st.spinner("Scraping in progress..."):
#         urls = load_excel(excel_file)
#         scrape_wiki(urls)
#       st.sucess("Scraping completed!")

# #chunking
# elif menu == "Chunking":
#   st.subheader("Step 2: Chunking articles")
#   if st.button("Run CHUNKING"):
#     with st.spinner("Chunking in progress..."):
#       chunk_text(500)
#     st.sucess("Chunking completed!")

# #Embedding
# elif menu == "Embedding":
#   st.subheader("Step 3: Generate Embeddings and Store in FAISS")
#   if st.button("Run EMBEDDING"):
#     with st.spinner("Embedding in progress..."):
#       build_faiss_index()
#     st.sucess("Embedding completed!")

# #retriver
# elif menu == "Ask AI":
#   st.subheader("Step 4: Ask AI from your knowledge base")
#   query = st.text_input("Enter your question:")
#   if st.button("Get Answer"):
#     if query.strip():
#       with st.spinner("Generating answer..."):
#         index, metadata = load_faiss_index()
#         context_chunks = retrieve_docs(query, index, metadata, top_k = 3)
#         answer = generate_answer(query, context_chunks)
#         st.write(answer)

#     else:
#       st.warning("Please enter a question")



Overwriting app.py


In [10]:
!curl https://loca.lt/mytunnelpassword

35.229.204.31

In [12]:
!npm install -g cloudflared

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K
changed 1 package in 5s
[1G[0K⠧[1G[0K

In [13]:
!kill -9 $(lsof -t -i:8501) 2>/dev/null || true


In [14]:
!streamlit run app.py --server.port 8501 --server.address 0.0.0.0 > logs.txt 2>&1 &
!tail -n 30 logs.txt


In [None]:

!cloudflared tunnel --url http://localhost:8501 --no-autoupdate


[90m2025-09-02T06:55:25Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-09-02T06:55:25Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-09-02T06:55:28Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-09-02T06:55:28Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025

In [15]:
!tail -n 30 logs.txt


[31m──[0m[31m────────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m─────────────────────────[0m[31m──[0m
[31m [0m [2;33m/usr/local/lib/python3.12/dist-packages/streamlit/runtime/scriptrunner/[0m[1;33mexec_code.py[0m: [31m [0m
[31m [0m [94m128[0m in [92mexec_func_with_error_handling[0m                                                 [31m [0m
[31m [0m                                                                                      [31m [0m
[31m [0m [2;33m/usr/local/lib/python3.12/dist-packages/streamlit/runtime/scriptrunner/[0m[1;33mscript_runner[0m [31m [0m
[31m [0m [1;33m.py[0m:[94m669[0m in [92mcode_to_exec[0m                                                              [31m [0m
[31m [0m                                                                                      [31m [0m
[31m [0m [2;33m/content/drive/MyDrive/projects_AI/SmartWiki/[0m[1;33mapp.py[0m:[94m45[0m in 