In [None]:
import os, requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import re
import json
import glob
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
import hashlib

In [15]:
source_urls = [
    #English resources
    'https://www.northbaycounselling.com/wp-content/uploads/2022/05/Cirillo-Pomodoro-Technique.pdf?utm_source=chatgpt.com',
    'https://dres.illinois.edu/wp-content/uploads/2023/04/Pomodoro-Technique.pdf?utm_source=chatgpt.com',
    'https://www.research.ed.ac.uk/files/447113983/PedersenEtal2024TSTimeManagement.pdf?utm_source=chatgpt.com',
    'https://sajithpai.com/wp-content/uploads/2016/06/Deep-Work-Summary.pdf?utm_source=chatgpt.com',
    'https://www.getstoryshots.com/books/deep-work-summary/?utm_source=chatgpt.com',
    'https://www.todoist.com/productivity-methods/time-blocking',
    'http://jamesclear.com/atomic-habits-summary',
    'https://www.cdc.gov/sleep/about/?CDC_AAref_Val=https://www.cdc.gov/sleep/about_sleep/sleep_hygiene.html',
    'https://tarteel.ai/blog/5-simple-techniques-for-better-time-management-as-a-muslim/',
    'https://islam365.io/topics/time_management_islam.html',
    'https://www.halaltimes.com/what-does-quran-say-about-time-management/',
    'https://www.muslim-library.com/dl/books/English_Time_management_from_Islamic_and_Administrative_perspective.pdf',
    'https://www.youngmuslimdigest.com/perspective/11/2022/time-management-and-islam/',
    'https://www.linkedin.com/pulse/time-management-islamic-perspective-abdul-ghaffar',
    'http://irep.iium.edu.my/97927/']

In [None]:
clean_kb_dir = "../kb"
collected_kb_dir = '../kb/collected/'

os.makedirs(clean_kb_dir, exist_ok=True)
os.makedirs(collected_kb_dir, exist_ok=True)

In [None]:
def safe_filename(url):
    base = url.split("/")[-1].split("?")[0]  
    if not base: 
        base = hashlib.md5(url.encode()).hexdigest()
    return base

def download_file(url, folder=collected_kb_dir):
    fname = safe_filename(url)
    path = os.path.join(folder, fname)
    if not os.path.exists(path):
        try:
            r = requests.get(url, timeout=30)
            if r.status_code == 200:
                with open(path, "wb") as f:
                    f.write(r.content)
                print(f"[+] Downloaded: {fname}")
        except Exception as e:
            print(f"[!] Error downloading {url} -> {e}")
    return path

In [18]:
def extract_text(path):
    if path.endswith(".pdf"):
        text = ""
        try:
            with pdfplumber.open(path) as pdf:
                for page in pdf.pages:
                    text += page.extract_text() or ""
        except Exception as e:
            print(f"[!] PDF read error {path}: {e}")
        return text
    else:
        # HTML
        try:
            with open(path, "rb") as f:
                html = f.read()
            soup = BeautifulSoup(html, "html.parser")
            paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
            return "\n".join(paragraphs)
        except Exception as e:
            print(f"[!] HTML parse error {path}: {e}")
            return ""

In [None]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  
    text = re.sub(r"\s+", " ", text)     
    text = text.replace("\u200f", "")    
    return text.strip()

In [29]:
for url in source_urls:
    path = download_file(url)
    raw_text = extract_text(path)
    cleaned = clean_text(raw_text)
    
    if cleaned:
        out_path = os.path.join(clean_kb_dir, os.path.basename(path) + ".txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(cleaned)

[!] PDF read error collected/PedersenEtal2024TSTimeManagement.pdf: [Errno 2] No such file or directory: 'collected/PedersenEtal2024TSTimeManagement.pdf'


In [20]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

def chunk_text(text, source):
    chunks = splitter.split_text(text)
    return [
        {
            "id": f"{os.path.basename(source)}_{i}",
            "text": ch,
            "source": source
        }
        for i, ch in enumerate(chunks)
    ]

In [30]:
def load_file(path):
    if path.endswith(".txt"):
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    elif path.endswith(".pdf"):
        reader = PdfReader(path)
        text = ""
        for page in reader.pages:
            if page.extract_text():
                text += page.extract_text() + "\n"
        return text
    return ""

def process_kb(folder="kb/"):
    all_chunks = []
    for path in glob.glob(os.path.join(folder, "*")):
        text = load_file(path)
        if not text.strip():
            continue
        chunks = chunk_text(text, path)
        print(f"📂 {os.path.basename(path)} -> {len(chunks)} chunks")
        all_chunks.extend(chunks)
    print(f"\n✅ Total chunks: {len(all_chunks)}")
    return all_chunks

In [31]:
all_chunks = process_kb(clean_kb_dir)

📂 0837b3ae6ae76f97f7eb1532ac21e01f.txt -> 19 chunks
📂 7c3fc7a70029d4facb8b8c9ca3c3a4a3.txt -> 11 chunks
📂 atomic-habits-summary.txt -> 13 chunks
📂 bfed0aeb7b44f9a75316b748d19d3645.txt -> 26 chunks
📂 c1101a13f67d0d48f283bc291e1e4edf.txt -> 9 chunks
📂 Cirillo-Pomodoro-Technique.pdf.txt -> 209 chunks
📂 d82c0864ccba944cc44060a11e806bda.txt -> 4 chunks
📂 Deep-Work-Summary.pdf.txt -> 67 chunks
📂 Pomodoro-Technique.pdf.txt -> 9 chunks
📂 sleep_hygiene.html.txt -> 4 chunks
📂 time-blocking.txt -> 67 chunks
📂 time-management-islamic-perspective-abdul-ghaffar.txt -> 18 chunks
📂 time_management_islam.html.txt -> 72 chunks

✅ Total chunks: 528


In [32]:
with open("kb_chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print(f"\n[✓] Finished! Total chunks: {len(all_chunks)} saved in kb_chunks.json")



[✓] Finished! Total chunks: 528 saved in kb_chunks.json
