In [1]:
import aiohttp
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import json

nest_asyncio.apply()

# Load your 4633 cleaned URLs
df = pd.read_csv("unique_links.csv")
urls = df["clean_url"].dropna().unique().tolist()

# HTML tags to keep
TARGET_TAGS = ["h1", "h2", "h3", "p", "li"]

# Async scraper function
async def fetch(session, url):
    try:
        async with session.get(url, timeout=10) as resp:
            if resp.status == 200:
                html = await resp.text()
                soup = BeautifulSoup(html, "html.parser")
                elements = [
                    {"tag": tag.name, "text": tag.get_text(strip=True)}
                    for tag in soup.find_all(TARGET_TAGS)
                    if tag.get_text(strip=True)
                ]
                return {"url": url, "elements": elements}
    except:
        return None

# Batch runner
async def scrape_all(urls, batch_size=50):
    results = []
    connector = aiohttp.TCPConnector(limit=25)
    async with aiohttp.ClientSession(connector=connector) as session:
        for i in tqdm(range(0, len(urls), batch_size)):
            batch = urls[i:i+batch_size]
            tasks = [fetch(session, url) for url in batch]
            responses = await asyncio.gather(*tasks)
            results.extend([r for r in responses if r])
    return results

# Run scraping
scraped_data = asyncio.run(scrape_all(urls))

# Save as JSON
with open("scraped_pages.json", "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, indent=2)

print(f"✅ Scraped and saved {len(scraped_data)} pages to 'scraped_pages.json'")


100%|██████████████████████████████████████████████████████████████████████████████████| 93/93 [09:18<00:00,  6.01s/it]


✅ Scraped and saved 4592 pages to 'scraped_pages.json'


In [3]:
import json
import pandas as pd

# Load JSON
with open("scraped_pages.json", "r", encoding="utf-8") as f:
    scraped_data = json.load(f)

# Flatten into DataFrame
records = []
for page in scraped_data:
    url = page["url"]
    for elem in page.get("elements", []):
        text = elem.get("text", "").strip()
        tag = elem.get("tag", "")
        if text:  # Skip empty strings
            records.append({"url": url, "tag": tag, "text": text})

df = pd.DataFrame(records)
print(f"✅ Loaded {len(df)} elements from {len(scraped_data)} pages")


✅ Loaded 1078515 elements from 4592 pages


In [7]:
# Remove short items (e.g., 1-word nav links)
df = df[df["text"].str.split().str.len() > 3]

# Optional: Remove common junk words (customize if needed)
blacklist = ["resources", "staff listing", "map", "calendar", "directory", "home"]
df = df[~df["text"].str.lower().isin(blacklist)]

# Optional: Drop duplicates per URL+text
df = df.drop_duplicates(subset=["url", "text"])

print(f"✅ Filtered to {len(df)} meaningful text elements.")


✅ Filtered to 269534 meaningful text elements.


In [9]:
df

Unnamed: 0,url,tag,text
18,https://jindal.utdallas.edu/student-resources/...,li,About the Jindal SchoolCommunity EngagementCom...
24,https://jindal.utdallas.edu/student-resources/...,li,Message from the Dean
28,https://jindal.utdallas.edu/student-resources/...,li,Outcomes & Success Factors
30,https://jindal.utdallas.edu/student-resources/...,li,ProgramsAcademic ProgramsUndergraduateMaster’s...
40,https://jindal.utdallas.edu/student-resources/...,li,FacultyJindal School FacultyAccountingFinance ...
...,...,...,...
1078503,https://jindal.utdallas.edu/faq/phd-admissions...,h2,"If I have an MBA or master's degree, will it h..."
1078504,https://jindal.utdallas.edu/faq/phd-admissions...,h2,How many doctoral students do you admit in you...
1078506,https://jindal.utdallas.edu/faq/phd-admissions...,p,Thank you for your interest in the Naveen Jind...
1078508,https://jindal.utdallas.edu/faq/phd-admissions...,p,"Copyright © 2025, All rights reserved.800 W Ca..."


In [11]:
chunks = df.groupby("url")["text"].apply(lambda texts: "\n".join(texts)).reset_index()
chunks.rename(columns={"text": "chunk_text"}, inplace=True)



In [13]:
chunks

Unnamed: 0,url,chunk_text
0,http://jindal.utdallas.edu/academic-areas/info...,About the Jindal SchoolCommunity EngagementCom...
1,http://jindal.utdallas.edu/academics-news-cate...,About the Jindal SchoolCommunity EngagementCom...
2,http://jindal.utdallas.edu/accounting,About the Jindal SchoolCommunity EngagementCom...
3,http://jindal.utdallas.edu/accounting/bs-accou...,About the Jindal SchoolCommunity EngagementCom...
4,http://jindal.utdallas.edu/accounting/bs-accou...,About the Jindal SchoolCommunity EngagementCom...
...,...,...
4587,https://jindal.utdallas.edu/vlog-blog,About the Jindal SchoolCommunity EngagementCom...
4588,https://jindal.utdallas.edu/voices,About the Jindal SchoolCommunity EngagementCom...
4589,https://jindal.utdallas.edu/voices/blog/best-s...,About the Jindal SchoolCommunity EngagementCom...
4590,https://jindal.utdallas.edu/web-standards-block,About the Jindal SchoolCommunity EngagementCom...


In [15]:
chunks.iloc[0]["chunk_text"]

'About the Jindal SchoolCommunity EngagementCompany EngagementConferencesContact InformationLeadership TeamMessage from the DeanNaveen JindalRankingsStudent ExperienceOutcomes & Success FactorsMANAGEMENT Magazine\nMessage from the Dean\nOutcomes & Success Factors\nProgramsAcademic ProgramsUndergraduateMaster’sMBAPhDDBAExecutive EducationCertificate ProgramsHonors Programs\nFacultyJindal School FacultyAccountingFinance and Managerial EconomicsInformation SystemsMarketingOperations ManagementOrganizations, Strategy and International ManagementFaculty Research\nFinance and Managerial Economics\nOrganizations, Strategy and International Management\nStudentsStudent ResourcesAcademic AdvisingBusiness CardsBusiness Communication CenterBusiness CompetitionsCareer Management CenterComet ClosetDean’s CouncilFaculty MentorsFAQFinance LabGrammarlyEDU & QuinnciaInternship ExperiencesLabsLiving Learning CommunityLockersScholarshipsSpecial Event RequestsStudent OrganizationsStudy AbroadTA/RA Graduate

In [17]:
## Cleaning the chunk

In [19]:
import re

def clean_chunk(text):
    # Remove footer-like boilerplate
    blacklist = [
        "sitemap", "privacy policy", "content owners",
        "copyright", "mark thouin", "accessibility"
    ]
    for term in blacklist:
        text = re.sub(rf"\b{term}\b.*", "", text, flags=re.IGNORECASE)

    # Remove duplicate lines
    lines = list(dict.fromkeys(text.split("\n")))

    # Remove short/empty/noisy lines
    lines = [line.strip() for line in lines if len(line.strip()) > 5 and not line.strip().isdigit()]

    return "\n".join(lines)


In [21]:
chunks["cleaned_chunk"] = chunks["chunk_text"].apply(clean_chunk)
chunks = chunks[chunks["cleaned_chunk"].str.len() > 100]  # Filter very short chunks


In [25]:
chunks.iloc[0]["cleaned_chunk"]

'About the Jindal SchoolCommunity EngagementCompany EngagementConferencesContact InformationLeadership TeamMessage from the DeanNaveen JindalRankingsStudent ExperienceOutcomes & Success FactorsMANAGEMENT Magazine\nMessage from the Dean\nOutcomes & Success Factors\nProgramsAcademic ProgramsUndergraduateMaster’sMBAPhDDBAExecutive EducationCertificate ProgramsHonors Programs\nFacultyJindal School FacultyAccountingFinance and Managerial EconomicsInformation SystemsMarketingOperations ManagementOrganizations, Strategy and International ManagementFaculty Research\nFinance and Managerial Economics\nOrganizations, Strategy and International Management\nStudentsStudent ResourcesAcademic AdvisingBusiness CardsBusiness Communication CenterBusiness CompetitionsCareer Management CenterComet ClosetDean’s CouncilFaculty MentorsFAQFinance LabGrammarlyEDU & QuinnciaInternship ExperiencesLabsLiving Learning CommunityLockersScholarshipsSpecial Event RequestsStudent OrganizationsStudy AbroadTA/RA Graduate

In [27]:
import re

def clean_chunk_text(text):
    lines = list(dict.fromkeys(text.split("\n")))  # Remove duplicates, preserve order
    cleaned_lines = []

    blacklist = [
        "connect with jindal alums", "sitemap", "privacy policy", "content owners", 
        "last updated", "copyright", "mark thouin", "accessibility",
        "student resources", "centers & institutes", "faculty", "admission", 
        "academic programs", "rankings", "about the jindal school"
    ]

    for line in lines:
        clean_line = line.strip()
        if len(clean_line.split()) > 3 and not any(b in clean_line.lower() for b in blacklist):
            cleaned_lines.append(clean_line)

    return "\n".join(cleaned_lines)


In [29]:
chunks["cleaned_text"] = chunks["chunk_text"].apply(clean_chunk_text)
chunks = chunks[chunks["cleaned_text"].str.len() > 100]


In [31]:
def split_into_subchunks(text, max_words=300):
    words = text.split()
    return [" ".join(words[i:i+max_words]) for i in range(0, len(words), max_words)]

subchunks = []
for _, row in chunks.iterrows():
    small_chunks = split_into_subchunks(row["cleaned_text"])
    for i, chunk in enumerate(small_chunks):
        subchunks.append({
            "url": row["url"],
            "chunk_id": f"{row['url'].split('/')[-1]}_{i}",
            "text": chunk
        })

final_df = pd.DataFrame(subchunks)



In [33]:
final_df

Unnamed: 0,url,chunk_id,text
0,http://jindal.utdallas.edu/academic-areas/info...,certificate-programs_0,Message from the Dean Outcomes & Success Facto...
1,http://jindal.utdallas.edu/academics-news-cate...,academics-news-category_0,Message from the Dean Outcomes & Success Facto...
2,http://jindal.utdallas.edu/academics-news-cate...,academics-news-category_1,launch in fall 2025. The degree was created in...
3,http://jindal.utdallas.edu/academics-news-cate...,academics-news-category_2,earn a master’s degree. The Jindal School will...
4,http://jindal.utdallas.edu/academics-news-cate...,academics-news-category_3,"and Industry “Low on code, high on value” was ..."
...,...,...,...
12017,https://jindal.utdallas.edu/voices/blog/best-s...,best-study-spots-utdallas_1,work done in a quiet setting. Students are alw...
12018,https://jindal.utdallas.edu/voices/blog/best-s...,best-study-spots-utdallas_2,take an exam or just want to watch Netflix. Th...
12019,https://jindal.utdallas.edu/voices/blog/best-s...,best-study-spots-utdallas_3,your studying.Fat StrawsandJava Landare very c...
12020,https://jindal.utdallas.edu/web-standards-block,web-standards-block_0,Message from the Dean Outcomes & Success Facto...


In [35]:
final_df.iloc[1]["text"]

'Message from the Dean Outcomes & Success Factors Finance and Managerial Economics Organizations, Strategy and International Management Technology & Facilities Services Mentor & Volunteer Opportunities Center and Laboratory for Behavioral Operations and Economics Center for Finance Strategy & Innovation Center for Global Business Center for Healthcare Leadership and Management Center for Information Technology and Management Center for Intelligent Supply Networks Center for Internal Auditing Excellence Center for Professional Sales Center for Retail Innovation and Strategy Excellence Center for the Management of Financial and Digital Asset Technologies Herbert D. Weitzman Institute for Real Estate Institute for Excellence in Corporate Governance International Center for Decision and Risk Analysis The Institute for Innovation & Entrepreneurship Sustainable Global Business Initiative Academics – News Category Double Degree Programs at Jindal School Offer Key Advantages The academic exper

In [37]:
## chunking using Heading and sliding method


In [79]:
import re

def is_meaningful(text):
    """
    Filter out short, duplicate-like, or navigation-style lines.
    """
    text = text.strip()
    if len(text.split()) <= 3:
        return False
    blacklist_keywords = [
        "calendar", "resources", "apply", "faculty", "admission",
        "academic programs", "honors", "connect", "contact us",
        "staff listing", "site map", "privacy policy", "menu", "lockers"
    ]
    return not any(b in text.lower() for b in blacklist_keywords)

def sliding_window(tokens, window_size=200, overlap=75):
    """
    Split a list of tokens into overlapping windows.
    """
    chunks = []
    for start in range(0, len(tokens), window_size - overlap):
        end = start + window_size
        chunk = " ".join(tokens[start:end])
        if chunk.strip():
            chunks.append(chunk)
    return chunks

def heading_sliding_chunker(pages, window_size=200, overlap=75):
    """
    Combines heading-based grouping and sliding window chunking.
    """
    final_chunks = []

    for page in pages:
        url = page["url"]
        elements = page.get("elements", [])

        current_heading = "Unknown Section"
        buffer = []

        for elem in elements:
            tag = elem["tag"].lower()
            text = elem["text"].strip()

            if tag in ["h1", "h2"]:
                # Process previous section
                if buffer:
                    tokens = " ".join(buffer).split()
                    slides = sliding_window(tokens, window_size, overlap)
                    for i, chunk_text in enumerate(slides):
                        final_chunks.append({
                            "url": url,
                            "section": current_heading,
                            "chunk_id": f"{url.split('/')[-1]}_{i}",
                            "text": chunk_text
                        })
                    buffer = []

                current_heading = text or "Unnamed Section"

            elif tag in ["p", "li", "h3"]:
                if is_meaningful(text):
                    buffer.append(text)

        # Process last section if any
        if buffer:
            tokens = " ".join(buffer).split()
            slides = sliding_window(tokens, window_size, overlap)
            for i, chunk_text in enumerate(slides):
                final_chunks.append({
                    "url": url,
                    "section": current_heading,
                    "chunk_id": f"{url.split('/')[-1]}_{i}",
                    "text": chunk_text
                })

    return final_chunks


In [81]:
import json
import pandas as pd

# Load scraped file
with open("scraped_pages.json", "r", encoding="utf-8") as f:
    scraped = json.load(f)

# Chunk with sliding window
chunks = heading_sliding_chunker(scraped, window_size=200, overlap=75)

# Save output
df = pd.DataFrame(chunks)
df.to_csv("heading_sliding_chunks.csv", index=False)
print(f"✅ Saved {len(df)} structured, clean chunks.")


✅ Saved 48901 structured, clean chunks.


In [83]:
# Cell 1: Load chunks & compute statistics
import pandas as pd

# Assumes you’ve saved your heading+sliding chunks to CSV:
# columns: url, section, chunk_id, text
df = pd.read_csv("heading_sliding_chunks.csv")

# Token lengths
df['token_length'] = df['text'].str.split().apply(len)

# Print stats
print("Total chunks:", len(df))
print("Unique URLs:", df['url'].nunique())
print("Avg tokens per chunk:", df['token_length'].mean())
print("Min tokens:", df['token_length'].min(), "Max tokens:", df['token_length'].max())
print("Avg chunks per URL:", df.groupby('url').size().mean())


Total chunks: 48901
Unique URLs: 4592
Avg tokens per chunk: 118.99106357743196
Min tokens: 1 Max tokens: 200
Avg chunks per URL: 10.649172473867596
