In [18]:
import requests
from bs4 import BeautifulSoup
import json
import time
import os
from urllib.parse import urljoin, urlparse

class MSDataScienceFocusedScraper:
    """Optimized web scraper for MS in Applied Data Science program"""

    def __init__(self):
        self.base_url = "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/"
        self.target_path = "/education/masters-programs/ms-in-applied-data-science/"
        self.visited_urls = set()
        self.data = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Define important URLs to ensure these pages are scraped
        self.important_urls = [
            self.base_url,                                      # Main page
            urljoin(self.base_url, "curriculum/"),              # Curriculum
            urljoin(self.base_url, "admission/"),               # Admission requirements
            urljoin(self.base_url, "faculty/"),                 # Faculty information
            urljoin(self.base_url, "tuition-and-aid/"),         # Tuition and financial aid
            urljoin(self.base_url, "careers/"),                 # Career outcomes
            urljoin(self.base_url, "capstone-projects/"),       # Capstone projects
            urljoin(self.base_url, "faq/"),                     # Frequently asked questions
            urljoin(self.base_url, "student-experience/")       # Student experience
        ]

    def start_scraping(self):
        """Start the scraping process, beginning with important pages"""
        print("==== Starting to scrape MS in Applied Data Science program information ====")

        # First scrape the known important URLs
        for url in self.important_urls:
            # Try to access the page, continue even if URL doesn't exist
            try:
                page_type = self.determine_page_type_from_url(url)
                self.scrape_page(url, page_type)
            except Exception as e:
                print(f"Important page {url} scraping failed: {str(e)}")

        # Then start from main page to discover additional related pages
        if self.base_url not in self.visited_urls:
            self.scrape_page(self.base_url, "main")

        return self.data

    def scrape_page(self, url, page_type):
        """Scrape a single page and extract its content"""
        if url in self.visited_urls:
            return

        # Check if URL belongs to the target path
        parsed_url = urlparse(url)
        if self.target_path not in parsed_url.path:
            return

        print(f"Scraping: {url} (Type: {page_type})")
        self.visited_urls.add(url)

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            if response.status_code != 200:
                print(f"Failed to retrieve {url}: Status code {response.status_code}")
                return

            soup = BeautifulSoup(response.text, 'html.parser')

            # --- MODIFIED: Pass 'url' as the base_url for link resolution ---
            content = self.extract_content(soup, page_type, url)

            if content:
                self.data.append({
                    "url": url,
                    "page_type": page_type,
                    "title": self.extract_title(soup),
                    "content": content,
                    "last_scraped": time.strftime("%Y-%m-%d %H:%M:%S")
                })
                print(f"  ✓ Content successfully extracted")
            else:
                print(f"  ✗ Failed to extract content")

            # Find other related links and recursively scrape them
            related_links = self.find_related_links(soup, url)
            print(f"  Found {len(related_links)} related links")

            for link_url, link_type in related_links:
                self.scrape_page(link_url, link_type)

            # Polite delay between requests
            time.sleep(1)

        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")

    # --- NEW HELPER FUNCTION ---
    def _resolve_links_in_tag(self, tag, base_url):
        """
        Find all <a> tags within a given BeautifulSoup tag and
        modify their href attribute to be an absolute URL.
        """
        if not tag:
            return

        for a_tag in tag.find_all('a', href=True):
            href = a_tag.get('href')

            # Skip javascript: or mailto: links
            if not href or href.startswith('javascript:') or href.startswith('mailto:'):
                continue

            # Use urljoin to create the absolute URL
            # This correctly handles:
            # 1. href="#FeeWaiver"
            # 2. href="/admissions/"
            # 3. href="https.other.site/page.html"
            absolute_href = urljoin(base_url, href)

            # Modify the tag's href attribute in-place
            a_tag['href'] = absolute_href

    def extract_title(self, soup):
        """Extract the page title using different methods"""
        # (This function is unchanged)
        title = soup.find('h1')
        if title and title.text.strip():
            return title.text.strip()
        title_tag = soup.find('title')
        if title_tag and title_tag.text.strip():
            title_text = title_tag.text.strip()
            if " | " in title_text:
                title_text = title_text.split(" | ")[0]
            return title_text
        for tag in ['h2', 'h3']:
            title = soup.find(tag)
            if title and title.text.strip():
                return title.text.strip()
        return "Unknown Title"

    # --- MODIFIED: Accept and use base_url ---
    def extract_content(self, soup, page_type, base_url):
        """Extract content based on page type with specialized extraction"""
        main_content = soup.find('main') or soup.find('div', class_='content-area') or soup.find('article') or soup
        content = {}

        # Extract paragraphs (as HTML strings)
        paragraphs = main_content.find_all('p')
        for p in paragraphs:
            self._resolve_links_in_tag(p, base_url) # Resolve links before saving
        content['paragraphs'] = [str(p) for p in paragraphs if p.text.strip()]

        # Extract headings (as text, links unlikely)
        headings = main_content.find_all(['h2', 'h3', 'h4'])
        content['headings'] = [h.text.strip() for h in headings if h.text.strip()]

        # Extract lists (as HTML strings)
        lists = main_content.find_all(['ul', 'ol'])
        content['lists'] = []
        for lst in lists:
            items = lst.find_all('li')
            if items:
                for item in items:
                    self._resolve_links_in_tag(item, base_url) # Resolve links
                content['lists'].append([str(item) for item in items if item.text.strip()])

        # Extract tables (as HTML strings)
        tables = main_content.find_all('table')
        content['tables'] = []
        for table in tables:
            rows = table.find_all('tr')
            if rows:
                table_data = []
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if cells:
                        for cell in cells:
                            self._resolve_links_in_tag(cell, base_url) # Resolve links
                        table_data.append([str(cell) for cell in cells])
                content['tables'].append(table_data)

        # Extract page-specific content based on page type
        if page_type == "curriculum":
            courses = self.extract_courses(main_content, base_url) # Pass base_url
            if courses:
                content['courses'] = courses

        elif page_type == "admission":
            requirements = self.extract_requirements(main_content, base_url) # Pass base_url
            if requirements:
                content['requirements'] = requirements

        elif page_type == "tuition":
            tuition_info = self.extract_tuition(main_content, base_url) # Pass base_url
            if tuition_info:
                content['tuition_info'] = tuition_info

        elif page_type == "faculty":
            faculty = self.extract_faculty(main_content, base_url) # Pass base_url
            if faculty:
                content['faculty'] = faculty

        # Extract any button text and links on the page
        buttons = main_content.find_all(['a', 'button'], class_=lambda c: c and ('btn' in c or 'button' in c))
        if buttons:
            content['buttons'] = []
            for button in buttons:
                button_info = {
                    'text': button.text.strip()
                }
                if button.name == 'a' and button.get('href'):
                    # --- FIXED: Resolve button links too ---
                    button_info['link'] = urljoin(base_url, button['href'])
                content['buttons'].append(button_info)

        has_content = False
        for key, value in content.items():
            if value:
                has_content = True
                break
        return content if has_content else None

    # --- MODIFIED: Accept and use base_url ---
    def extract_courses(self, content_area, base_url):
        """Extract course information from the curriculum page"""
        courses = []
        course_sections = content_area.find_all('div', class_=lambda c: c and ('course' in c.lower()))
        if not course_sections:
            course_headers = content_area.find_all(['h2', 'h3', 'h4'], string=lambda s: s and ('course' in s.lower() or 'curriculum' in s.lower()))
            for header in course_headers:
                next_elements = list(header.next_siblings)
                if next_elements:
                    course_sections.append(header.parent)
        if course_sections:
            for section in course_sections:
                title_elem = section.find(['h3', 'h4', 'h5', 'strong'])
                title = title_elem.text.strip() if title_elem else "Unknown Course"
                desc_elem = section.find('p')
                self._resolve_links_in_tag(desc_elem, base_url) # Resolve links
                description = str(desc_elem) if desc_elem else ""
                course = {'title': title, 'description': description}
                list_items = section.find_all('li')
                if list_items:
                    for item in list_items:
                        self._resolve_links_in_tag(item, base_url) # Resolve links
                    details = [str(item) for item in list_items]
                    course['details'] = details
                courses.append(course)
        if not courses:
            course_lists = content_area.find_all('ul')
            for ul in course_lists:
                items = ul.find_all('li')
                if len(items) > 3:
                    text = ul.text.lower()
                    if 'course' in text or 'curriculum' in text or 'class' in text:
                        for item in items:
                            self._resolve_links_in_tag(item, base_url) # Resolve links
                            courses.append({
                                'title': item.text.strip(),
                                'description': "",
                                'details': [str(item)]
                            })
                        break
        return courses

    # --- MODIFIED: Accept and use base_url ---
    def extract_requirements(self, content_area, base_url):
        """Extract admission requirements information"""
        requirements = {}
        req_headers = content_area.find_all(['h2', 'h3', 'h4'], string=lambda s: s and any(word in s.lower() for word in ['requirement', 'prerequisite', 'application', 'admission']))
        for header in req_headers:
            section_title = header.text.strip()
            section_content = []
            current = header.next_sibling
            while current and current.name != header.name:
                if current.name == 'p' and current.text.strip():
                    self._resolve_links_in_tag(current, base_url) # Resolve links
                    section_content.append(str(current))
                elif current.name in ['ul', 'ol']:
                    items = current.find_all('li')
                    for item in items:
                        self._resolve_links_in_tag(item, base_url) # Resolve links
                    section_content.extend([str(item) for item in items if item.text.strip()])
                current = current.next_sibling
            if section_content:
                requirements[section_title] = section_content
        if not requirements:
            relevant_paras = []
            for p in content_area.find_all('p'):
                text = p.text.lower()
                if any(word in text for word in ['requirement', 'prerequisite', 'application', 'admission', 'apply', 'gre', 'toefl']):
                    self._resolve_links_in_tag(p, base_url) # Resolve links
                    relevant_paras.append(str(p))
            if relevant_paras:
                requirements['General Requirements'] = relevant_paras
        return requirements

    # --- MODIFIED: Accept and use base_url ---
    def extract_tuition(self, content_area, base_url):
        """Extract tuition and financial aid information"""
        tuition_info = {}
        tables = content_area.find_all('table')
        for table in tables:
            table_text = table.text.lower()
            if 'tuition' in table_text or 'fee' in table_text or 'cost' in table_text:
                rows = table.find_all('tr')
                tuition_table = []
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if cells:
                        for cell in cells:
                            self._resolve_links_in_tag(cell, base_url) # Resolve links
                        tuition_table.append([str(cell) for cell in cells])
                if tuition_table:
                    tuition_info['tuition_table'] = tuition_table
        tuition_paras = []
        for p in content_area.find_all('p'):
            text = p.text.lower()
            if 'tuition' in text or 'fee' in text or 'cost' in text or 'financial' in text or 'scholarship' in text:
                self._resolve_links_in_tag(p, base_url) # Resolve links
                tuition_paras.append(str(p))
        if tuition_paras:
            tuition_info['tuition_text'] = tuition_paras
        return tuition_info

    # --- MODIFIED: Accept and use base_url ---
    def extract_faculty(self, content_area, base_url):
        """Extract faculty information"""
        faculty = []
        faculty_sections = content_area.find_all('div', class_=lambda c: c and any(word in c.lower() for word in ['faculty', 'instructor', 'professor', 'staff']))
        for section in faculty_sections:
            name_elem = section.find(['h3', 'h4', 'h5', 'strong'])
            if not name_elem:
                continue
            faculty_member = {'name': name_elem.text.strip()}
            title_elem = name_elem.find_next('p')
            if title_elem:
                faculty_member['title'] = title_elem.text.strip()
            bio = []
            current = title_elem.next_sibling if title_elem else name_elem.next_sibling
            while current and current.name not in ['h3', 'h4', 'h5', 'strong']:
                if current.name == 'p' and current.text.strip():
                    self._resolve_links_in_tag(current, base_url) # Resolve links
                    bio.append(str(current))
                current = current.next_sibling
            if bio:
                faculty_member['bio'] = bio
            faculty.append(faculty_member)
        if not faculty:
            faculty_lists = content_area.find_all('ul')
            for ul in faculty_lists:
                if 'faculty' in ul.text.lower() or 'instructor' in ul.text.lower() or 'professor' in ul.text.lower():
                    items = ul.find_all('li')
                    for item in items:
                        self._resolve_links_in_tag(item, base_url) # Resolve links
                        faculty.append({
                            'name': item.text.strip(),
                            'bio': [str(item)]
                        })
                    break
        return faculty

    def find_related_links(self, soup, current_url):
        """Find related links on the page that belong to the same project"""
        # (This function is unchanged)
        related_links = []
        domain = "datascience.uchicago.edu"
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if not href or href.startswith('#') or href.startswith('javascript:'):
                continue
            full_url = urljoin(current_url, href)
            if full_url in self.visited_urls or domain not in full_url:
                continue
            parsed_url = urlparse(full_url)
            if self.target_path in parsed_url.path:
                page_type = self.determine_page_type(parsed_url.path, a_tag.text)
                related_links.append((full_url, page_type))
        return related_links

    def determine_page_type(self, path, link_text):
        """Determine page type based on URL path and link text"""
        # (This function is unchanged)
        path = path.lower()
        link_text = link_text.lower() if link_text else ""
        if "/curriculum" in path or any(word in link_text for word in ["course", "curriculum", "class", "program structure"]):
            return "curriculum"
        elif "/admission" in path or any(word in link_text for word in ["apply", "application", "admission", "requirements"]):
            return "admission"
        elif "/faculty" in path or any(word in link_text for word in ["faculty", "professor", "instructor", "teacher", "staff"]):
            return "faculty"
        elif "/tuition" in path or "/cost" in path or "/aid" in path or any(word in link_text for word in ["tuition", "cost", "fee", "financial aid", "scholarship"]):
            return "tuition"
        elif "/career" in path or "/job" in path or any(word in link_text for word in ["career", "job", "employment", "placement", "alumni"]):
            return "career"
        elif "/capstone" in path or "/project" in path or any(word in link_text for word in ["capstone", "project", "portfolio"]):
            return "project"
        elif "/faq" in path or any(word in link_text for word in ["faq", "question", "answer"]):
            return "faq"
        elif "/student" in path or "/experience" in path or any(word in link_text for word in ["student", "experience", "life", "testimonial"]):
            return "student"
        else:
            return "general"

    def determine_page_type_from_url(self, url):
        """Determine page type directly from URL, used for important URLs"""
        # (This function is unchanged)
        path = urlparse(url).path.lower()
        if path == self.target_path or path.endswith('/'):
            return "main"
        elif "/curriculum" in path:
            return "curriculum"
        elif "/admission" in path:
            return "admission"
        elif "/faculty" in path:
            return "faculty"
        elif "/tuition" in path or "/aid" in path:
            return "tuition"
        elif "/career" in path:
            return "career"
        elif "/capstone" in path or "/project" in path:
            return "project"
        elif "/faq" in path:
            return "faq"
        elif "/student" in path or "/experience" in path:
            return "student"
        else:
            return "general"

    def save_data(self, filename):
        """Save data to file with detailed statistics"""
        # (This function is unchanged)
        os.makedirs(os.path.dirname(filename) if os.path.dirname(filename) else '.', exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.data, f, ensure_ascii=False, indent=2)
        page_type_counts = {}
        for item in self.data:
            page_type = item['page_type']
            page_type_counts[page_type] = page_type_counts.get(page_type, 0) + 1
        print("\n==== Scraping Statistics ====")
        print(f"Total URLs visited: {len(self.visited_urls)}")
        print(f"Successfully extracted content items: {len(self.data)}")
        print("\nPage type distribution:")
        for page_type, count in page_type_counts.items():
            print(f"  - {page_type}: {count} pages")
        print(f"\nData saved to {filename}")

# Execute scraping
if __name__ == "__main__":
    # (This section is unchanged)
    os.makedirs("data", exist_ok=True)
    scraper = MSDataScienceFocusedScraper()
    data = scraper.start_scraping()
    scraper.save_data("data/ms_applied_data_science_data.json")

==== Starting to scrape MS in Applied Data Science program information ====
Scraping: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/ (Type: main)
  ✓ Content successfully extracted
  Found 8 related links
Scraping: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/in-person-program/ (Type: general)
  ✓ Content successfully extracted
  Found 48 related links
Scraping: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/capstone-projects/ (Type: project)
  ✓ Content successfully extracted
  Found 14 related links
Scraping: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/course-progressions/ (Type: curriculum)
  ✓ Content successfully extracted
  Found 77 related links
Scraping: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/how-to-apply/ (Type: admission)
  ✓ Content successfully extracted
  Found 1

In [19]:
import json
import re
import os
import sys
import urllib.parse  # <-- This import is the key to fixing links
from typing import List, Dict, Any, Tuple
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import spacy
from bs4 import BeautifulSoup # Import BeautifulSoup

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


try:
    nlp = spacy.load("en_core_web_sm")
except OSError:  # Catch the specific error
    print("Model 'en_core_web_sm' not found. Downloading...")
    import subprocess

    # Use sys.executable to ensure we're using the correct python interpreter
    subprocess.call([
        sys.executable, "-m", "spacy", "download", "en_core_web_sm"
    ])

    # After download, import the package directly and load it
    try:
        import en_core_web_sm
        nlp = en_core_web_sm.load()
    except ImportError:
        print("\n---")
        print("Failed to load model even after download.")
        print("Please try running this command in your terminal:")
        print(f"   {sys.executable} -m spacy download en_core_web_sm")
        print("Then, restart the script.")
        print("---")
        exit() # Exit the script if the model can't be loaded


class EnhancedDataPreprocessor:
    """Enhanced data preprocessor with semantic tagging and keyword extraction"""

    def __init__(self, input_file: str):
        """Initialize with path to scraped data file"""
        self.input_file = input_file
        with open(input_file, 'r', encoding='utf-8') as f:
            self.raw_data = json.load(f)

        self.processed_chunks = []
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()


        self.domain_keywords = {
            'education': ['program', 'course', 'curriculum', 'study', 'learn', 'academic', 'education', 'degree', 'student', 'faculty'],
            'application': ['apply', 'admission', 'requirement', 'deadline', 'application', 'submit', 'gre', 'toefl', 'ielts', 'recommendation'],
            'career': ['job', 'career', 'employment', 'industry', 'professional', 'opportunity', 'salary', 'placement', 'alumni', 'hire'],
            'financial': ['tuition', 'fee', 'cost', 'financial', 'aid', 'scholarship', 'funding', 'loan', 'payment', 'assistantship']
        }

    # --- THIS IS THE FUNCTION THAT FIXES THE LINKS ---
    def _extract_text_and_links(self, html_string: str, base_url: str) -> Tuple[str, List[Dict[str, str]]]:
        """Helper to get clean text and all embedded links from an HTML string."""
        if not html_string:
            return "", []

        try:
            soup = BeautifulSoup(html_string, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)
            links = []
            seen_hrefs = set()

            for a_tag in soup.find_all('a', href=True):
                relative_href = a_tag['href']

                # Use urljoin to build the full, absolute URL.
                # This correctly handles:
                # 1. href="#FeeWaiver"
                # 2. href="/admissions/..."
                # 3. href="https://...#FeeWaiver"
                absolute_href = urllib.parse.urljoin(base_url, relative_href)

                href = absolute_href

                if href not in seen_hrefs:
                    links.append({
                        'text': a_tag.get_text(strip=True),
                        'href': href
                    })
                    seen_hrefs.add(href)

            return text, links
        except Exception:
            # Fallback for plain text if HTML parsing fails
            return html_string, []

    def preprocess(self):
        """Main preprocessing function with enhanced features"""
        print("Starting enhanced data preprocessing...")

        # Process each scraped item
        for item in self.raw_data:
            chunks = self.process_item(item)
            self.processed_chunks.extend(chunks)

        print(f"Preprocessing complete. Created {len(self.processed_chunks)} text chunks with semantic tags and keywords.")
        return self.processed_chunks

    # --- THIS FUNCTION PASSES THE base_url ---
    def process_item(self, item: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Process a single scraped item into multiple text chunks with enhanced metadata"""
        chunks = []
        content = item['content']
        base_url = item['url']  # This is the base URL for resolving relative links

        # Create basic metadata for all chunks from this item
        metadata = {
            "source": base_url,
            "title": item['title'],
            "page_type": item['page_type'],
            "last_scraped": item.get('last_scraped', '')
        }

        # Process paragraphs
        if 'paragraphs' in content and content['paragraphs']:
            for paragraph_html in content['paragraphs']:
                # Pass base_url to the helper function
                text, links = self._extract_text_and_links(paragraph_html, base_url)

                if text and len(text.split()) >= 10:
                    keywords = self.extract_keywords(text)
                    semantic_tags = self.extract_semantic_tags(text)

                    chunks.append({
                        "text": text,
                        "metadata": {
                            **metadata,
                            "content_type": "paragraph",
                            "keywords": keywords,
                            "semantic_tags": semantic_tags,
                            "embedded_links": links
                        }
                    })

        # Process lists
        if 'lists' in content and content['lists']:
            for list_items_html in content['lists']:
                if list_items_html:
                    text_parts = []
                    all_links = []

                    for item_html in list_items_html:
                        # Pass base_url to the helper function
                        text, links = self._extract_text_and_links(item_html, base_url)
                        text_parts.append(text)
                        all_links.extend(links)

                    list_text = "\n• " + "\n• ".join(text_parts)
                    keywords = self.extract_keywords(list_text)
                    semantic_tags = self.extract_semantic_tags(list_text)

                    chunks.append({
                        "text": list_text,
                        "metadata": {
                            **metadata,
                            "content_type": "list",
                            "keywords": keywords,
                            "semantic_tags": semantic_tags,
                            "embedded_links": all_links
                        }
                    })

        # Process specific content types
        if 'courses' in content and content['courses']:
            for course in content['courses']:
                course_text_parts = []
                all_links = []
                title_text = course.get('title', '')
                course_text_parts.append(f"Course: {title_text}")

                # Pass base_url to the helper function
                desc_text, desc_links = self._extract_text_and_links(course.get('description', ''), base_url)
                if desc_text:
                    course_text_parts.append(f"Description: {desc_text}")
                    all_links.extend(desc_links)

                if 'details' in course and course['details']:
                    detail_text_parts = []
                    for item_html in course['details']:
                        # Pass base_url to the helper function
                        text, links = self._extract_text_and_links(item_html, base_url)
                        detail_text_parts.append(text)
                        all_links.extend(links)
                    course_text_parts.append("Details:\n• " + "\n• ".join(detail_text_parts))

                course_text = "\n".join(course_text_parts)
                keywords = self.extract_keywords(course_text)
                semantic_tags = self.extract_semantic_tags(course_text)
                keywords.append("course")

                chunks.append({
                    "text": course_text,
                    "metadata": {
                        **metadata,
                        "content_type": "course",
                        "keywords": keywords,
                        "semantic_tags": semantic_tags,
                        "course_title": course.get('title', ''),
                        "embedded_links": all_links
                    }
                })

        # Process requirements if present
        if 'requirements' in content and content['requirements']:
            for req_title, req_content_html in content['requirements'].items():
                req_text_parts = [f"{req_title}:"]
                all_links = []

                if isinstance(req_content_html, list):
                    for item_html in req_content_html:
                        # Pass base_url to the helper function
                        text, links = self._extract_text_and_links(item_html, base_url)
                        req_text_parts.append(text)
                        all_links.extend(links)
                else:
                    # Pass base_url to the helper function
                    text, links = self._extract_text_and_links(req_content_html, base_url)
                    req_text_parts.append(text)
                    all_links.extend(links)

                req_text = "\n".join(req_text_parts)
                keywords = self.extract_keywords(req_text)
                semantic_tags = self.extract_semantic_tags(req_text)
                keywords.extend(["requirement", "admission"])

                chunks.append({
                    "text": req_text,
                    "metadata": {
                        **metadata,
                        "content_type": "requirement",
                        "keywords": keywords,
                        "semantic_tags": semantic_tags,
                        "requirement_type": req_title,
                        "embedded_links": all_links
                    }
                })

        # Process faculty information
        if 'faculty' in content and content['faculty']:
            for faculty in content['faculty']:
                faculty_text_parts = [f"Name: {faculty.get('name', '')}"]
                all_links = []

                if 'title' in faculty:
                    faculty_text_parts.append(f"Title: {faculty['title']}")

                if 'bio' in faculty and faculty['bio']:
                    bio_text_parts = []
                    for item_html in faculty['bio']:
                        # Pass base_url to the helper function
                        text, links = self._extract_text_and_links(item_html, base_url)
                        bio_text_parts.append(text)
                        all_links.extend(links)
                    faculty_text_parts.append("Bio: " + " ".join(bio_text_parts))

                faculty_text = "\n".join(faculty_text_parts)
                keywords = self.extract_keywords(faculty_text)
                semantic_tags = self.extract_semantic_tags(faculty_text)
                keywords.extend(["faculty", "professor", "instructor"])

                chunks.append({
                    "text": faculty_text,
                    "metadata": {
                        **metadata,
                        "content_type": "faculty",
                        "keywords": keywords,
                        "semantic_tags": semantic_tags,
                        "faculty_name": faculty.get('name', ''),
                        "embedded_links": all_links
                    }
                })

        # Process tuition info if present
        if 'tuition_info' in content and content['tuition_info']:
            tuition_info = content['tuition_info']
            tuition_text_parts = []
            all_links = []

            if 'tuition_text' in tuition_info:
                for item_html in tuition_info['tuition_text']:
                    # Pass base_url to the helper function
                    text, links = self._extract_text_and_links(item_html, base_url)
                    tuition_text_parts.append(text)
                    all_links.extend(links)

            if 'tuition_table' in tuition_info:
                tuition_text_parts.append("\nTuition Table:")
                for row in tuition_info['tuition_table']:
                    row_text_parts = []
                    for cell_html in row:
                        # Pass base_url to the helper function
                        text, links = self._extract_text_and_links(cell_html, base_url)
                        row_text_parts.append(text)
                        all_links.extend(links)
                    tuition_text_parts.append(" | ".join(row_text_parts))

            tuition_text = "\n".join(tuition_text_parts)

            if tuition_text:
                keywords = self.extract_keywords(tuition_text)
                semantic_tags = self.extract_semantic_tags(tuition_text)
                keywords.extend(["tuition", "cost", "fee", "financial"])

                chunks.append({
                    "text": tuition_text,
                    "metadata": {
                        **metadata,
                        "content_type": "tuition",
                        "keywords": keywords,
                        "semantic_tags": semantic_tags,
                        "embedded_links": all_links
                    }
                })

        # If no chunks were created, create one with headings
        if not chunks and 'headings' in content and content['headings']:
            heading_text = "\n".join(content['headings'])
            keywords = self.extract_keywords(heading_text)
            semantic_tags = self.extract_semantic_tags(heading_text)

            chunks.append({
                "text": heading_text,
                "metadata": {
                    **metadata,
                    "content_type": "headings",
                    "keywords": keywords,
                    "semantic_tags": semantic_tags,
                    "embedded_links": []
                }
            })

        return chunks

    def extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
        """Extract important keywords from text using NLP techniques"""
        doc = nlp(text)
        potential_keywords = []
        for ent in doc.ents:
            potential_keywords.append(ent.text.lower())
        for chunk in doc.noun_chunks:
            potential_keywords.append(chunk.text.lower())
        for token in doc:
            if (token.pos_ in ['NOUN', 'PROPN', 'VERB', 'ADJ'] and
                not token.is_stop and
                len(token.text) > 2):
                potential_keywords.append(token.lemma_.lower())
        keyword_freq = {}
        for kw in potential_keywords:
            if len(kw) <= 2 or kw in self.stop_words:
                continue
            kw = re.sub(r'[^\w\s]', '', kw).strip()
            if not kw:
                continue
            keyword_freq[kw] = keyword_freq.get(kw, 0) + 1
        for domain, terms in self.domain_keywords.items():
            for term in terms:
                if term in keyword_freq:
                    keyword_freq[term] *= 1.5
        sorted_keywords = sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)
        top_keywords = [k for k, v in sorted_keywords[:max_keywords]]
        return list(set(top_keywords))

    def extract_semantic_tags(self, text: str) -> Dict[str, Any]:
        """Extract semantic tags including dates, numbers, and special information types"""
        semantic_tags = {}
        doc = nlp(text)
        dates = []
        date_patterns = [
            r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?,\s+\d{4}\b',
            r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
            r'\b\d{4}-\d{1,2}-\d{1,2}\b',
            r'\b\d{1,2}-\d{1,2}-\d{4}\b',
            r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b',
            r'\bdeadline\b.*?\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}\b',
            r'\bdue\s+by\b.*?\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}\b',
            r'\bdue\s+date\b.*?\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}\b'
        ]
        for pattern in date_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                dates.append(match.group(0))
        for ent in doc.ents:
            if ent.label_ == "DATE":
                dates.append(ent.text)
        if dates:
            semantic_tags["dates"] = list(set(dates))
        numbers = []
        number_patterns = [
            r'\$\d+(?:,\d+)*(?:\.\d+)?',
            r'\d+(?:,\d+)*\s+dollars',
            r'\d+(?:\.\d+)?%',
            r'\d+(?:,\d+)*(?:\.\d+)?\s+credit(?:s)?',
            r'\d+(?:,\d+)*(?:\.\d+)?\s+hour(?:s)?',
        ]
        for pattern in number_patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                numbers.append(match.group(0))
        for ent in doc.ents:
            if ent.label_ in ["MONEY", "PERCENT", "QUANTITY", "CARDINAL"]:
                numbers.append(ent.text)
        if numbers:
            semantic_tags["numbers"] = list(set(numbers))
        info_type_patterns = {
            "application_info": [r'application', r'apply', r'admit', r'admission', 'deadline', r'requirement'],
            "contact_info": [r'contact', r'email', r'phone', r'@\w+\.\w+', r'\(\d{3}\)\s*\d{3}-\d{4}', r'\d{3}-\d{3}-\d{4}'],
            "course_info": [r'course', r'class', r'curriculum', r'syllabus', r'credit'],
            "financial_info": [r'tuition', r'cost', 'fee', r'financial aid', r'scholarship', r'grant', r'loan'],
            "faculty_info": [r'faculty', r'professor', r'instructor', r'teacher',r'staff'],
            "career_info": [r'career', r'job', 'employment', r'placement', r'opportunity', r'alumni']
        }
        for info_type, patterns in info_type_patterns.items():
            for pattern in patterns:
                if re.search(pattern, text, re.IGNORECASE):
                    semantic_tags[info_type] = True
                    break
        urls = re.findall(r'https?://\S+', text)
        if urls:
            semantic_tags["contains_urls"] = urls
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
        if emails:
            semantic_tags["contains_emails"] = emails
        return semantic_tags

    def clean_text(self, text: str) -> str:
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\n+', '\n', text)
        return text.strip()

    def save_processed_data(self, output_file: str):
        """Save processed chunks to a JSON file"""
        os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.processed_chunks, f, ensure_ascii=False, indent=2)
        print(f"Processed data saved to {output_file}")
        if not self.processed_chunks:
            print("No chunks were processed, cannot generate statistics.")
            return
        keywords_per_chunk = [len(chunk['metadata'].get('keywords', [])) for chunk in self.processed_chunks]
        semantic_tags_per_chunk = [len(chunk['metadata'].get('semantic_tags', {})) for chunk in self.processed_chunks]
        chunks_with_links = sum(1 for chunk in self.processed_chunks if chunk["metadata"].get("embedded_links"))
        print(f"Average keywords per chunk: {sum(keywords_per_chunk)/len(keywords_per_chunk):.2f}")
        print(f"Average semantic tags per chunk: {sum(semantic_tags_per_chunk)/len(semantic_tags_per_chunk):.2f}")
        print(f"Chunks containing embedded links: {chunks_with_links} (out of {len(self.processed_chunks)})")

# Usage example
if __name__ == "__main__":
    script_dir = os.path.dirname(__file__) if '__file__' in locals() else os.getcwd()
    input_data_path = os.path.join(script_dir, "data/ms_applied_data_science_data.json")
    output_data_path = os.path.join(script_dir, "data/ms_applied_data_science_enhanced_chunks.json")

    if not os.path.exists(input_data_path):
        print(f"Error: Input file not found at {input_data_path}")
        print("Please make sure 'ms_applied_data_science_data.json' is in a 'data' folder.")
    else:
        preprocessor = EnhancedDataPreprocessor(input_data_path)
        chunks = preprocessor.preprocess()

        # --- THIS IS THE LINE I FIXED (I added the parentheses) ---
        if chunks:
            preprocessor.save_processed_data(output_data_path)
        else:
            print("Preprocessing finished, but no chunks were created. Output file not saved.")

[nltk_data] Downloading package punkt to /Users/alexlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexlee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alexlee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alexlee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Starting enhanced data preprocessing...


KeyboardInterrupt: 