In [3]:
pip install requests beautifulsoup4 lxml

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [12]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import logging
from urllib.parse import urljoin
import json # Keep for potential future use if needed, but not for primary output
import re

# --- Configuration ---
# Output locations are relative to the current working directory.
LOG_FILE = 'scraper_flattened.log'
OUTPUT_CSV_FILE = 'egypt_institutions_programs_flattened.csv'
LIST_PAGE_URL = "https://study-in-egypt.gov.eg/categories/All"

# Request headers passed to every requests.get() call in this script.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'
}
REQUEST_DELAY = 3  # Politeness delay in seconds, slept before each detail-page request

# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before installing the ones below. Without it basicConfig() silently
# does nothing when the root logger is already configured -- which is what
# happens when this cell is re-run in the same notebook kernel -- so the
# format and the log file defined here would never take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(funcName)s] %(message)s',
    handlers=[
        # mode='w' truncates the log file each time logging is (re)configured.
        logging.FileHandler(LOG_FILE, mode='w', encoding='utf-8'),
        logging.StreamHandler(),
    ],
    force=True,
)
# --- End Configuration ---

def get_detail_page_links(list_url):
    """
    Fetch the institution list page and pull the basic facts out of every card.

    Each 'div.gird_with_list' card contributes one dict with the keys
    'name', 'detail_url', 'location' and 'category'; a value that cannot be
    found stays 'N/A'. Cards without a name/link element are skipped.

    Returns the list of dicts, or an empty list when no cards are found or
    when the request or the parsing fails (the failure is logged).
    """
    logging.info(f"Fetching list page: {list_url}")
    try:
        page = requests.get(list_url, headers=HEADERS, timeout=60)
        page.raise_for_status()
        page.encoding = page.apparent_encoding
        document = BeautifulSoup(page.content, 'lxml')

        cards = document.find_all('div', class_='gird_with_list')
        logging.info(f"Found {len(cards)} cards on the list page.")
        if not cards:
            logging.warning("No institution cards found. Check selector 'div.gird_with_list'.")
            return []

        records = []
        for number, card in enumerate(cards, start=1):
            # --- Name and detail URL (mandatory: skip the card without them) ---
            anchor = card.select_one('div.caption h4.group a')
            if not anchor:
                logging.warning(f"Card {number}: Could not find name/link element.")
                continue

            record = {
                'name': anchor.text.strip(),
                'detail_url': 'N/A',
                'location': 'N/A',
                'category': 'N/A',
            }
            href = anchor.get('href')
            if href:
                # Resolve against the final URL so relative links survive redirects.
                record['detail_url'] = urljoin(page.url, href)

            # --- Location (whitespace collapsed to single spaces) ---
            venue = card.select_one('div.venue a')
            if venue:
                record['location'] = ' '.join(venue.text.split())
                logging.info(f"Card {number} ({record['name']}): Found Location: {record['location']}")
            else:
                logging.warning(f"Card {number} ({record['name']}): Location element 'div.venue a' not found.")

            # --- Institution category ---
            tooltip_link = card.select_one('div.caption ul.list-inline li.tool-tip a')
            if not tooltip_link:
                logging.warning(f"Card {number} ({record['name']}): Category element ('li.tool-tip a') not found.")
            else:
                owner_li = tooltip_link.find_parent('li')
                if owner_li and owner_li.get('title', '').lower().endswith('category'):
                    record['category'] = tooltip_link.text.strip()
                    logging.info(f"Card {number} ({record['name']}): Found Category: {record['category']}")
                else:
                    # The tooltip item is not labelled as a category: fall back
                    # to the first link in the inline list.
                    fallback_link = card.select_one('div.caption ul.list-inline li a')
                    if fallback_link:
                        record['category'] = fallback_link.text.strip()
                        logging.info(f"Card {number} ({record['name']}): Found Category (Fallback): {record['category']}")

            records.append(record)

        logging.info(f"Extracted initial data for {len(records)} institutions from list page.")
        return records

    except requests.exceptions.Timeout:
        logging.error(f"Timeout fetching list page: {list_url}")
        return []
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed for list page {list_url}: {e}")
        return []
    except Exception as e:
        logging.error(f"Error parsing list page {list_url}: {e}", exc_info=True)
        return []


def parse_program_table(table_soup, category_title, faculty_name):
    """
    Turn one faculty's program table into a list of program dicts.

    Every returned dict carries 'Program Category' and 'Faculty' (taken from
    the arguments) plus one entry per recognised column header. Rows without
    <td> cells or with an empty program name are dropped. Returns an empty
    list when there is no table, no <tbody>, or no header cells at all.
    """
    if not table_soup:
        return []

    # Header text is lower-cased and its whitespace collapsed before matching.
    headers = [' '.join(th.text.lower().split()) for th in table_soup.select('thead th')]
    body = table_soup.find('tbody')
    if not body:
        return []

    # Normalised header text -> output field name. Several header spellings
    # may share one field.
    field_for_header = {
        'name': 'Program Name',
        'description': 'Program Description',
        'years of study': 'Years Of Study',
        'min. study years': 'Years Of Study',
        'number of semesters': 'Number Of Semesters',
        'number of semesters in egypt': 'Number Of Semesters',
        'fee in usd': 'Fee In USD',
        'fee in egp': 'Fee In EGP',
        'prerequisites': 'Prerequisites',
        'credit hours': 'Credit Hours',
        'max. study years': 'Max Study Years',
        'affiliated universities': 'Affiliated Universities',
        'number of semesters abroad': 'Semesters Abroad',
    }

    # Output field -> column position; a later header wins when two headers
    # map to the same field.
    column_of = {}
    for position, header in enumerate(headers):
        if header in field_for_header:
            column_of[field_for_header[header]] = position

    if 'Program Name' not in column_of:
        logging.warning(f"Could not find 'Name' column header in table for {faculty_name} under {category_title}. Headers found: {headers}")
        if not headers:
            return []  # Nothing to align the cells against.
        column_of['Program Name'] = 0  # Fallback: treat the first column as the name.

    header_count = len(headers)
    description_col = column_of.get('Program Description', -1)

    parsed = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            continue

        cell_count = len(cells)
        shortfall = header_count - cell_count
        record = {'Program Category': category_title, 'Faculty': faculty_name}

        for field, column in column_of.items():
            # Heuristic for rows with fewer cells than headers: assume the
            # description cell spans the missing columns and shift every
            # column to its right by the shortfall.
            shift_needed = (
                shortfall > 0
                and column > 0
                and description_col != -1
                and column > description_col
            )
            target = column - shortfall if shift_needed else column
            if 0 <= target < cell_count:
                record[field] = cells[target].text.strip()
            else:
                record[field] = 'N/A'

        name = record.get('Program Name', 'N/A')
        if name and name != 'N/A':
            parsed.append(record)
        else:
            logging.debug(f"Skipping row in {faculty_name} - likely missing program name. Cells: {[c.text.strip() for c in cells]}")

    return parsed


def _detail_contact_fields(soup):
    """Return the 'Email', 'Phone', 'Year Built' and 'Coordinator' values of a detail page.

    Each value is the text found on the page, or 'Not Found'.
    """
    fields = {}

    email_link = soup.select_one('li.track-me a[href^="mailto:"]')
    if email_link:
        fields['Email'] = email_link.text.strip()
        logging.info(f"Found Email: {fields['Email']}")
    else:
        fields['Email'] = 'Not Found'
        logging.info("Email not found.")

    phone_link = soup.select_one('a[href^="tel:"]')
    if phone_link:
        fields['Phone'] = phone_link.text.strip()
        logging.info(f"Found Phone: {fields['Phone']}")
    else:
        fields['Phone'] = 'Not Found'
        logging.info("Phone not found.")

    # Year built: first link whose text starts with 'year build:' (the
    # spelling matched by the scraper) and contains a 4-digit number.
    fields['Year Built'] = 'Not Found'
    for link in soup.select('span a, div a'):
        link_text = link.text.strip()
        if link_text.lower().startswith('year build:'):
            year_match = re.search(r'\d{4}', link_text)
            if year_match:
                fields['Year Built'] = year_match.group(0)
                logging.info(f"Found Year Built: {fields['Year Built']}")
                break
    else:
        logging.info("Year Built not found.")

    # Coordinator: first link whose text starts with 'Coordinator:'.
    fields['Coordinator'] = 'Not Found'
    for link in soup.select('a'):
        link_text = link.text.strip()
        if link_text.startswith('Coordinator:'):
            fields['Coordinator'] = link_text.replace('Coordinator:', '').strip()
            logging.info(f"Found Coordinator: {fields['Coordinator']}")
            break
    else:
        logging.info("Coordinator not found.")

    return fields


def _detail_amenities(soup):
    """Return the newline-joined amenities of a detail page.

    Reads the direct <li> children of the first 'div.panel-body > ul'.
    Returns 'Not Found' when that list is absent and 'None Listed' when it
    holds no non-blank items.
    """
    amenities_ul = soup.select_one('div.panel-body > ul')
    if not amenities_ul:
        logging.info("Amenities section not found.")
        return 'Not Found'

    amenities = []
    for item in amenities_ul.find_all('li', recursive=False):
        # The emptiness check must run for EVERY item; the previous one-line
        # loop body left it outside the loop, so only the last amenity was kept.
        text = ' '.join(item.text.split())
        if text:
            amenities.append(text)

    if not amenities:
        logging.info("Amenities section empty.")
        return 'None Listed'

    logging.info(f"Found {len(amenities)} amenities.")
    return "\n".join(amenities)


def _detail_description(soup):
    """Return the description text of a detail page.

    The description panel is the first 'div.panel-body' that has no direct
    <ul> child (that one is the amenities panel) and has either a direct
    'div.row' child or direct non-blank text. Returns 'Not Found' when no
    such panel exists and 'None Provided' when the panel has no text.
    """
    description_panel = None
    for panel in soup.select('div.panel-body'):
        has_amenities_ul = panel.select_one(':scope > ul') is not None
        has_row_div = panel.select_one(':scope > div.row') is not None
        has_direct_text = any(isinstance(c, str) and c.strip() for c in panel.contents)
        if not has_amenities_ul and (has_row_div or has_direct_text):
            description_panel = panel
            break

    if not description_panel:
        logging.info("Description panel structure not found.")
        return 'Not Found'

    # Prefer the inner 'div.row > div' text; fall back to the whole panel.
    desc_text = ''
    inner_div = description_panel.select_one('div.row > div')
    if inner_div:
        desc_text = ' '.join(inner_div.text.split())
    if not desc_text:
        desc_text = ' '.join(description_panel.text.split())

    if desc_text:
        logging.info("Found Description.")
        return desc_text
    logging.info("Description panel empty.")
    return 'None Provided'


def _detail_programs(soup):
    """Return every program found in the nested category -> faculty accordion.

    Each faculty's table is handed to parse_program_table(); the resulting
    program dicts are concatenated in page order.
    """
    main_accordion_list = soup.select_one('ul.accordion__list')
    if not main_accordion_list:
        logging.warning("Main program accordion list not found.")
        return []

    programs = []
    category_items = main_accordion_list.select(':scope > li.accordion__item')
    logging.info(f"Found {len(category_items)} main program categories.")
    for category_item in category_items:
        category_title_element = category_item.select_one(':scope > .accordion__itemTitleWrap > h3.accordion__itemTitle')
        category_title = category_title_element.text.strip() if category_title_element else "Unknown Category"
        logging.info(f"Processing Category: {category_title}")

        faculty_list = category_item.select_one(':scope > .accordion__itemContent > ul.accordion__list')
        if not faculty_list:
            logging.debug(f"No faculty list under category: {category_title}")
            continue

        faculty_items = faculty_list.select(':scope > li.accordion__item')
        logging.debug(f"Found {len(faculty_items)} faculties under {category_title}")
        for faculty_item in faculty_items:
            faculty_title_element = faculty_item.select_one(':scope > .accordion__itemTitleWrap h3.accordion__itemTitle')
            faculty_name = faculty_title_element.text.strip() if faculty_title_element else "Unknown Faculty"
            faculty_name = faculty_name.replace("Faculty of", "").strip()
            logging.debug(f"--> Processing Faculty: {faculty_name}")

            program_table = faculty_item.select_one(':scope > .accordion__itemContent table')
            if not program_table:
                logging.debug(f"    No program table found for {faculty_name}")
                continue

            faculty_programs = parse_program_table(program_table, category_title, faculty_name)
            if faculty_programs:
                logging.debug(f"    Parsed {len(faculty_programs)} programs for {faculty_name}")
                programs.extend(faculty_programs)
            else:
                logging.debug(f"    Table found for {faculty_name}, but no programs parsed.")

    return programs


def scrape_detail_page(institution_list_data):
    """
    Scrape one institution's detail page into flat CSV rows.

    Args:
        institution_list_data: dict with the list-page values 'name',
            'detail_url', 'location' and 'category' (missing keys default
            to 'N/A').

    Returns:
        A list of dicts, one per program found, each combining the
        university details with that program's fields. When the page lists
        no programs the list holds a single row whose program fields are
        'N/A'. When the detail URL is missing, or the request or the parsing
        fails, the list holds a single row whose program fields carry the
        error message; no exception is propagated.
    """
    detail_url = institution_list_data.get('detail_url', 'N/A')

    # --- Base university details (list-page values, refined below) ---
    base_details = {
        'Name': institution_list_data.get('name', 'N/A'),
        'Detail Page URL': detail_url,
        'Location': institution_list_data.get('location', 'N/A'),
        'Category': institution_list_data.get('category', 'N/A'),
        'Year Built': 'N/A',
        'Email': 'N/A',
        'Description': 'N/A',
        'Phone': 'N/A',
        'Coordinator': 'N/A',
        'Amenities': 'N/A',
        'Logo URL': 'N/A',
    }
    # Program-specific columns; used as placeholders when there is no program
    # data and as carriers for the error message on failure.
    program_fields = dict.fromkeys(
        [
            'Program Category', 'Faculty', 'Program Name', 'Program Description',
            'Years Of Study', 'Number Of Semesters', 'Fee In USD', 'Fee In EGP',
            'Prerequisites', 'Credit Hours', 'Max Study Years',
            'Affiliated Universities', 'Semesters Abroad',
        ],
        'N/A',
    )

    if detail_url == 'N/A':
        logging.error(f"Missing detail_url for name '{base_details['Name']}'. Cannot scrape.")
        error_row = base_details.copy()
        error_row.update({k: 'Error: Missing Detail URL' for k in program_fields})
        return [error_row]

    logging.info(f"Scraping detail page for {base_details['Name']}: {detail_url}")

    try:
        logging.debug(f"Waiting {REQUEST_DELAY} seconds...")
        time.sleep(REQUEST_DELAY)

        response = requests.get(detail_url, headers=HEADERS, timeout=60)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.content, 'lxml')

        # --- Name: prefer the page's <h1>, then 'div.page-title strong' ---
        name_element_h1 = soup.find('h1')
        name_element_strong = soup.select_one('div.page-title strong')
        confirmed_name = None
        if name_element_h1:
            confirmed_name = name_element_h1.text.strip()
        elif name_element_strong:
            confirmed_name = name_element_strong.text.strip()
        if confirmed_name and confirmed_name != base_details['Name']:
            logging.info(f"Updated name from '{base_details['Name']}' to '{confirmed_name}'.")
            base_details['Name'] = confirmed_name
        else:
            logging.info(f"Name '{base_details['Name']}' confirmed.")

        # --- Remaining university details ---
        base_details.update(_detail_contact_fields(soup))
        base_details['Amenities'] = _detail_amenities(soup)
        base_details['Description'] = _detail_description(soup)
        base_details['Logo URL'] = 'Not Scraped'  # The logo is deliberately not collected.

        # --- Programs ---
        logging.info("Extracting programs...")
        all_programs_data = _detail_programs(soup)

        # --- Combine base details with each program ---
        output_rows = []
        if all_programs_data:
            logging.info(f"Found {len(all_programs_data)} total programs. Creating flattened rows...")
            for program_data in all_programs_data:
                row = base_details.copy()
                row.update(program_data)
                output_rows.append(row)
        else:
            logging.info("No programs found for this university. Adding single row with base details.")
            row = base_details.copy()
            row.update(program_fields)
            output_rows.append(row)

        logging.info(f"Finished processing detail page for: {base_details['Name']}")
        return output_rows

    except requests.exceptions.Timeout:
        error_msg = 'Error: Request Timeout'
    except requests.exceptions.RequestException as e:
        error_msg = f'Error: HTTP Request Failed ({e})'
    except Exception as e:
        error_msg = f'Error: Parsing Failed ({e})'
        logging.error(f"Error parsing detail page {detail_url}: {e}", exc_info=True)

    # Reached only after an exception: emit one row that keeps whatever base
    # details were gathered and records the error in every program column.
    logging.error(f"{error_msg} while scraping detail page {detail_url}")
    error_row = base_details.copy()
    error_row.update({k: error_msg for k in program_fields})
    return [error_row]


def save_to_csv(data, filename):
    """
    Write the flattened rows (a list of dicts) to ``filename`` as UTF-8 CSV.

    The column order is fixed: university columns first, program columns
    after. Keys that are not in the column list are ignored. Nothing is
    written when ``data`` is empty. Write errors are logged, not raised.
    """
    if not data:
        logging.warning("No data collected to save.")
        return

    university_columns = [
        'Name', 'Category', 'Year Built', 'Description', 'Location', 'Phone', 'Email',
        'Coordinator', 'Amenities', 'Detail Page URL', 'Logo URL',
    ]
    program_columns = [
        'Program Category', 'Faculty', 'Program Name', 'Program Description',
        'Years Of Study', 'Number Of Semesters', 'Fee In USD', 'Fee In EGP',
        'Prerequisites', 'Credit Hours', 'Max Study Years',
        'Affiliated Universities', 'Semesters Abroad',
    ]
    columns = university_columns + program_columns

    logging.info(f"Attempting to save {len(data)} rows to {filename}")
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as out_file:
            # extrasaction='ignore' drops any key outside the column list
            # instead of raising ValueError.
            writer = csv.DictWriter(out_file, fieldnames=columns, extrasaction='ignore')
            writer.writeheader()
            for record in data:
                writer.writerow(record)
        logging.info(f"Data successfully saved to {filename}")
    except IOError as e:
        logging.error(f"Could not write to CSV file {filename}: {e}")
    except Exception as e:
        logging.error(f"An unexpected error during CSV writing: {e}", exc_info=True)


# --- Main Execution ---
if __name__ == "__main__":
    logging.info("--- Scraper Started ---")

    # Step 1: collect name, detail URL, location and category from the list page.
    institution_list_data = get_detail_page_links(LIST_PAGE_URL)

    # Step 2: visit every detail page; each visit yields one row per program
    # (or a single placeholder/error row), all gathered into one flat list.
    all_flattened_rows = []
    if not institution_list_data:
        logging.warning("Did not find any institution data on the list page.")
    else:
        total_institutions = len(institution_list_data)
        logging.info(f"Starting to scrape detail pages for {total_institutions} institutions...")
        for position, list_entry in enumerate(institution_list_data, start=1):
            logging.info(f"--- Processing institution {position} of {total_institutions} ---")
            all_flattened_rows += scrape_detail_page(list_entry)
        logging.info("Finished scraping detail pages.")

    # Step 3: write everything that was collected (save_to_csv warns and
    # writes nothing when the list is empty).
    save_to_csv(all_flattened_rows, OUTPUT_CSV_FILE)

    logging.info("--- Scraper Finished ---")

2025-04-21 15:10:05,761 - INFO - --- Scraper Started ---
2025-04-21 15:10:05,762 - INFO - Fetching list page: https://study-in-egypt.gov.eg/categories/All
2025-04-21 15:10:15,770 - INFO - Found 303 cards on the list page.
2025-04-21 15:10:15,773 - INFO - Card 1 (Cairo Higher Institute for Languages, Interpretation, Administrative Sciences and Computer - Mokattam): Found Location: Mokattam - the commercial station - 5 Street 54 Cairo Egypt
2025-04-21 15:10:15,774 - INFO - Card 1 (Cairo Higher Institute for Languages, Interpretation, Administrative Sciences and Computer - Mokattam): Found Category (Fallback): High institute
2025-04-21 15:10:15,777 - INFO - Card 2 (Ain Shams University ASU): Found Location: El-Khalifa El-Maamoun Street, El-Abbassia, 11566 Cairo Egypt
2025-04-21 15:10:15,779 - INFO - Card 2 (Ain Shams University ASU): Found Category (Fallback): Public university
2025-04-21 15:10:15,781 - INFO - Card 3 (Al Azhar University): Found Location: Autostrad Road, Nasr City Cairo E