# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import schedule
import time
import logging
import random

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_and_update_excel():
    """Scrape the offered-courses listing and write it to ``course_data.xlsx``.

    The spreadsheet is only written when the scrape returned at least one
    course; otherwise a warning is logged and nothing is saved.
    """
    source_url = 'https://rds2.northsouth.edu/index.php/common/showofferedcourses'
    scraped = get_course_data(source_url)

    # Guard clause: nothing scraped means there is nothing to save.
    if not scraped:
        logging.warning('No courses found to save.')
        return

    save_courses_to_excel(scraped, 'course_data.xlsx')

def _fetch_course_page(session, url, page, max_retries, timeout):
    """Fetch one listing page, retrying timeouts with exponential backoff.

    Returns the successful response, or ``None`` when the page could not be
    fetched (timeout retries exhausted, or a non-timeout request error).
    """
    for attempt in range(1, max_retries + 1):
        try:
            logging.info(f'Fetching page {page}')
            response = session.get(url, params={'page': page}, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.Timeout:
            logging.warning(f'Timeout occurred. Retrying... ({attempt}/{max_retries})')
            # No point backing off after the final attempt: nothing follows it.
            if attempt < max_retries:
                time.sleep(2 ** attempt)  # Exponential backoff
        except requests.RequestException as e:
            # Non-timeout failures (HTTP errors, connection errors) are not retried.
            logging.error(f'Error fetching data: {e}')
            return None

    logging.error(f'Max retries reached for page {page}. Skipping to next page.')
    return None


def _parse_course_rows(html, page):
    """Extract 7-field course tuples from one page of HTML.

    Returns an empty list when the ``offeredCourseTbl`` table or its rows are
    missing. Rows that do not have exactly 7 ``td`` cells are logged and skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')

    table = soup.find('table', id='offeredCourseTbl')
    if table is None:
        logging.warning(f'No table found on page {page}')
        return []

    # The table may have no explicit <tbody>; search the table itself in that
    # case instead of calling find_all on None.
    body = table.find('tbody')
    if body is None:
        body = table

    rows = body.find_all('tr')
    if not rows:
        logging.warning(f'No rows found on page {page}')
        return []

    parsed = []
    for row in rows:
        cols = row.find_all('td')
        if len(cols) == 7:  # Ensure there are exactly 7 columns
            # Order: index, course name, section, faculty, time, room, seats.
            parsed.append(tuple(col.text.strip() for col in cols))
        else:
            logging.warning(f'Row with unexpected number of columns: {len(cols)}')
    return parsed


def get_course_data(url, max_pages=2, max_retries=3, timeout=10):
    """Scrape course rows from pages ``1..max_pages`` of ``url``.

    Each page is requested with a ``page`` query parameter. A page that cannot
    be fetched, or that contains no course table/rows, is logged and skipped;
    the loop always advances, so the function terminates after at most
    ``max_pages`` pages.

    Args:
        url: Address of the course listing.
        max_pages: Number of pages to request, starting at page 1.
        max_retries: Attempts per page when the request times out.
        timeout: Per-request timeout in seconds, passed to ``session.get``.

    Returns:
        A list of 7-tuples of stripped cell text:
        ``(index, course_name, section, faculty, course_time, room_number,
        seats_available)``. Whatever was collected before an unexpected error
        is still returned.
    """
    courses = []

    # The context manager closes the session (and its connections) on exit.
    with requests.Session() as session:
        try:
            for page in range(1, max_pages + 1):
                start_time = time.time()

                response = _fetch_course_page(session, url, page, max_retries, timeout)
                if response is not None:
                    courses.extend(_parse_course_rows(response.content, page))
                    # Elapsed time covers any retries plus parsing.
                    logging.info(f'Page {page} fetched in {time.time() - start_time:.2f} seconds')

                # Adding a random delay to avoid hitting the server too quickly;
                # skipped after the last page since no request follows it.
                if page < max_pages:
                    time.sleep(random.uniform(1, 3))
        except Exception as e:
            # Best-effort scrape: log and return what was collected so far.
            logging.error(f'Unexpected error occurred: {e}')

    return courses

def save_courses_to_excel(courses, filename):
    """Write the scraped course tuples to an Excel file at ``filename``.

    ``courses`` is a sequence of 7-field rows matching the column headers
    below. Any failure while building the frame or writing the file is logged
    rather than raised, so the function always returns ``None``.
    """
    headers = [
        'Index',
        'Course Name',
        'Section',
        'Faculty',
        'Time',
        'Room Number',
        'Seats Available',
    ]
    try:
        # index=False keeps the pandas row index out of the spreadsheet.
        pd.DataFrame(courses, columns=headers).to_excel(filename, index=False)
        logging.info(f'Successfully saved courses to {filename}')
    except Exception as err:
        logging.error(f'Error saving courses to Excel: {err}')

# Register the scraper with the scheduler at import time so it re-runs
# every 5 minutes once the scheduler loop is being serviced.
schedule.every(5).minutes.do(scrape_and_update_excel)


def main():
    """Run one scrape immediately, then service scheduled jobs forever."""
    scrape_and_update_excel()  # Initial run to create the file

    # Poll once per second; run_pending() fires any job whose interval elapsed.
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == "__main__":
    main()
