In [20]:
import requests
import os
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
from selenium.common.exceptions import TimeoutException

In [86]:
def fetch_and_save_stats(url, filename_prefix, div_id=None, div_class=None, section_class=None):
    # Initialize the WebDriver
    driver = webdriver.Chrome()

    try:
        # Load the webpage
        driver.get(url)

        # Wait for the div with the specified ID, class, or section class to load
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        if div_id:
            wait.until(EC.presence_of_element_located((By.ID, div_id)))
        elif div_class:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, div_class)))
        elif section_class:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, section_class)))

        # Get page source and parse it with BeautifulSoup
        page_source = driver.page_source
        soup = bs(page_source, 'html.parser')

        # Find the target div or section
        target_element = None
        if div_id:
            target_element = soup.find('div', {'id': div_id})
        elif div_class:
            target_element = soup.find('div', {'class': div_class})
        elif section_class:
            target_element = soup.find('section', {'class': section_class})

        if not target_element:
            print(f"Error: element with {'id' if div_id else 'class' if div_class else 'section class'} '{div_id if div_id else div_class if div_class else section_class}' not found.")
            return

        # Extract all relevant content from the target element
        content = {}

        # Extract headers (h1, h2, h3, etc.) within the target element
        headers = target_element.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        content['headers'] = [header.text.strip() for header in headers if header and header.text]

        # Extract paragraphs within the target element
        paragraphs = target_element.find_all('p')
        content['paragraphs'] = [paragraph.text.strip() for paragraph in paragraphs if paragraph and paragraph.text]

        # Find all tables within the target element
        tables = target_element.find_all('table')
        content['tables'] = []

        if tables:
            # Iterate through each table and extract data
            for table_idx, table in enumerate(tables):
                table_data = extract_table_data(table, table_idx)
                if table_data:
                    content['tables'].append(table_data)

        # Extract course information within includeBox class
        include_boxes = target_element.find_all('div', class_='includeBox')
        content['courses'] = extract_course_data(include_boxes)

        # Save data to files
        save_data(content, filename_prefix)

    except Exception as e:
        print(f"Error occurred: {e}")
    finally:
        # Ensure the WebDriver is closed properly
        driver.quit()

def extract_table_data(table, table_idx):
    """Extracts data from a table element."""
    headers = []
    thead = table.find('thead')
    if not thead:
        print(f"Error: thead not found in table {table_idx}.")
        return None

    # Assuming the first two rows are header rows
    first_row = thead.find_all('tr')[0]
    second_row = thead.find_all('tr')[1]

    # Combine headers
    combined_headers = []
    first_row_headers = []

    for th in first_row.find_all('th'):
        colspan = int(th.get('colspan', 1))
        label = th.text.strip() if th.text else ''
        first_row_headers.extend([label] * colspan)

    for idx, th in enumerate(second_row.find_all('th')):
        aria_label = th.text.strip() if th.text else ''
        broader_category = first_row_headers[idx]
        combined_header = f"{broader_category} - {aria_label}" if broader_category else aria_label
        combined_headers.append(combined_header)

    # Flatten headers for CSV
    flattened_headers = [header.replace(" - ", "-") for header in combined_headers]

    # Extract data from tbody
    semester_course = []
    tbody = table.find('tbody')
    if tbody:
        rows = tbody.find_all('tr')
        for row in rows:
            first_column = row.find('th').text.strip() if row.find('th') else ''
            columns = [col.text.strip() for col in row.find_all('td')]
            if not columns:
                continue

            # Create a dictionary for each row
            row_data = {flattened_headers[0]: first_column}  # First column (e.g., rank)
            for idx, column in enumerate(columns):
                row_data[flattened_headers[idx + 1]] = column

            semester_course.append(row_data)

    return semester_course

def extract_course_data(include_boxes):
    """Extracts course data from includeBox class."""
    course_data = []

    for box in include_boxes:
        strong_text = box.find('strong').text.strip() if box.find('strong') else 'N/A'
        teacher_text = box.find('em').next_sibling.strip() if box.find('em') and box.find('em').next_sibling else 'N/A'
        code_text = box.find(string=lambda text: 'CS-' in text)
        code_text = code_text.strip() if code_text else 'N/A'

        # Extract all links and titles
        links = box.find_all('a')
        course_links = [
            {'title': link.find('strong').text.strip() if link.find('strong') else 'N/A', 'url': link['href']} 
            for link in links if link and link.get('href')
        ]

        # Compile all course data
        course_data.append({
            'Course Title': strong_text,
            'Teacher': teacher_text,
            'Code': code_text,
            'Links': course_links
        })

    return course_data

def save_data(content, filename_prefix):
    """Saves the extracted data to text and CSV files."""
    # Save headers and paragraphs to a text file
    txt_filename = f"{filename_prefix}.txt"
    with open(txt_filename, 'w', encoding='utf-8') as f:
        f.write("Headers:\n")
        for header in content['headers']:
            f.write(f"{header}\n")
        f.write("\nParagraphs:\n")
        for paragraph in content['paragraphs']:
            f.write(f"{paragraph}\n")

    print(f"Content saved to {txt_filename}")

    # Save each table's data to CSV files
    for idx, table in enumerate(content['tables']):
        if table:
            df = pd.DataFrame(table)
            csv_filename = f"{filename_prefix}_table_{idx}.csv"
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"Table {idx} saved to {csv_filename}")

    # Save course data to a CSV file
    if content['courses']:
        course_df = pd.DataFrame(content['courses'])
        course_csv_filename = f"{filename_prefix}_courses.csv"
        course_df.to_csv(course_csv_filename, index=False, encoding='utf-8')
        print(f"Course data saved to {course_csv_filename}")



In [72]:
urls = {
    "ethz_computer_science_bachelor": {
    "url": "https://inf.ethz.ch/studies/bachelor.html",
    "div_id": "contentContainer",    
    },
    "ethz_computer_science_master": {
    "url": "https://inf.ethz.ch/studies/master/master-cs-2020.html",
    "div_id": "contentContainer",    
    },
    "ethz_data_science_master": {
    "url": "https://inf.ethz.ch/studies/master/master-ds.html",
    "div_id": "contentContainer",    
    },
    "ethz_data_management_anh_machine_learning_research": {
    "url": "https://inf.ethz.ch/research/data-management-machine-learning.html",
    "div_id": "contentContainer",    
    },
    "ethz_intelligent_interactive_research": {
    "url": "https://inf.ethz.ch/research/intelligent-interactive-systems-and-physical-computing.html",
    "div_id": "contentContainer",    
    },
    "ethz_visual_computing_research": {
    "url": "https://inf.ethz.ch/research/visual-computing.html",
    "div_id": "contentContainer",    
    },
    "ethz_computer_science_lab": {
    "url": "https://inf.ethz.ch/department/rooms-labs-beamers.html",
    "div_id": "contentContainer",    
    },
}

In [80]:
urls = {
    "usi_computer_science_bachelor": {
    "url": "https://www.usi.ch/en/education/bachelor/informatics/structure-and-contents",
    "div_id": None,
    "div_class": "page_content",    
    },
    "usi_data_science_bachelor": {
    "url": "https://www.usi.ch/en/education/bachelor/data-science/structure-and-contents",
    "div_id": None,
    "div_class": "page_content",    
    },
}

In [87]:
urls = {
    "unibas_computer_science_bachelor": {
    "url": "https://www.unibas.ch/en/Studies/Degree-Programs/Degree-Programs.html?study=Computer-Science-BSc",
    "div_id": None,
    "section_class": "content_block clearfix narrow-article-content",
    "div_class": "content_wides tudysubject",    
    },
    "unibas_computer_science_bachelor_associate": {
    "url": "https://www.unibas.ch/en/Studies/Degree-Programs/Degree-Programs.html?study=Computer-Science-Ausserfakultaeres-Bachelorstudienfach",
    "div_id": None,
    "section_class": "content_block clearfix narrow-article-content",
    "div_class": "content_wide studysubject",    
    },
    "unibas_computer_science_master": {
    "url": "https://www.unibas.ch/en/Studies/Degree-Programs/Degree-Programs.html?study=Computer-Science-MSc",
    "div_id": None,
    "section_class": "content_block clearfix narrow-article-content",
    "div_class": "content_wide studysubject",    
    },
    "unibas_computer_science_master_associate": {
    "url": "https://www.unibas.ch/en/Studies/Degree-Programs/Degree-Programs.html?study=Computer-Science-Ausserfakultaeres-Masterstudienfach",
    "div_id": None,
    "section_class": "content_block clearfix narrow-article-content",
    "div_class": "content_wide studysubject",    
    },
    "unibas_data_science_master": {
    "url": "https://www.unibas.ch/en/Studies/Degree-Programs/Degree-Programs.html?study=Computer-Science-MSc0",
    "div_id": None,
    "section_class": "content_block clearfix narrow-article-content",
    "div_class": "content_wide studysubject",    
    },
}

In [88]:
if not os.path.exists('crawled'):
    os.makedirs('crawled')

# Fetch data
for stat_type, info in urls.items():
    fetch_and_save_stats(info['url'], f"./crawled/data_{stat_type}", info['div_id'], info['div_class'], info['section_class'])

Error occurred: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF673BC3AB5+28005]
	(No symbol) [0x00007FF673B283B0]
	(No symbol) [0x00007FF6739C580A]
	(No symbol) [0x00007FF673A15A3E]
	(No symbol) [0x00007FF673A15D2C]
	(No symbol) [0x00007FF673A5EA97]
	(No symbol) [0x00007FF673A3BA7F]
	(No symbol) [0x00007FF673A5B8B3]
	(No symbol) [0x00007FF673A3B7E3]
	(No symbol) [0x00007FF673A075C8]
	(No symbol) [0x00007FF673A08731]
	GetHandleVerifier [0x00007FF673EB643D+3118829]
	GetHandleVerifier [0x00007FF673F06C90+3448640]
	GetHandleVerifier [0x00007FF673EFCF0D+3408317]
	GetHandleVerifier [0x00007FF673C8A40B+841403]
	(No symbol) [0x00007FF673B3340F]
	(No symbol) [0x00007FF673B2F484]
	(No symbol) [0x00007FF673B2F61D]
	(No symbol) [0x00007FF673B1EB79]
	BaseThreadInitThunk [0x00007FFA4C437374+20]
	RtlUserThreadStart [0x00007FFA4E31CC91+33]



KeyboardInterrupt: 

In [5]:
files = [
    "./crawled/data_standard_stats.csv",
    "./crawled/data_goalkeeping_stats.csv",
    "./crawled/data_shooting_stats.csv",
    "./crawled/data_passing_stats.csv",
    "./crawled/data_pass_types_stats.csv",
    "./crawled/data_goal_and_shot_creation_stats.csv",
    "./crawled/data_defensive_actions_stats.csv",
    "./crawled/data_possession_stats.csv",
    "./crawled/data_playing_time_stats.csv",
    "./crawled/data_miscellaneous_stats.csv"
]

In [6]:
def read_csv(file):
    return pd.read_csv(file)

In [7]:
base_df = read_csv("./crawled/data_standard_stats.csv")

# Merge other dataframes onto the base dataframe
for file in files[1:]: 
    if os.path.exists(file):
        df = read_csv(file)
        base_df = pd.merge(base_df, df, on=['Player', 'Pos', 'Squad'], how='left', suffixes=('', '_y'))
        base_df = base_df.loc[:, ~base_df.columns.str.endswith('_y')]

In [8]:
# Replace missing statistics 
base_df = base_df.fillna('N/a')
base_df = base_df.drop(columns=['Rk', 'Matches'], errors='ignore')

# Ensure 'Playing Time-Min' is numeric 
base_df['Playing Time-Min'] = base_df['Playing Time-Min'].replace({',': ''}, regex=True)
base_df['Playing Time-Min'] = pd.to_numeric(base_df['Playing Time-Min'], errors='coerce')

# Players who have played more than 90 minutes 
filtered_df = base_df[base_df['Playing Time-Min'] > 90]

# Sort 
sorted_df = filtered_df.sort_values(by=['Player', 'Age'], ascending=[True, False])

In [None]:
# Print results
print(f"Data:")
print(sorted_df.head())  

In [10]:
# Save the results
sorted_df.to_csv("./crawled/merged_premier_league_stats.csv", index=False)
sorted_df.to_csv("./crawled/result.csv", index=False)