In [None]:
# DATA ACQUISITION PART 1
# ATUHAIRE PAULINE 
# B35093

In [10]:
# Import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import lxml
from io import StringIO

In [3]:
# Target page for the scrape: the CWUR 2025 ranking.
# The site was previewed beforehand and its data judged
# suitable for the tasks in this exercise.
url = "https://cwur.org/2025.php"

In [4]:
# Request headers carrying a desktop-Chrome User-Agent string so the
# request looks like it comes from a regular browser.
user_agent = " ".join([
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/116.0.0.0 Safari/537.36",
])
headers = {"User-Agent": user_agent}

In [5]:
# Fetching the web page
# requests applies no timeout by default, so an unresponsive server would
# leave this cell hanging forever; give up after 30 seconds instead.
resp = requests.get(url, headers=headers, timeout=30)
# Raise on 4xx/5xx responses rather than continuing with an error page.
resp.raise_for_status()

# Force UTF-8 decoding instead of relying on the charset requests infers
# from the response (presumably the page is UTF-8 encoded -- confirm).
resp.encoding = 'utf-8'
html = resp.text

print("Page fetched successfully, length:", len(html))

Page fetched successfully, length: 429713


In [6]:
# Parsing HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Locate the rankings table. Inspecting the website shows that the
# table of interest has id="cwurTable".
table = soup.find("table", id="cwurTable")

# Fallback: if no element with that id exists, take the first <table>
# in the document (whatever its size).
if table is None:
    table = soup.find("table")

# Fail here with a clear message. Otherwise the None would be passed on,
# converted to the string "None" and rejected by the HTML table parser
# with a much less obvious error.
if table is None:
    raise ValueError("No <table> element found in the fetched page.")

In [7]:
# Turn the selected <table> element into a DataFrame.
# read_html returns a list of DataFrames, one per table found in the
# markup it is given; only a single table is passed in, so take item 0.
parsed_tables = pd.read_html(StringIO(str(table)))
df = parsed_tables[0]
df.head()

Unnamed: 0,World Rank,Institution,Location,National Rank,Education Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1 Top 0.1%,Harvard University,USA,1,1,1,1,1,100.0
1,2 Top 0.1%,Massachusetts Institute of Technology,USA,2,4,12,2,11,96.8
2,3 Top 0.1%,Stanford University,USA,3,10,4,3,4,95.2
3,4 Top 0.1%,University of Cambridge,United Kingdom,1,2,26,4,14,94.1
4,5 Top 0.1%,University of Oxford,United Kingdom,2,7,28,9,6,93.3


In [8]:
# Inspecting the DataFrame: report its dimensions and list its columns.
rows, cols = df.shape
print(f"The dataset has {rows} samples and {cols} features.")
# Convert each label with str() before joining: str.join raises TypeError
# on non-string labels (e.g. integers or tuples), which pandas may produce
# when a table has no plain single-row header.
print(f"Features: {', '.join(map(str, df.columns))}")

The dataset has 2000 samples and 9 features.
Features: World Rank, Institution, Location, National Rank, Education Rank, Employability Rank, Faculty Rank, Research Rank, Score


In [9]:
# Write the DataFrame to data/cwur_2025.csv (relative to the working
# directory), creating the "data" folder first if it does not exist.
output_dir = Path("data")
output_dir.mkdir(exist_ok=True)
output_path = output_dir / "cwur_2025.csv"
df.to_csv(output_path, index=False)  # index=False: omit the row index column
print(f"Successfully saved the dataframe to {output_path}")

Successfully saved the dataframe to data\cwur_2025.csv
