# Preparing the data.

## Step 1: Retrieve information on the average class size of ST modules in 2021/22.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# We define a function to scrape the information on average class size.
def scrape_data(url, timeout=30):
    """Fetch a course-guide page and extract its "average class size 2021/22" value.

    Parameters
    ----------
    url : str
        Address of the course-guide page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without a
        timeout ``requests.get`` may wait indefinitely on an unresponsive server.

    Returns
    -------
    tuple
        ``(value, soup)``. ``soup`` is the parsed page, or ``None`` when the
        page could not be retrieved. ``value`` is the text after the last ``:``
        of the matching paragraph, or one of the placeholder messages
        ``"Average class size text not found."``, ``"Attribute error caught."``
        or ``"Webpage not found."``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported like a missing page so that
        # a single unreachable URL does not abort the whole scraping loop.
        return "Webpage not found.", None

    if response.status_code != 200:
        return "Webpage not found.", None

    soup = BeautifulSoup(response.text, "lxml")
    try:
        # Case-insensitive search for the <p> whose string mentions the class size.
        info_tag = soup.find(
            'p',
            string=lambda text: "average class size 2021/22" in (text or "").lower(),
        )
        if info_tag and ':' in info_tag.text:
            # Keep only what follows the last colon, e.g. "...: 28" -> "28".
            return info_tag.text.split(':')[-1].strip(), soup
        return "Average class size text not found.", soup
    except AttributeError:
        return "Attribute error caught.", soup

# We define a function to extract the course code from the web page heading.
def extract_course_code_from_heading(soup):
    """Return the course code shown in the page heading.

    Looks up the ``<span id="courseCode">`` element in ``soup`` and returns its
    text with surrounding whitespace removed. When no such element is found,
    the placeholder ``"Course code not found."`` is returned instead.
    """
    heading_span = soup.find("span", id="courseCode")
    if not heading_span:
        return "Course code not found."
    return heading_span.text.strip()

# We build the list of 2022/23 course-guide URLs of the ST modules we want to scrape.
# Every page follows the same address pattern, so only the course numbers are listed.
urls = [
    f'https://www.lse.ac.uk/resources/calendar2022-2023/courseGuides/ST/2022_ST{course_number}.htm'
    for course_number in (
        101, 102, 107, 109, 110, 115,
        201, 202, 205, 206, 207, 211, 213, 226, 227,
        300, 301, 302, 303, 304, 306, 307, 308, 309, 310, 311, 312, 313, 326, 327, 330,
        405, 409, 411, 416, 418, 422, 425, 426, 429, 433, 436, 439, 440, 442, 443,
        444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 498, 499,
    )
]

# We start from an empty list; the scraping loop below appends one record per URL.
data = list()

# We add a category describing the year of the module, e.g. First Year, Second Year, Third Year.
def categorise_course(code):
    """Map a course code to a year label based on its third character.

    Parameters
    ----------
    code : str
        Course code such as ``"ST101"``; the character at index 2 selects the
        label.

    Returns
    -------
    str
        ``"First Year"``, ``"Second Year"``, ``"Third Year"`` or
        ``"Fourth Year"`` for the digits 1-4, and the string ``"None"`` for
        anything else, including codes shorter than three characters.

    Notes
    -----
    A later cell (Step 3) defines another function with this same name that
    takes a URL; after that cell has run, this version is shadowed.
    """
    year_labels = {
        "1": "First Year",
        "2": "Second Year",
        "3": "Third Year",
        "4": "Fourth Year",
    }
    # Slicing instead of indexing yields "" for codes shorter than three
    # characters, so they fall through to "None" rather than raising IndexError.
    return year_labels.get(code[2:3], "None")
    
# We scrape every URL in turn and store one record per course.
for url in urls:
    class_size, soup = scrape_data(url)
    # Without a parsed page there is no heading to read, so placeholders are used.
    course_code = extract_course_code_from_heading(soup) if soup else "Webpage not found."
    course_year = categorise_course(course_code) if soup else "None"
    data.append(
        {
            "Course Code": course_code,
            "Course Year": course_year,
            "Average Class Size": class_size,
        }
    )

# We turn the collected records into a DataFrame and display it.
class_size_df = pd.DataFrame(data)
class_size_df



Unnamed: 0,Course Code,Course Year,Average Class Size
0,ST101,First Year,28
1,ST102,First Year,26
2,ST107,First Year,20
3,ST109,First Year,35
4,ST110,First Year,1
...,...,...,...
57,ST455,Fourth Year,12
58,ST456,Fourth Year,29
59,ST457,Fourth Year,Unavailable
60,ST498,Fourth Year,Unavailable


## Step 2: Clean the DataFrame and compute the student-teacher ratio.

In [3]:
# We derive a new DataFrame in one chain, leaving class_size_df itself unchanged:
#   1. values that are not numbers (such as 'Unavailable') become NaN via errors='coerce',
#   2. the rows without a numeric class size are dropped,
#   3. the student-teacher ratio is computed as 1 / average class size.
cleaned_class_size_df = (
    class_size_df
    .assign(**{
        'Average Class Size': lambda frame: pd.to_numeric(frame['Average Class Size'], errors='coerce')
    })
    .dropna(subset=['Average Class Size'])
    .assign(**{
        'Student-Teacher Ratio': lambda frame: 1 / frame['Average Class Size']
    })
)

cleaned_class_size_df


Unnamed: 0,Course Code,Course Year,Average Class Size,Student-Teacher Ratio
0,ST101,First Year,28.0,0.035714
1,ST102,First Year,26.0,0.038462
2,ST107,First Year,20.0,0.05
3,ST109,First Year,35.0,0.028571
4,ST110,First Year,1.0,1.0
5,ST115,First Year,29.0,0.034483
6,ST201,Second Year,9.0,0.111111
7,ST202,Second Year,28.0,0.035714
8,ST205,Second Year,15.0,0.066667
9,ST206,Second Year,5.0,0.2


## Step 3: Determine if the course is project or exam based.

In [4]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_percentage(text, keyword):
    """Sum the integer percentages that follow each occurrence of ``keyword``.

    Parameters
    ----------
    text : str
        Text to search, e.g. the lower-cased assessment description.
    keyword : str
        Literal word to look for, such as ``"exam"`` or ``"project"``.

    Returns
    -------
    int
        Sum of every ``<digits>%`` value found after an occurrence of
        ``keyword`` with only non-digit characters in between; 0 when there is
        no such occurrence.

    Notes
    -----
    The gap between the keyword and the number may contain any non-digit
    characters, so it can span a sentence boundary: in
    ``"summer exam period. project (25%)"`` the 25% is also counted for
    ``"exam"``.
    """
    # re.escape makes the keyword match literally even when it contains regex
    # metacharacters such as "." or "(".
    pattern = rf"{re.escape(keyword)}[^0-9]*?(\d+)%"
    # sum() of an empty sequence is 0, which covers the "no match" case.
    return sum(int(match) for match in re.findall(pattern, text))

# We define a function that compares the exam and project percentages of a module to determine whether it is mainly exam-based or project-based.
def categorise_course(url, timeout=30):
    """Classify a course as exam-based or project-based from its course-guide page.

    Parameters
    ----------
    url : str
        Address of the course-guide page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without a
        timeout ``requests.get`` may wait indefinitely on an unresponsive server.

    Returns
    -------
    tuple of str
        ``(category, course_code)``. ``category`` is ``"Exam-based"`` when the
        exam percentages found in the assessment section sum to at least 50,
        otherwise ``"Project-based"`` when the project percentages do, and
        ``"Uncategorized"`` when neither does or the assessment section is
        missing. Exams are checked first, so a 50/50 split counts as
        exam-based. If the page cannot be retrieved the result is
        ``("Webpage not found", "Course code not found")``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported like a missing page so that
        # a single unreachable URL does not abort the whole loop.
        return "Webpage not found", "Course code not found"

    if response.status_code != 200:
        return "Webpage not found", "Course code not found"
    soup = BeautifulSoup(response.text, "lxml")

    assessment_div = soup.find("div", id="assessment-Content")
    if not assessment_div:
        return "Uncategorized", "Course code not found"

    # Lower-casing lets the keyword search ignore capitalisation.
    assessment_text = assessment_div.get_text(strip=True).lower()

    exam_percentage = get_percentage(assessment_text, "exam")
    project_percentage = get_percentage(assessment_text, "project")

    if exam_percentage >= 50:
        category = "Exam-based"
    elif project_percentage >= 50:
        category = "Project-based"
    else:
        category = "Uncategorized"

    course_code = extract_course_code_from_heading(soup)
    return category, course_code

# We define a function to extract the course codes.
def extract_course_code_from_heading(soup):
    """Return the stripped text of the ``<span id="courseCode">`` element.

    Falls back to the placeholder ``"Course code not found"`` when ``soup``
    contains no such element. This definition replaces the one from Step 1,
    whose placeholder ends with a full stop.
    """
    code_span = soup.find("span", id="courseCode")
    return code_span.text.strip() if code_span else "Course code not found"

# We provide the 2021/22 course-guide URLs we want to scrape.
# Every page follows the same address pattern, so only the course numbers are listed.
urls = [
    f'https://www.lse.ac.uk/resources/calendar2021-2022/courseGuides/ST/2021_ST{course_number}.htm'
    for course_number in (
        101, 102, 107, 108, 109, 110, 115,
        201, 202, 205, 206, 207, 211, 213, 226, 227,
        300, 301, 302, 303, 304, 306, 307, 308, 309, 310, 311, 312, 326, 327, 330,
        405, 409, 411, 416, 418, 422, 425, 426, 429, 433, 436, 439, 440, 442, 443,
        444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 498, 499,
    )
]

# We collect one record per course and build the DataFrame in a single step.
# Growing a DataFrame with pd.concat inside the loop re-copies all earlier rows
# on every iteration.
category_records = []

for url in urls:
    category, course_code = categorise_course(url)
    category_records.append({"Course Code": course_code, "Category": category})

# Passing the columns explicitly keeps the expected layout even when no URL was processed.
df_courses = pd.DataFrame(category_records, columns=["Course Code", "Category"])

df_courses


Unnamed: 0,Course Code,Category
0,ST101,Project-based
1,ST102,Exam-based
2,ST107,Exam-based
3,ST108,Exam-based
4,ST109,Exam-based
...,...,...
56,ST454,Project-based
57,ST455,Exam-based
58,ST456,Exam-based
59,ST498,Project-based


## Step 4: Obtain data on the average score for each course.

In [6]:
import pdfplumber
import pandas as pd
import requests
from io import BytesIO
import re

# We describe each results PDF (undergraduate first, then master's) with a dictionary holding
# its URL, the regex patterns used to read the course code and the average score, and the
# slice of pages to process.
pdfs_info = [
    dict(
        url="https://info.lse.ac.uk/staff/divisions/academic-registrars-division/systems/Assets/PDF/LSEStatistics/UG-mod-rslts-2022/UG-mod-rslts-Sep-2022/ST-results-2021-22.pdf",
        course_code_pattern=r"(ST\d{3}):",
        average_score_pattern=r"2021/22\s+\d+\s+(\d+\.\d+)",
        data_pages=slice(2, None),  # skip the first two pages
    ),
    dict(
        url="https://info.lse.ac.uk/staff/divisions/academic-registrars-division/systems/Assets/PDF/LSEStatistics/PG-Masters-2022/ST-masters-results-2021-22.pdf",
        course_code_pattern=r"(ST\d{3}):",
        average_score_pattern=r"2021/22\s+\d+\s+(\d+\.\d+)",
        data_pages=slice(2, None),  # skip the first two pages
    ),
]

# We gather one record per PDF page and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all earlier rows
# on every iteration.
score_records = []

# We process each PDF in turn.
for pdf_info in pdfs_info:
    try:
        response = requests.get(pdf_info["url"], timeout=30)
    except requests.RequestException as error:
        # A connection problem with one PDF should not prevent processing the others.
        print(f"Failed to retrieve the PDF from {pdf_info['url']}: {error}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the PDF from {pdf_info['url']}, status code: {response.status_code}")
        continue

    with pdfplumber.open(BytesIO(response.content)) as pdf:
        for page in pdf.pages[pdf_info["data_pages"]]:
            # Fall back to "" in case a page yields no text, so that the regex
            # searches below report "not found" instead of raising TypeError.
            text = page.extract_text() or ""

            # group(1) is the captured code without the trailing colon.
            course_code_match = re.search(pdf_info["course_code_pattern"], text)
            course_code = course_code_match.group(1) if course_code_match else "Course code not found."

            average_score_match = re.search(pdf_info["average_score_pattern"], text)
            average_score = average_score_match.group(1) if average_score_match else "Average score not found."

            score_records.append({"Course Code": course_code, "Average Score 2021/22": average_score})

# Passing the columns explicitly keeps the expected layout even when no page was processed.
course_df = pd.DataFrame(score_records, columns=["Course Code", "Average Score 2021/22"])

course_df

Unnamed: 0,Course Code,Average Score 2021/22
0,ST101,73.0
1,ST102,64.5
2,ST107,64.7
3,ST108,Average score not found.
4,ST109,73.1
5,ST115,66.1
6,ST201,64.0
7,ST202,62.2
8,ST205,62.8
9,ST206,65.3


## Step 5: Clean our current DataFrame.

In [7]:
# With errors='coerce', entries that are not numbers (such as "Average score not found.")
# are converted to NaN; the rows holding those NaN values are then removed.
score_column = "Average Score 2021/22"
course_df[score_column] = pd.to_numeric(course_df[score_column], errors='coerce')
course_df.dropna(subset=[score_column], inplace=True)
course_df


Unnamed: 0,Course Code,Average Score 2021/22
0,ST101,73.0
1,ST102,64.5
2,ST107,64.7
4,ST109,73.1
5,ST115,66.1
6,ST201,64.0
7,ST202,62.2
8,ST205,62.8
9,ST206,65.3
10,ST207,83.4


## Step 6: We merge all the dataframes so we have one cohesive dataset. 

In [8]:
# Inner joins on "Course Code" keep only the courses present in all three DataFrames:
# class sizes first joined with the assessment categories, then with the average scores.
merged_df_first = cleaned_class_size_df.merge(df_courses, on="Course Code", how="inner")
merged_df_second = merged_df_first.merge(course_df, on="Course Code", how="inner")

merged_df_second


Unnamed: 0,Course Code,Course Year,Average Class Size,Student-Teacher Ratio,Category,Average Score 2021/22
0,ST101,First Year,28.0,0.035714,Project-based,73.0
1,ST102,First Year,26.0,0.038462,Exam-based,64.5
2,ST107,First Year,20.0,0.05,Exam-based,64.7
3,ST109,First Year,35.0,0.028571,Exam-based,73.1
4,ST115,First Year,29.0,0.034483,Project-based,66.1
5,ST201,Second Year,9.0,0.111111,Exam-based,64.0
6,ST202,Second Year,28.0,0.035714,Exam-based,62.2
7,ST205,Second Year,15.0,0.066667,Exam-based,62.8
8,ST206,Second Year,5.0,0.2,Exam-based,65.3
9,ST207,Second Year,11.0,0.090909,Project-based,83.4
