# Preparing the data.

## Step 1: Retrieving the average class size of ST modules for 2021/22.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# We define a function to scrape the information on average class size.
def scrape_data(url, timeout=10):
    """Download a course-guide page and extract its average class size text.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single unresponsive server would stall the whole scraping loop.

    Returns
    -------
    tuple
        ``(size_text, soup)``. ``size_text`` is the text following the last
        ':' in the first ``<p>`` whose string contains
        'average class size 2021/22' (case-insensitive), or a descriptive
        message when it cannot be extracted. ``soup`` is the parsed page, or
        ``None`` when the page could not be retrieved.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection failures and timeouts are reported the same way as a
        # non-200 answer so that callers only need to handle one failure shape.
        return "Webpage not found.", None

    if response.status_code != 200:
        return "Webpage not found.", None

    soup = BeautifulSoup(response.text, 'lxml')
    try:
        # `text` is None for tags without a single string child, hence the
        # fallback to an empty string before lower-casing.
        info_tag = soup.find(
            'p',
            string=lambda text: 'average class size 2021/22' in (text or "").lower(),
        )
        if info_tag and ':' in info_tag.text:
            return info_tag.text.split(':')[-1].strip(), soup
        return "Average class size text not found.", soup
    except AttributeError:
        return "Attribute error caught.", soup

# We define a function to extract the course code from the web page heading.
def extract_course_code_from_heading(soup):
    """Return the course code shown in the page heading.

    Looks up the ``<span id="courseCode">`` element in ``soup`` and returns
    its text with surrounding whitespace removed. When no such element is
    found, the message "Course code not found." is returned instead.
    """
    heading_span = soup.find('span', id='courseCode')
    if not heading_span:
        return "Course code not found."
    return heading_span.text.strip()

# We make a list of the URLs of the ST modules we want to scrape.
# Every course guide shares the same address pattern; only the three-digit
# course number after "ST" changes.
urls = [
    f'https://www.lse.ac.uk/resources/calendar2022-2023/courseGuides/ST/2022_ST{number}.htm'
    for number in (
        '101', '102', '107', '109', '110', '115',
        '201', '202', '205', '206', '207', '211', '213', '226', '227',
        '300', '301', '302', '303', '304', '306', '307', '308', '309',
        '310', '311', '312', '313', '326', '327', '330',
    )
]

# We make a list to keep our data.
data = []

# Visit each course page and pair the scraped class size with the course code
# read from that same page.
for url in urls:
    class_size, soup = scrape_data(url)
    # scrape_data hands back no soup when the page could not be retrieved.
    course_code = extract_course_code_from_heading(soup) if soup else "Webpage not found."
    data.append({'Course Code': course_code, 'Average Class Size': class_size})

# Assemble the collected rows into a DataFrame and display it.
class_size_df = pd.DataFrame(data)
class_size_df


Unnamed: 0,Course Code,Average Class Size
0,ST101,28
1,ST102,26
2,ST107,20
3,ST109,35
4,ST110,1
5,ST115,29
6,ST201,9
7,ST202,28
8,ST205,15
9,ST206,5


## Step 2: Clean the DataFrame and convert average class size to a student-teacher ratio.

In [2]:
# Build an independent DataFrame whose class-size column is numeric; entries
# that cannot be parsed as numbers (such as scraper messages) become NaN.
# `assign` returns a new object, so `class_size_df` itself is left untouched.
cleaned_class_size_df = class_size_df.assign(**{
    'Average Class Size': pd.to_numeric(class_size_df['Average Class Size'], errors='coerce'),
})

# Discard the rows for which no numeric class size is available.
cleaned_class_size_df.dropna(subset=['Average Class Size'], inplace=True)

# The ratio is stored as the reciprocal of the average class size.
# NOTE(review): 1 / class size is teachers per student rather than students
# per teacher - confirm the column label matches the intended meaning.
cleaned_class_size_df['Student-Teacher Ratio'] = 1 / cleaned_class_size_df['Average Class Size']

cleaned_class_size_df

Unnamed: 0,Course Code,Average Class Size,Student-Teacher Ratio
0,ST101,28.0,0.035714
1,ST102,26.0,0.038462
2,ST107,20.0,0.05
3,ST109,35.0,0.028571
4,ST110,1.0,1.0
5,ST115,29.0,0.034483
6,ST201,9.0,0.111111
7,ST202,28.0,0.035714
8,ST205,15.0,0.066667
9,ST206,5.0,0.2
