In [479]:
import requests
import numpy as np 
import scipy
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from itertools import chain
import re
import pandas as pd

In [501]:
major_link = 'https://www.mcgill.ca/study/2021-2022/faculties/science/undergraduate/programs/bachelor-science-bsc-major-computer-science'
page = requests.get(major_link)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.title.text # gets you the text of the <title>(...)</title>
print(title)

Bachelor of Science (B.Sc.) - Major Computer Science | eCalendar - McGill University


In [502]:
major_courses = soup.find_all(class_="program-course-title")
major_courses = [major_courses[i].contents[0][15:major_courses[i].contents[0].index(")")+1] for i in range(0, len(major_courses))]
major_courses
#print(len(major_courses))

['COMP 202 Foundations of Programming (3 credits)',
 'COMP 206 Introduction to Software Systems (3 credits)',
 'COMP 250 Introduction to Computer Science (3 credits)',
 'COMP 251 Algorithms and Data Structures (3 credits)',
 'COMP 273 Introduction to Computer Systems (3 credits)',
 'COMP 302 Programming Languages and Paradigms (3 credits)',
 'COMP 303 Software Design (3 credits)',
 'COMP 310 Operating Systems (3 credits)',
 'MATH 222 Calculus 3 (3 credits)',
 'MATH 223 Linear Algebra (3 credits)',
 'MATH 240 Discrete Structures (3 credits)',
 'COMP 330 Theory of Computation (3 credits)',
 'COMP 350 Numerical Computing (3 credits)',
 'COMP 360 Algorithm Design (3 credits)',
 'MATH 318 Mathematical Logic (3 credits)',
 'MATH 323 Probability (3 credits)',
 'MATH 324 Statistics (3 credits)',
 'MATH 340 Discrete\r Mathematics (3 credits)']

In [503]:
#Next step: find a list of *required major courses* (not complementary courses)
section_headers = [soup.find_all("h4")[i].text for i in range(0, len(soup.find_all("h4")))]
section_headers

['Required Courses (33 credits)',
 'Complementary Courses (30 credits)',
 '',
 '',
 '',
 '']

In [504]:
all_course_codes = []
all_course_names = []
all_credit_hours = []
all_terms = []
all_prerequisites = []
all_corequisites = []


for text in section_headers:
    if(text.__contains__('Complementary Courses')):
        break
    target = soup.find('h4',text=text)
    for sib in target.find_next_siblings():
        if sib.name=="h4":
            break
        else:
            
            major_courses = sib.find_all(class_="program-course")
            prerequisite_courses = []
            corequisite_courses = []
            for course in major_courses:
                #find course name for each required major course
                major_courses = course.find_all(class_="program-course-title")
                major_courses = [major_courses[i].contents[0][15:major_courses[i].contents[0].index(")")+1].replace("\r","") for i in range(0, len(major_courses))]
                full_course_name = major_courses[0]
                course_code = full_course_name[:8]
                all_course_codes.append(course_code)
                
                course_name = full_course_name[9:full_course_name.index("(")]
                
                num_credit_hours = full_course_name[(full_course_name.index("(")+1):(full_course_name.index("(")+2)]
                all_credit_hours.append(num_credit_hours)
                all_course_names.append(course_name)
                
                prereq_term_combined_info = course.find_all("p")
                
                #extract Fall/Winter term info for each required major course
                for info in prereq_term_combined_info[2:3]:
                    term_course_info = list(info.children)[0]
                    term_course_info = term_course_info[19:]
                    all_terms.append(term_course_info)
                    
                #lastly, extract prerequisites and corequisites (or lack thereof) from each major course
         
                for info in prereq_term_combined_info:
                    term_course_info = info.children
                    term_course_info = list(term_course_info)
                    prerequisite_info = [term_course_info for s in term_course_info if 'Prerequisite' in s]
                    corequisite_info = [term_course_info for s in term_course_info if 'Corequisite' in s]
                    for i in range(0, len(prerequisite_info)):
                        if len(prerequisite_info[i])>0:
                            for j in range(1, len(prerequisite_info[i])):
                                if(isinstance(prerequisite_info[i][j],str) != True):
                                    prerequisite_courses.append(prerequisite_info[i][j].contents)
                    
                    for i in range(0, len(corequisite_info)):
                        if len(corequisite_info[i])>0:
                            for j in range(1, len(corequisite_info[i])):
                                if(isinstance(corequisite_info[i][j],str) != True):
                                    corequisite_courses.append(corequisite_info[i][j].contents)
                
                all_prerequisites.append(', '.join(np.unique(list(np.array(prerequisite_courses).flatten()))))
                all_corequisites.append(', '.join(np.unique(list(np.array(corequisite_courses).flatten()))))



In [509]:
#End goal: CSV file with following columns:
#Course, Prerequisites, Term, # of credit hours
output_df = pd.DataFrame({'Course Code':all_course_codes, 'Course Name':all_course_names, 'Num Credit Hours':all_credit_hours,
                         'Terms Offered':all_terms, 'Prerequisites':all_prerequisites, 'Corequisites':all_corequisites})
output_df.to_csv("major_plan.csv",index=False)
output_df

Unnamed: 0,Course Code,Course Name,Num Credit Hours,Terms Offered,Prerequisites,Corequisites
0,COMP 202,Foundations of Programming,3,"Fall 2021, Winter 2022",,
1,COMP 206,Introduction to Software Systems,3,"Fall 2021, Winter 2022","COMP 202, COMP 250",
2,COMP 250,Introduction to Computer Science,3,"Fall 2021, Winter 2022","COMP 202, COMP 250",
3,COMP 251,Algorithms and Data Structures,3,"Fall 2021, Winter 2022","COMP 202, COMP 250","MATH 235, MATH 240"
4,COMP 273,Introduction to Computer Systems,3,"Fall 2021, Winter 2022","COMP 202, COMP 250","COMP 206, MATH 235, MATH 240"
5,COMP 302,Programming Languages and Paradigms,3,"Fall 2021, Winter 2022","COMP 202, COMP 230, COMP 250, MATH 235, MATH 2...","COMP 206, MATH 235, MATH 240"
6,COMP 303,Software Design,3,"Fall 2021, Winter 2022","COMP 202, COMP 206, COMP 230, COMP 250, MATH 2...","COMP 206, MATH 235, MATH 240"
7,COMP 310,Operating Systems,3,"Fall 2021, Winter 2022","COMP 202, COMP 206, COMP 230, COMP 250, COMP 2...","COMP 206, MATH 235, MATH 240"
8,MATH 222,Calculus 3,3,"Fall 2021, Winter 2022, Summer 2022","COMP 202, COMP 206, COMP 230, COMP 250, COMP 2...","COMP 206, MATH 133, MATH 141, MATH 235, MATH 240"
9,MATH 223,Linear Algebra,3,"Fall 2021, Winter 2022","COMP 202, COMP 206, COMP 230, COMP 250, COMP 2...","COMP 206, MATH 133, MATH 141, MATH 235, MATH 240"
