In [6]:
import json

import requests
from bs4 import BeautifulSoup as bs

# Getting the majors, minors, honor programs, and fellowships' requirements

In [12]:
def getDegreeLink(soup, id) :
    urls = []
    for link in soup.find(id=id).find_all('a') :
        href = link.get('href')
        if 'college-of-liberal-arts' in href:
            urls.append(href + 'majors-minors/')

    return list(set(urls))

def getRequirements(url, className) :
    page = requests.get(url)
    soup = bs(page.content, "html.parser")

    titles = soup.find_all(class_=className)
    degrees = []

    for title in titles:
        degreeName = title.text
        reqTable = title.find_next_sibling()
        degreeDetail = getReqTableContent(reqTable)
        degreeDetail['Major'] = 'major-title' == className
        
        degreeDetail["Name"] = degreeName
        degrees.append(degreeDetail)

    return degrees


def getReqTableContent(reqTable) :
    rows = reqTable.find_all('tr')
    detail = {}

    for row in rows :
        detail[row.find('th').text] = row.find('td').text

    return detail



In [14]:
def majorMinorReqs():
    url = "https://www.depauw.edu/academics/"
    page = requests.get(url)
    soup = bs(page.content, "html.parser")

    majorURLs = getDegreeLink(soup, 'majors')
    majorsMinors = []
    print(majorURLs)
    for link in majorURLs:
        majorsMinors += getRequirements(link, 'major-title')
        majorsMinors += getRequirements(link, 'minor-title')

    json_object = json.dumps({"Degrees": majorsMinors})
    with open("majorsMinors.json", "w") as outfile:
        outfile.write(json_object)    

majorMinorReqs()

['https://www.depauw.edu/academics/college-of-liberal-arts/africana-studies/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/sociology-and-anthropology/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/chemistry-and-biochemistry/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/physics-astronomy/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/art/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/hispanic/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/modern-languages/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/italian/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/computer-science/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/peace-and-conflict-studies/majors-minors/', 'https://www.depauw.edu/academics/college-of-liberal-arts/economics-management

# Getting the courses information

In [5]:
def getDeptLink(coursesLs, className=None):
    urls = []
    aLs = coursesLs.find_all('a')
    if (className) :
        aLs = coursesLs.find(class_=className).find_all('a')

    for link in aLs :
        href = link.get('href')
        if (href and "https://www.depauw.edu/academics/catalog/courses/" in href):
            urls.append(href)

    # return set to remove duplicates
    return list(set(urls))


def getCourseDetail(courseURL):
    detail = {}
    title = bs(requests.get(courseURL).content, "html.parser").find(class_='form__head')
    courseCode = title.find('small')
    try:
        reqHeader = title.find_next_sibling('table').find_all('th')
        reqRow = title.find_next_sibling('table').find_all('td')
    except:
        return None
    else:

        for i in range (3):
            detail[reqHeader[i].text] = reqRow[i].text
        detail['url'] = courseURL
        detail['course'] = courseCode.text
        
        return detail


def getCourses() :
    url = "https://www.depauw.edu/academics/catalog/courses/"

    page = requests.get(url)
    soup = bs(page.content, "html.parser")
    coursesLs = soup.find(class_='columns_2')

    deptLinks = getDeptLink(coursesLs)
    courseLinks = []

    for deptLink in deptLinks:
        deptSoup = bs(requests.get(deptLink).content, "html.parser")
        courseLinks += getDeptLink(deptSoup, 'content listing')
    

    courses = []
    for each in courseLinks:
        courseDetail = getCourseDetail(each)
        if (courseDetail):
            courses.append(courseDetail)
    
    json_object = json.dumps({"Courses": courses})
    with open("courses.json", "w") as outfile:
        outfile.write(json_object)    



getCourses()

# Processing the data to the correct datatype

### 1. Processing the courses data TODO
1. scrape information regarding competency offered by the courses
2. adjust the data so that it has the same attributes as the Course object

In [21]:
import json
from fractions import Fraction
import re

def updateDistArea(courses):
    # val is a single course. I'm too lazy to rename lmao
    for val in courses:
        distStr = val['Distribution Area']
        distAr = []
        if ("Arts and Humanities" in distStr):
            distAr.append('AH')
        
        if ("Social Science" in distStr):
            distAr.append('SS')

        if ("Global Learning" in distStr):
            distAr.append('GL')

        if ("Science and Mathematics" in distStr):
            distAr.append('SM')

        if ("Privilege, Power And Diversity" in distStr):
            distAr.append('PPD')

        val['Distribution Area'] = distAr


def updateCredits(courses):    
    # val is a single course. I'm too lazy to rename lmao
    for val in courses:
        creditStr = re.sub(r' ([cC](ourse|redit))?.*', '', val['Credits']).split('-')
        credits = []
        for cre in creditStr:
            if (bool(re.search(r'\d', cre))):
                credits.append(float(Fraction(cre)))
        val['Credits'] = credits

def updatePrereqs(courses):
    for course in courses:
        preReqStr = course['Prerequisites']
        if (not preReqStr):
            course['Required courses'] = []
        else:
            course['SPAC'] = True if (re.search(r'consent|permission of instructor', preReqStr, re.IGNORECASE)) else False
            reqCourses = re.split(r'and|;\s', preReqStr) if (re.search(r'or|;', preReqStr)) else re.split(r',\s', preReqStr)
            for i in range (len(reqCourses)):
                reqCourses[i] = re.findall(r'[A-Z]?[A-Za-z]{3}\s\d{3}\w{0,2}', reqCourses[i])
            course['Required courses'] = reqCourses


def process_courses() :
    with open('courses.json', 'r') as f:
        allCourses = json.load(f)
        courses = allCourses['Courses']
        updateDistArea(courses)
        updateCredits(courses)
        updatePrereqs(courses)
        
    with open('updatedCourses.json', 'w') as f:
        json_object = json.dumps({'Courses' : courses})
        f.write(json_object)
        

process_courses()

### Regex for course prerequisites (or requirements in majorsMinors)

1. Check if has `permission of instructor`:
    - Yes: replace `permission of instructor` with `""` and set `SPAC` field to `True`
    - No: set `SPAC` field to `False`

2. The actual prerequisites will be an array. Each element of the array is one requirements. Each element will be a set. Can have duplicated set if required 2+ courses from the same list of courses. i.e., two 300-level courses will be displayed as 2 sets (the same) of all the 300-level courses.

3. How to produce the correct regex to solve this
    - regex of a course: `[A-Z]?[A-Za-z]{3}\s\d{3}\w?`
    - replace all the `/` with ` or `

In [3]:
import re
s = "MATH 331 or MATH 336/ECON 390 and MATH 441 and MATH 341, or Eng 123, MATH 348, ECON 385 or ECON 450 and CSC 233; MUS 122_1"

a = re.split(r'and|;\s', s)
for i in range (len(a)):
    a[i] = re.findall(r'[A-Z]{3,4}\s\d{3}', a[i])
    

print (a)

[['MATH 331', 'MATH 336', 'ECON 390'], ['MATH 441'], ['MATH 341', 'MATH 348', 'ECON 385', 'ECON 450'], ['CSC 233'], ['MUS 122']]


### 2. Processing Major/Minor data TODO
1. add the field to the data to give it the same attributes as the program objects
2. 

In [1]:
import pandas as pd
import json


def extractCredit(s: str) -> float:
    return 0.0

with open('majorsMinors.json', 'r') as courseF:
    courses = json.load(courseF)['Degrees']
    mDf = pd.DataFrame.from_records(courses)

print(mDf.dtypes)
mDf.fillna('None', inplace=True)
mDf.shape


# s = ['nine', 'one-half', 'nine and one-half', '2']
# print(float('11.5'))
# for each in s:
#     print(w2n.word_to_num(each))

Total courses required                        object
Core courses                                  object
Other required courses                        object
Number 300 and 400 level courses              object
Senior requirement and capstone experience    object
Recent changes in major                       object
Writing in the Major                          object
Major                                           bool
Name                                          object
Additional information                        object
Note                                          object
dtype: object


(100, 11)

In [2]:
with open('updatedCourses.json', 'r') as f:
    courses = json.load(f)['Courses']
    cDf = pd.DataFrame.from_records(courses)
    print(cDf.dtypes)
cDf.loc[:, 'SPAC'].fillna(False, inplace=True)
cDf.loc[:, 'Prerequisites'] = cDf.loc[:, 'Prerequisites'].map(lambda v: 'None' if (v.isspace() or not v) else v)
cDf.shape

Distribution Area    object
Prerequisites        object
Credits              object
url                  object
course               object
SPAC                 object
Required courses     object
dtype: object


(1220, 7)

# Adding the data to Firebase DB

In [None]:
import firebase_admin
from firebase_admin import db
import json

databaseURL = "https://strike-depauw-default-rtdb.firebaseio.com/"
cred_obj = firebase_admin.credentials.Certificate('./firebaseServiceKey.json')
default_app = firebase_admin.initialize_app(cred_obj, {
    'databaseURL': databaseURL
}, name='test')

In [None]:
ref = db.reference('/')

with open("courses.json", 'r') as jsonF:
    content = json.load(jsonF)

ref.set(content) # to add the whole json structure of the database