In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import pickle 

In [124]:
def getAllCours():
    URL_ROOT = 'https://edu.epfl.ch/'
    shs = ['https://edu.epfl.ch/studyplan/fr/bachelor/programme-sciences-humaines-et-sociales/', 'https://edu.epfl.ch/studyplan/fr/master/programme-sciences-humaines-et-sociales/']
    page = requests.get(URL_ROOT)
    soup = BeautifulSoup(page.content, "html.parser")
    cards = soup.findAll("div", class_="card-title")
    annees = [card.find('a').get('href') for card in cards]
    courses = []
    for annee in annees:
        page = requests.get(URL_ROOT + annee)
        soup = BeautifulSoup(page.content, "html.parser")
        sections = [x.get('href') for x in soup.find('main').find('ul').findAll('a')]
        for section in sections:
            page = requests.get(URL_ROOT + section)
            soup = BeautifulSoup(page.content, "html.parser")
            for cours in soup.find('main').findAll('div', class_="cours-name"):
                if cours.find('a') != None:
                    courses.append(cours.find('a').get('href').split('/').pop())
    courses.remove('programme-sciences-humaines-et-sociales')
    for url in shs:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        for cours in soup.findAll("div", class_="cours-name"):
            if cours.find('a') != None:
                courses.append(cours.find('a').get('href').split('/').pop())
                
    return courses

In [121]:
def parseCours(url):
    page = requests.get(url)
    if (page.status_code == 404):
        print(url)
        return
        
    soup = BeautifulSoup(page.content, "html.parser")
    
    title = soup.find('main').find('h1').text
    if (soup.find('div', class_="course-summary") == None):
        print(url)
    code = soup.find('div', class_="course-summary").findAll('p')[0].text.split('/')[0].strip()
    semester = soup.find('div', class_="study-plans").findAll('div', class_="collapse-item")[0].findAll('li')[0].text.split(':')[1].strip()
    credits = int(re.findall(r'\d+', soup.find('div', class_="course-summary").findAll('p')[0].text.split('/')[1])[0])
    teachers = [(x.text, x.get('href')) for x in soup.find('div', class_="course-summary").findAll('p')[1].findAll('a')]
    

    schedule = dict()
    if (soup.find("table", class_="semaineDeRef") != None):
        rows = soup.find("table", class_="semaineDeRef").findAll("tr")
        days = []
        for i, row in enumerate(rows):
            col = row.findAll('td')
            for j, col in enumerate(col):
                if (i == 0):
                    if (j > 0):
                        days.append(col.text)
                else:
                    if (j == 0):
                        time = col.text
                    else:
                        classes = col.get('class')
                        if (classes != None and "taken" in classes):
                            if (col.get('rowspan') == None):
                                duration = 1
                            else:
                                duration = int(col.get('rowspan'))
                            classes.remove('taken')
                            if (len(classes) != 0):
                                label = classes[0]
                                day = days[j-1]
                                rooms = [room.text for room in col.findAll('a')]
                                if (time not in schedule):
                                    schedule[time] = dict()
                                schedule[time][day] = {
                                    'label': label,
                                    'duration': duration,
                                    'rooms': rooms
                                }
    if (semester != 'Printemps' and semester != 'Automne'):
        semester = None
    course = {
        'name': title,
        'code': code,
        'credits': credits,
        'semester': semester,
        'teachers': teachers,
        'schedule': schedule
    }

    return course

In [125]:
courses_url = getAllCours()

In [126]:
URL_ROOT = 'https://edu.epfl.ch/coursebook/fr/'
courses_url = list(set(courses_url))

courses = []
for url in courses_url:
    courses.append(parseCours(URL_ROOT + url))



https://edu.epfl.ch/coursebook/fr/hydrological-risks-and-structures-ENV-524
https://edu.epfl.ch/coursebook/fr/human-computer-interaction-CS-213
https://edu.epfl.ch/coursebook/fr/programme-sciences-humaines-et-sociales


In [127]:
data = list(filter(lambda x: x != None, courses))

with open('./data/data.pkl', 'wb') as f:
        pickle.dump(data, f)

In [110]:
print(data[0])

{'name': 'Discrete optimization', 'code': 'MATH-261', 'credits': 5, 'semester': 'Printemps', 'teachers': [('Pinchasi Rom', 'https://people.epfl.ch/220020?lang=fr')], 'schedule': {'8-9': {'Ve': {'label': 'exercice', 'duration': 2, 'rooms': ['CM1105']}}, '10-11': {'Ve': {'label': 'cours', 'duration': 2, 'rooms': ['CM1105']}}}}


In [83]:
def save_files_entities():
    data = load_file('data')

    # Get list of unique teachers
    teachers = sum([x['teachers'] for x in data], [])
    teachers = list(set(teachers))
    teachers = [{
        'name': x[0],
        'people_url': x[1]
    } for x in teachers]

    # Get list of unique rooms
    rooms = []
    for x in data:
        for time in x['schedule'].values():
            for day in time.values():
                rooms.append(day['rooms'])
    rooms = list(set(sum(rooms, [])))
    rooms = [{ 'name': room } for room in rooms]

    # Get list of unique courses
    codes = []
    courses = []
    for course in data:
        if (course['code'] not in codes):
            courses.append({
                'name': course['name'],
                'code': course['code'],
                'credits': course['credits'],
                'semester': course['semester']
            })
            codes.append(course['code'])

    with open('./data/teachers.pkl', 'wb') as f:
        pickle.dump(teachers, f)
    
    with open('./data/rooms.pkl', 'wb') as f:
        pickle.dump(rooms, f)

    with open('./data/courses.pkl', 'wb') as f:
        pickle.dump(courses, f)

In [84]:
def save_files_relations():
    data = load_file('./data/data.pkl')

    # Get list of unique teach_in
    teach_in = []
    for course in data:
        for teacher in course['teachers']:
            teach_in.append((
                course['code'],
                teacher[0]
            ))
    teach_in = list(set(teach_in))
    
    # Get list of unique booking
    booking = []
    for course in data:
        for time, row in course['schedule'].items():
            for day, creneau in row.items():
                for room in creneau['rooms']:
                    booking.append({
                        'room': room,
                        'course': course['code'],
                        'label': creneau['label'],
                        'duration': creneau['duration'],
                        'time': time,
                        'day': day,
                        'semester': course['semester']
                    })
    
    with open('./data/teach_in.pkl', 'wb') as f:
        pickle.dump(teach_in, f)

    with open('./data/booking.pkl', 'wb') as f:
        pickle.dump(booking, f)

In [177]:
print(booking[:1])

[{'_id': ObjectId('63deb3dfe2f04985631b6cf0'), 'room': ObjectId('63de9d83e2f04985631b3ca1'), 'course': ObjectId('63de9d87e2f04985631b4185'), 'label': 'exercice', 'duration': 2, 'time': '8-9', 'day': 'Ma', 'semester': 'Printemps'}]


In [97]:
def update_db_entities(db):
    teachers = load_file('./data/teachers.pkl')
    rooms = load_file('./data/rooms.pkl')
    courses = load_file('./data/courses.pkl')

    for x in rooms:
        db.rooms.update_one({'name': x['name']}, {"$set": x}, upsert=True)
    for x in teachers:
        db.teachers.update_one({'name': x['name']}, {"$set": x}, upsert=True)
    for x in courses:
        db.courses.update_one({'code': x['code']}, {"$set": x}, upsert=True)

In [102]:
def update_db_relations(db):
    queried_rooms = db.rooms.find()
    queried_teachers = db.teachers.find()
    queried_courses = db.courses.find()

    map_room = dict()
    for room in queried_rooms:
        map_room[room['name']] = room['_id']

    map_teacher = dict()
    for teacher in queried_teachers:
        map_teacher[teacher['name']] = teacher['_id'] 

    map_course = dict()
    for course in queried_courses:
        map_course[course['code']] = course['_id']

    # Load the relations
    teach_in = load_file('./data/teach_in.pkl')
    booking = load_file('./data/booking.pkl')

    # Map the DB ids
    teach_in = [{
        'teacher': map_teacher[x[1]],
        'course': map_course[x[0]]
    } for x in teach_in]

    booking = [{
        **x,
        'room': map_room[x['room']],
        'course': map_course[x['course']],
    } for x in booking]

    # Insert in DB

    db.teach_in.drop()
    db.booking.drop()

    db.teach_in.insert_many(teach_in)
    db.booking.insert_many(booking)

    

In [87]:
def load_file(name):
    with open(name, 'rb') as handle:
        data = pickle.load(handle)
    return data

In [88]:
def DB_indices(db):
    db.rooms.create_index([("name", pymongo.ASCENDING)], name="room_name", unique=True)
    db.teachers.create_index([("name", pymongo.ASCENDING)], name="teacher_unique", unique=True)
    db.courses.create_index([("code", pymongo.ASCENDING)], name="course_unique", unique=True)
    db.teach_in.create_index([("teacher", pymongo.ASCENDING), ("course", pymongo.ASCENDING)], name="teach_in_unique", unique=True)
    db.booking.create_index([("room", pymongo.ASCENDING), ("time", pymongo.ASCENDING), ("day", pymongo.ASCENDING), ("semester", pymongo.ASCENDING)], name="booking_unique", unique=True)

In [89]:
import pymongo
from pymongo import MongoClient
import config

client = MongoClient(f"mongodb+srv://{config.DB_USER}:{config.DB_PASSWORD}@{config.DB_URL}/?retryWrites=true&w=majority")

db = client[config.DB_NAME]

DB_indices(db)

In [100]:
save_files_entities()
update_db_entities(db)

In [128]:
save_files_relations()
update_db_relations(db)