In [1]:
import json
from os import makedirs, path
from copy import deepcopy
from requests import request
from pprint import pprint
from bs4 import BeautifulSoup
import uuid

In [2]:
def clean_string(input_string, mode = None):
    """
    Utility function to clean the input string.

    The string is stripped first, then every filter below is applied in
    order. The filters are chosen by hand when scraping the values.
    NOTE: Only add filters, don't remove them

    :param input_string: String to be cleaned; ``None`` (what BeautifulSoup's
        ``.string`` yields for tags without a single text child) is treated
        as an empty string
    :param mode: When equal to 'prof', non-breaking spaces are turned into
        regular spaces instead of being removed
    :return: Cleaned string
    """
    if input_string is None:
        return ""

    # Only the non-breaking-space filter depends on the mode.
    nbsp_replacement = " " if mode == 'prof' else ""

    # Order matters: the combined "\xa0" + garbled sequence must be removed
    # before the shorter patterns that are contained in it.
    filters = (
        (u"\xa0â€‹", ""),
        (u"â€‹", ""),
        (u"\n", ""),
        (u"\t", ""),
        (u"\xa0", nbsp_replacement),
        (u"\u200b", ""),
        (u"\u00e0", "à"),
    )

    cleaned = input_string.strip()
    for unwanted, replacement in filters:
        cleaned = cleaned.replace(unwanted, replacement)
    return cleaned


def initialize_dataset():
    """
    Build the empty skeleton used to initialize a new dataset.

    A new dictionary (with a new ``data`` list) is created on every call.

    :return: Initial dictionary for any dataset
    """
    skeleton = dict(total=0, size=0, language="en", data=[])
    return {"value": skeleton}


def append_data(dataset, to_append):
    """
    Add one entry to the dataset's data list (in place).

    Small helper that hides the nested lookup and keeps callers readable.

    :param dataset: Dataset to append data to
    :param to_append: Data to append
    """
    records = dataset['value']['data']
    records.append(to_append)


def save_dataset(dataset, name, file_format):
    """
    Save the dataset given in input to ``../datasets/<name>.<file_format>``

    The target directory is created when it does not exist yet. Only the
    'json' format is currently supported.

    :param dataset: Dataset to save
    :param name: Name of the dataset
    :param file_format: Format the dataset should be saved in (only 'json')
    :raises ValueError: If ``file_format`` is not a supported format
    """
    if file_format != 'json':
        # Fail before touching the disk: opening the file first would create
        # (or truncate) it and leave it empty for an unsupported format.
        raise ValueError(f"Unsupported file format: {file_format!r}")

    filename = f'../datasets/{name}.{file_format}'
    makedirs(path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2)


def set_total_size(dataset):
    """
    Update the 'total' and 'size' fields so both equal the number of
    entries currently stored in the dataset's data list.

    :param dataset: Dataset to manipulate
    """
    content = dataset['value']
    entry_count = len(content['data'])
    content['total'] = entry_count
    content['size'] = entry_count


In [3]:
def scrape_esse3(url):
    """
    Scrape the given URL and extract all the needed information

    The page is downloaded and parsed; the course summary is read from the
    ``dd`` elements, the teaching units from the table with id ``table1`` and
    the partitions from the table with id ``table2`` (when present). A new
    random id is generated for the course and for every teaching unit and
    partition on each call.

    :param url: Url to scrape
    :return: Tuple ``(information, partitions, teaching_units)`` where
        ``information`` is a dictionary describing the course and the other
        two are lists of dictionaries. When the page is the "Errore" page,
        ``information`` holds 'NA' placeholders and both lists are empty.
        When the page cannot be downloaded, ``({}, [], [])`` is returned.
    """
    # The page is downloaded here. To work from a local copy instead, comment
    # out the request and use the commented `open(...)` lines further down.
    res = request('get', url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:106.0) Gecko/20100101 Firefox/106.0', "Accept-Language": "en-US,en;q=0.5"})
    # with open('prova.html', 'w', encoding='utf-8') as f:
    #     f.write(res.text)
    if res.ok:
        soup = BeautifulSoup(res.text, 'lxml')
        # with open('prova3.html', "r") as f:
        #     page = f.read()
        # soup = BeautifulSoup(page, 'lxml')
        course_id = uuid.uuid4().hex
        if soup.find(id='header2').find('h2').string == "Errore":
            # Error page: nothing can be scraped, so return placeholders.
            information = {
                'id': course_id,
                'year': 'NA',
                'typeCourse': 'NA',
                'credits': 'NA',
                'lessonType': 'NA',
                'examType': 'NA',
                'evaluationType': 'NA',
                'lessonPeriod': 'NA'
            }
            # Both lists must exist for the final return statement; without
            # them this branch ended in an UnboundLocalError.
            partitions = []
            teaching_units = []
        else:
            table_values = soup.find_all('dd')
            teaching_units_html = soup.find(
                id="table1").find('tbody').find_all("tr")
            teaching_units = []
            for i in range(len(teaching_units_html)):
                unit = teaching_units_html[i].find_all("td")
                unitId = uuid.uuid4().hex
                if len(unit) < 6:
                    # Probably an error in the input, see https://www.esse3.unitn.it/Guide/PaginaADContest.do?ad_cont_id=10661*94459*2022*2017*9999
                    # The name cell is missing in this row, so the name is
                    # taken from the previous row and the remaining cells are
                    # read one position to the left.
                    previous_unit = teaching_units_html[i-1].find_all("td")
                    unit_information = {
                        'courseId': course_id,
                        'name': clean_string(previous_unit[0].string),
                        'activityType': clean_string(unit[0].string) if unit[0].string else '',
                        'durationHours': clean_string(unit[1].string) if unit[1].string else '',
                        'typeTeaching': clean_string(unit[2].string) if unit[2].string else '',
                        'subjectArea': clean_string(unit[3].string) if unit[3].string else '',
                        'credits': clean_string(unit[4].string) if unit[4].string else '',
                        'id': unitId
                    }
                else:
                    unit_information = {
                        'courseId': course_id,
                        'name': clean_string(unit[0].string),
                        'activityType': clean_string(unit[1].string) if unit[1].string else '',
                        'durationHours': clean_string(unit[2].string) if unit[2].string else '',
                        'typeTeaching': clean_string(unit[3].string) if unit[3].string else '',
                        'subjectArea': clean_string(unit[4].string) if unit[4].string else '',
                        'credits': clean_string(unit[5].string) if unit[5].string else '',
                        'id': unitId
                    }
                teaching_units.append(unit_information)

            # The partitions table is optional.
            partitions_html = soup.find(id="table2").find(
                'tbody').find_all("tr") if soup.find(id="table2") else None
            partitions = []
            # Rowspan values read from the partition cell and the syllabus
            # cell of the last full row; they are decremented while the rows
            # covered by those cells are consumed.
            last_rowspan = {
                'partition': 1,
                'syllabus': 1
            }
            append_new = True
            # Tracks the rows with a non-empty teacher name: how many there
            # are and the values of the most recent one.
            professor = {
                'count': 0,
                'name': '',
                'tenured': True
            }
            if partitions_html:
                for i in range(len(partitions_html)):
                    partition = partitions_html[i].find_all("td")
                    partitionId = uuid.uuid4().hex
                    if list(last_rowspan.values()) == [1, 1]:
                        # No cell of a previous row is still spanning: the
                        # row is read with all its columns in place.
                        partition_information = {
                            'name': clean_string(partition[0].string),
                            'period': clean_string(partition[1].string),
                            'teacher': {'name': [clean_string(partition[2].string, 'prof') if partition[2].string else ''], 'tenured': [True if partition[3].find('img') else False]},
                            'syllabusLink': 'https://www.esse3.unitn.it/'+partition[4].find('a')['href'] if partition[4].find('a') else '' if partition[4] else '',
                        }
                        last_rowspan = {
                            'partition': int(partition[0]['rowspan']),
                            'syllabus': int(partition[4]['rowspan'])
                        }
                        if partition[2].string and clean_string(partition[2].string, 'prof') != '':
                            professor['count'] += 1
                            professor['name'] = clean_string(
                                partition[2].string, 'prof')
                            professor['tenured'] = True if partition[3].find(
                                'img') else False
                    else:
                        if last_rowspan['partition'] <= 1:
                            # The partition cell is not spanning any more:
                            # this row starts a new partition.
                            partition_information = {
                                'name': clean_string(partition[0].string),
                                'period': clean_string(partition[1].string),
                                'teacher': {'name': [clean_string(partition[2].string, 'prof') if partition[2].string else ''], 'tenured': [True if partition[3].find('img') else False]},
                                'syllabusLink': '',
                            }
                            append_new = True
                            if partition[2].string and clean_string(partition[2].string) != '':
                                professor['count'] += 1
                                professor['name'] = clean_string(
                                    partition[2].string, 'prof')
                                professor['tenured'] = True if partition[3].find(
                                    'img') else False
                        else:
                            # Continuation row of the previous partition: the
                            # teacher is read from the first cell and the
                            # tenured marker from the second, and both are
                            # added to the last stored partition.
                            last_rowspan['partition'] -= 1
                            partitions[-1]['teacher']['name'].append(clean_string(
                                partition[0].string, 'prof') if partition[0].string else '')
                            partitions[-1]['teacher']['tenured'].append(
                                True if partition[1].find('img') else False)
                            append_new = False
                            if partition[0].string and clean_string(partition[0].string) != '':
                                professor['count'] += 1
                                professor['name'] = clean_string(
                                    partition[0].string, 'prof')
                                professor['tenured'] = True if partition[1].find(
                                    'img') else False

                        # NOTE(review): here the syllabus link is taken from
                        # the text of the cell, while the full-row branch
                        # above builds it from the href of the anchor.
                        # Confirm which one is intended.
                        if last_rowspan['syllabus'] <= 1:
                            partition_information['syllabusLink'] = clean_string(partition[4].string) if len(
                                partition) >= 5 and partition[4] else partition_information['syllabusLink']
                        else:
                            last_rowspan['syllabus'] -= 1
                            if partitions[-1]['syllabusLink'] == '':
                                partitions[-1]['syllabusLink'] = clean_string(partition[2].string) if len(
                                    partition) >= 3 and partition[2] else partition_information['syllabusLink']
                    if append_new:
                        partition_information['courseId'] = course_id
                        partition_information['id'] = partitionId
                        partitions.append(partition_information)
                # When exactly one row carried a teacher name, that teacher
                # is assigned to every partition.
                if professor['count'] == 1:
                    for partition in partitions:
                        partition['teacher']['name'] = [professor['name']]
                        partition['teacher']['tenured'] = [
                            professor['tenured']]

            # The year is read from the first `dd` element. With a comma in
            # the text, the first character of the first part and the second
            # character of the second part are joined with '&' (presumably
            # the text looks like "1, 2" - verify against the page).
            course_year = '0'
            year_tmp = table_values[0].contents[0].string.split(',')
            if len(year_tmp) > 1:
                course_year = year_tmp[0][0] + '&' + year_tmp[1][1]
            elif table_values[0].contents[0].string[0] in {'1', '2', '3', '4', '5'}:
                course_year = table_values[0].contents[0].string[0]
            else:
                course_year = clean_string(table_values[0].contents[0].string)
            information = {
                'id': course_id,
                'year': course_year,
                'typeCourse': clean_string(table_values[1].contents[0]),
                'credits': clean_string(table_values[2].contents[0].string.split(" ")[0]),
                'lessonType': clean_string(soup.find("desc_tipo_att").contents[0].string if soup.find("desc_tipo_att") else ''),
                'examType': clean_string(table_values[4].contents[0].string),
                'evaluationType': clean_string(table_values[5].contents[0].string),
                'lessonPeriod': clean_string(table_values[6].contents[0].string)
            }
    else:
        # add better error control with an exception
        print(f"Cannot download the page from {url}.")
        return {}, [], []

    # with open('prova_esse3.json', 'w', encoding='utf-8') as f:
    #     json.dump(information, f)
    # pprint(information)
    return information, partitions, teaching_units


# Scrape one sample course page and show the extracted course information.
information, partitions, teaching_units = scrape_esse3(
    'https://www.esse3.unitn.it/Guide/PaginaADContest.do'
    '?ad_cont_id=10117*87830*2022*2011*10000&cod_lingua=en'
)
pprint(information)


{'credits': '6',
 'evaluationType': 'Voto Finale',
 'examType': 'Orale',
 'id': 'f3e4f074990545bb83f89e0b4d6c841d',
 'lessonPeriod': 'Primo Semestre',
 'lessonType': 'Lezioni',
 'typeCourse': 'Caratterizzante',
 'year': '1&2'}
