# CAO Points

In [1]:
# For writing well-formed CSV files
import csv

# For accessing dates and times
import datetime as dt

# For regular expressions
import re

# Accessing and downloading using urls
import urllib.request as urlrq

# For dataframes
import pandas as pd

# For HTTP requests
import requests as rq

In [2]:
# conda install -c conda-forge tabula-py 
# commented out the installation for now.
# For reading pdfs 
import tabula

### Datetime

In [3]:
"""Using datetime to create a variable that will refer to a string stating the current time.
This will be used throughout this notebook to save files with the current time in the filename."""

# Take a single reading of the local clock for this run.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS (e.g. 20211109_211250); this string is appended
# to the name of every file the notebook saves.
current_time = f'{now:%Y%m%d_%H%M%S}'

## 2021 Level 8 Points

In [4]:
# Download the 2021 level 8 points page from the CAO website.
# The timeout stops the notebook hanging indefinitely if the server never
# answers, and raise_for_status() turns an HTTP error status into an exception
# here instead of letting an error page be saved and parsed further down.
resp2021_l8 = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
resp2021_l8.raise_for_status()

In [5]:
# Show the response object; its repr includes the HTTP status code.
resp2021_l8

<Response [200]>

In [6]:
"""Have to change the encoding as the following error is returned:
'charmap' codec can't encode character '\x96' in position 25767: character maps to <undefined>"""

# The server uses the wrong encoding.
# Keep a record of the encoding attached to the response before overriding it.
# NOTE(review): original_encoding is not read again in the visible part of the notebook.
original_encoding = resp2021_l8.encoding

# Change to cp1252, which recognises the '\x96' character.
# From here on resp2021_l8.text is decoded using this encoding.
resp2021_l8.encoding = 'cp1252'

In [7]:
# Where the untouched level 8 HTML download is stored, stamped with this run's time.
path2021_l8_html = f'cao-data/cao2021_level8_{current_time}.html'

In [8]:
# Save the original page exactly as the server sent it.
# Writing the raw bytes in binary mode avoids re-encoding the text with the
# platform's default codec (the cause of the 'charmap' error) and keeps the
# archived copy byte-for-byte identical to the download.
with open(path2021_l8_html, 'wb') as f:
    f.write(resp2021_l8.content)

In [9]:
# re adapted from:https://github.com/ianmcloughlin/cao-points/blob/main/cao-points-analysis.ipynb

# Using regular expression to extract the lines of data we want.
# Group 1 is two capital letters followed by three digits (the course code);
# group 2 captures whatever follows it on the line.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')

In [10]:
# Code adapted from: https://github.com/ianmcloughlin/cao-points/blob/main/cao-points-analysis.ipynb

# The file path for the csv file.
path2021_l8 = 'cao-data/cao2021_level8_csv_' + current_time + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
# encoding='cp1252' matches both the decoding of each line below and the
# encoding used when this file is read back with pd.read_csv, so non-ASCII
# characters survive the round trip on every platform.
# newline='' is what the csv module requires of files it writes to.
with open(path2021_l8, 'w', encoding='cp1252', newline='') as f:
    # csv.writer quotes any field that contains a comma, so a comma inside a
    # course title can no longer shift the columns.
    writer = csv.writer(f)
    # Write a header row.
    writer.writerow(['code', 'title', 'pointsR1', 'pointsR2'])
    # Loop through lines of the response.
    for line in resp2021_l8.iter_lines():
        # Decode the raw bytes of the line as cp1252.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # The course code occupies the first five characters.
            course_code = dline[:5]
            # The course title sits in the fixed-width slice 7..56.
            course_title = dline[7:57].strip()
            # Everything from column 60 on holds the points, separated by runs of spaces.
            course_points = re.split(' +', dline[60:])
            # Always end up with exactly two entries: pad with empty strings when
            # the line is short (previously an IndexError) and drop any extras.
            course_points = (course_points + ['', ''])[:2]
            # Assemble the row: code, title, round one points, round two points.
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
            writer.writerow(linesplit)

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 949.


In [11]:
# Load the 2021 level 8 data to a pandas dataframe
# NOTE(review): the encoding given here has to match the one the csv file was written with.
df2021_l8 = pd.read_csv(path2021_l8, encoding='cp1252')

In [12]:
# Display the dataframe to check the code, title and points columns.
df2021_l8

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL801,Software Design for Virtual Reality and Gaming,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructure,321,
4,AL810,Quantity Surveying,328,
...,...,...,...,...
944,WD211,Creative Computing,270,
945,WD212,Recreation and Sport Management,262,
946,WD230,Mechanical and Manufacturing Engineering,230,230
947,WD231,Early Childhood Care and Education,266,


## 2021 Level 7/6 Points

In [13]:
# Download the 2021 level 7/6 points page from the CAO website.
# The timeout stops the notebook hanging indefinitely if the server never
# answers, and raise_for_status() turns an HTTP error status into an exception
# here instead of letting an error page be saved and parsed further down.
resp2021_l76 = rq.get('http://www2.cao.ie/points/l76.php', timeout=30)
resp2021_l76.raise_for_status()

In [14]:
# Show the response object; its repr includes the HTTP status code.
resp2021_l76

<Response [200]>

In [15]:
# Where the untouched level 7/6 HTML download is stored, stamped with this run's time.
path2021_l76_html = f'cao-data/cao2021_level7_6_{current_time}.html'

In [16]:
# Save the original page exactly as the server sent it.
# Writing the raw bytes in binary mode avoids decoding the page with whatever
# encoding was attached to the response and re-encoding it with the platform's
# default codec; the archived copy is byte-for-byte identical to the download.
with open(path2021_l76_html, 'wb') as f:
    f.write(resp2021_l76.content)

In [17]:
# The file path for the csv file.
path2021_l76 = 'cao-data/cao2021_level7_6_csv_' + current_time + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
# encoding='cp1252' matches both the decoding of each line below and the
# encoding used when this file is read back with pd.read_csv, so non-ASCII
# characters survive the round trip on every platform.
# newline='' is what the csv module requires of files it writes to.
with open(path2021_l76, 'w', encoding='cp1252', newline='') as f:
    # csv.writer quotes any field that contains a comma, so a comma inside a
    # course title can no longer shift the columns.
    writer = csv.writer(f)
    # Write a header row.
    writer.writerow(['code', 'title', 'pointsR1', 'pointsR2'])
    # Loop through lines of the response.
    for line in resp2021_l76.iter_lines():
        # Decode the raw bytes of the line as cp1252.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # The course code occupies the first five characters.
            course_code = dline[:5]
            # The course title sits in the fixed-width slice 7..56.
            course_title = dline[7:57].strip()
            # Everything from column 60 on holds the points, separated by runs of spaces.
            course_points = re.split(' +', dline[60:])
            # Always end up with exactly two entries: pad with empty strings when
            # the line is short (previously an IndexError) and drop any extras.
            course_points = (course_points + ['', ''])[:2]
            # Assemble the row: code, title, round one points, round two points.
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
            writer.writerow(linesplit)

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 416.


In [18]:
# Load the 2021 level 7 and level 6 data to a pandas dataframe
# NOTE(review): the encoding given here has to match the one the csv file was written with.
df2021_l76 = pd.read_csv(path2021_l76, encoding='cp1252')

In [19]:
# Display the dataframe to check the code, title and points columns.
df2021_l76

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL605,Music and Instrument Technology,211,
1,AL630,Pharmacy Technician,308,
2,AL631,Dental Nursing,311,
3,AL632,Applied Science,297,
4,AL650,Business,AQA,AQA
...,...,...,...,...
411,WD188,Applied Health Care,220,
412,WD205,Molecular Biology with Biopharmaceutical Science,AQA,262v
413,WD206,Electronic Engineering,180,
414,WD207,Mechanical Engineering,172,


## 2020 Level 8/7/6 Points

In [20]:
# The points for levels 8, 7, and 6 were included in one spreadsheet on the CAO website
# Address of that 2020 spreadsheet (.xlsx).
url2020 = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

In [21]:
# Where the untouched 2020 spreadsheet is stored, stamped with this run's time.
path2020_xlsx = f'cao-data/cao2020_{current_time}.xlsx'

In [22]:
# Download the 2020 spreadsheet to disk as the original copy of the data.
# urlretrieve returns a (local filename, headers) tuple, shown as the cell output.
urlrq.urlretrieve(url2020, path2020_xlsx)

('cao-data/cao2020_20211109_211250.xlsx',
 <http.client.HTTPMessage at 0x2c3fd9bc2e0>)

In [23]:
# Load the 2020 data to a pandas dataframe.
# The copy already saved to disk is read instead of the URL: this avoids a
# second download and guarantees the dataframe comes from the archived file.
# skiprows=10 skips the first ten rows of the sheet, so the row after them
# supplies the column names.
df2020 = pd.read_excel(path2020_xlsx, skiprows=10)

In [24]:
# Display the dataframe to check the columns read from the spreadsheet.
df2020

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [25]:
# Where the 2020 dataframe is written as csv, stamped with this run's time.
path2020 = f'cao-data/cao2020_{current_time}.csv'

In [26]:
# Save pandas data frame to disk.
# With no index argument, to_csv also writes the row index as the first column.
df2020.to_csv(path2020)

## 2019 Level 8 Points

In [27]:
# Address of the 2019 level 8 points pdf on the CAO website.
# Nothing is requested on this line; the download happens further down.
url2019_l8 = 'http://www2.cao.ie/points/lvl8_19.pdf'

In [28]:
# Where the untouched 2019 level 8 pdf is stored, stamped with this run's time.
path2019_pdf = f'cao-data/cao2019_l8_{current_time}.pdf'

In [29]:
# Save pdf to disk as the original copy of the data.
# urlretrieve returns a (local filename, headers) tuple, shown as the cell output.
urlrq.urlretrieve(url2019_l8, path2019_pdf)

('cao-data/cao2019_l8_20211109_211250.pdf',
 <http.client.HTTPMessage at 0x2c3fe129b50>)

In [30]:
# Read the pdf into pandas using tabula.
# tabula creates a new dataframe for each page of the pdf, so concat is used
# to stitch the pages together.
# header=None is passed through pandas_options because tabula otherwise
# assigns the first row on each new page as the header.
# The copy already saved to disk is read instead of the URL: this avoids a
# second download and guarantees the dataframe comes from the archived file.
df2019_l8_load = pd.concat(tabula.read_pdf(path2019_pdf,
                                           pages='all', pandas_options={'header': None}))

In [31]:
# Display the concatenated table; the first row still holds the column headings.
df2019_l8_load

Unnamed: 0,0,1,2,3
0,Course Code,INSTITUTION and COURSE,EOS,Mid
1,,Athlone Institute of Technology,,
2,AL801,Software Design with Virtual Reality and Gaming,304,328
3,AL802,Software Design with Cloud Computing,301,306
4,AL803,Software Design with Mobile Apps and Connected...,309,337
...,...,...,...,...
37,WD200,Arts (options),221,296.0
38,WD210,Software Systems Development,271,329.0
39,WD211,Creative Computing,275,322.0
40,WD212,Recreation and Sport Management,274,311.0


In [32]:
# Promote the first row of the loaded table to be the column headings.
# https://www.codegrepper.com/code-examples/python/frameworks/django/change+header+to+first+row+pandas
header_row = df2019_l8_load.iloc[0]
df2019_l8_load.columns = header_row

# Everything after that first row is kept as the data.
df2019_l8 = df2019_l8_load[1:]

In [33]:
# Display the table now that the headings are in place.
df2019_l8

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid
1,,Athlone Institute of Technology,,
2,AL801,Software Design with Virtual Reality and Gaming,304,328
3,AL802,Software Design with Cloud Computing,301,306
4,AL803,Software Design with Mobile Apps and Connected...,309,337
5,AL805,Network Management and Cloud Infrastructure,329,442
...,...,...,...,...
37,WD200,Arts (options),221,296.0
38,WD210,Software Systems Development,271,329.0
39,WD211,Creative Computing,275,322.0
40,WD212,Recreation and Sport Management,274,311.0


In [34]:
# tabula numbered the rows afresh on every page of the pdf, so build one
# continuous index; the old per-page numbers are kept in an 'index' column.
df2019_l8 = df2019_l8.reset_index(drop=False)

In [35]:
# Keep only the rows that have a value in the Course Code column
# (rows where it is NaN are discarded).
has_course_code = df2019_l8['Course Code'].notna()
df2019_l8 = df2019_l8.loc[has_course_code]

In [36]:
# Display the table after rows without a course code have been removed.
df2019_l8

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
1,2,AL801,Software Design with Virtual Reality and Gaming,304,328
2,3,AL802,Software Design with Cloud Computing,301,306
3,4,AL803,Software Design with Mobile Apps and Connected...,309,337
4,5,AL805,Network Management and Cloud Infrastructure,329,442
5,6,AL810,Quantity Surveying,307,349
...,...,...,...,...,...
960,37,WD200,Arts (options),221,296.0
961,38,WD210,Software Systems Development,271,329.0
962,39,WD211,Creative Computing,275,322.0
963,40,WD212,Recreation and Sport Management,274,311.0


In [37]:
# Where the 2019 level 8 dataframe is written as csv, stamped with this run's time.
path2019_l8 = f'cao-data/cao2019_l8_{current_time}.csv'

In [38]:
# Save pandas data frame to disk.
# With no index argument, to_csv also writes the row index as the first column.
df2019_l8.to_csv(path2019_l8)

## 2019 Level 7/6 Points

## References