# CAO Points

In [1]:
# For dataframes
import pandas as pd

# For HTTP requests
import requests as rq

# For regular expressions
import re

# For accessing dates and times
import datetime as dt

# Accessing and downloading using urls
import urllib.request as urlrq

In [2]:
# conda install -c conda-forge tabula-py 
# commented out the installation for now.
# For reading pdfs 
import tabula

### Datetime

In [3]:
"""Using datetime to create a variable that will refer to a string stating the current time.
This will be used throughout this notebook to save files with the current time in the filename."""

# Access the current date and time.
now = dt.datetime.now()

# Formatting the current date and time as a string and saving to a variable.
current_time = now.strftime('%Y%m%d_%H%M%S')

In [4]:
# Make dataframe scrollable to eyeball any obvious errors in dataframes
pd.set_option("display.max_rows", None)

## 2021 Level 8 Points

In [5]:
resp2021_l8 = rq.get('http://www2.cao.ie/points/l8.php')

In [6]:
resp2021_l8

<Response [200]>

In [7]:
"""Have to change the encoding as the following error is returned:
'charmap' codec can't encode character '\x96' in position 25767: character maps to <undefined>"""

# The server uses the wrong encoding.
original_encoding = resp2021_l8.encoding

# Change to cp1252, which recognises the '\x96' character.
resp2021_l8.encoding = 'cp1252'

In [8]:
# Create a file path for the original data.
path2021_l8_html = 'cao-data/cao2021_level8_' + current_time + '.html'

In [9]:
# Save the original html file.
with open(path2021_l8_html, 'w') as f:
    f.write(resp2021_l8.text)

In [10]:
# re adapted from:https://github.com/ianmcloughlin/cao-points/blob/main/cao-points-analysis.ipynb

# Using regular expression to extract the lines of data we want.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')

In [11]:
# Code adapted from: https://github.com/ianmcloughlin/cao-points/blob/main/cao-points-analysis.ipynb


# The file path for the csv file.
path2021_l8 = 'cao-data/cao2021_level8_csv_' + current_time + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
with open(path2021_l8, 'w') as f:
    # Write a header row.
    f.write(','.join(['code', 'title', 'pointsR1', 'pointsR2']) + '\n')
    # Loop through lines of the response.
    for line in resp2021_l8.iter_lines():
        # Decode the line, using the wrong encoding!
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # The course code.
            course_code = dline[:5]
            # The course title.
            course_title = dline[7:57].strip()
            # Round one points.
            course_points = re.split(' +', dline[60:])
            if len(course_points) != 2:
                course_points = course_points[:2]
            # Join the fields using a comma.
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
            # Rejoin the substrings with commas in between.
            f.write(','.join(linesplit) + '\n')

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 949.


In [12]:
# Load the 2021 level 8 data to a pandas dataframe
df2021_l8 = pd.read_csv(path2021_l8, encoding='cp1252')

In [13]:
df2021_l8

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL801,Software Design for Virtual Reality and Gaming,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructure,321,
4,AL810,Quantity Surveying,328,
5,AL811,Civil Engineering,,
6,AL820,Mechanical and Polymer Engineering,327,
7,AL830,General Nursing,451*,444
8,AL832,Mental Health Nursing,440*,431
9,AL835,Pharmacology,356,


## 2021 Level 7/6 Points

In [14]:
resp2021_l76 = rq.get('http://www2.cao.ie/points/l76.php')

In [15]:
resp2021_l76

<Response [200]>

In [16]:
# Create a file path for the original data.
path2021_l76_html = 'cao-data/cao2021_level7_6_' + current_time + '.html'

In [17]:
# Save the original html file.
with open(path2021_l76_html, 'w') as f:
    f.write(resp2021_l76.text)

In [18]:
# The file path for the csv file.
path2021_l76 = 'cao-data/cao2021_level7_6_csv_' + current_time + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
with open(path2021_l76, 'w') as f:
    # Write a header row.
    f.write(','.join(['code', 'title', 'pointsR1', 'pointsR2']) + '\n')
    # Loop through lines of the response.
    for line in resp2021_l76.iter_lines():
        # Decode the line, using the wrong encoding!
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # The course code.
            course_code = dline[:5]
            # The course title.
            course_title = dline[7:57].strip()
            # Round one points.
            course_points = re.split(' +', dline[60:])
            if len(course_points) != 2:
                course_points = course_points[:2]
            # Join the fields using a comma.
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
            # Rejoin the substrings with commas in between.
            f.write(','.join(linesplit) + '\n')

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 416.


In [19]:
# Load the 2021 level 7 and level 6 data to a pandas dataframe
df2021_l76 = pd.read_csv(path2021_l76, encoding='cp1252')

In [20]:
df2021_l76

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL605,Music and Instrument Technology,211,
1,AL630,Pharmacy Technician,308,
2,AL631,Dental Nursing,311,
3,AL632,Applied Science,297,
4,AL650,Business,AQA,AQA
5,AL660,Culinary Arts,AQA,AQA
6,AL661,Bar Supervision,AQA,
7,AL663,Business (Sport and Recreation),AQA,AQA
8,AL701,Computer Engineering for Network Infrastructure,207,
9,AL702,Software Design in Artificial Intelligence for...,220,


## 2020 Level 8/7/6 Points

In [21]:
# The points for levels 8, 7, and 6 were included in one spreadsheet on the CAO website
url2020 = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

In [22]:
# Create a file path for the original data.
path2020_xlsx = 'cao-data/cao2020_' + current_time + '.xlsx'

In [23]:
urlrq.urlretrieve(url2020, path2020_xlsx)

('cao-data/cao2020_20211113_145558.xlsx',
 <http.client.HTTPMessage at 0x254f5882100>)

In [24]:
# Load the 2020 data to a pandas dataframe
df2020 = pd.read_excel(url2020, skiprows=10)

In [25]:
df2020

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
5,Education,Education & Design or Fine Art (Second Level T...,AD202,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
6,Arts,Fine Art (portfolio),AD204,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
7,Arts,Fashion Design (portfolio),AD211,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
8,Arts,Product Design (portfolio),AD212,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
9,Arts,Visual Culture,AD215,377,,320,,320,,389,...,,,,,,,,,,


In [26]:
# Create a file path for the pandas data.
path2020 = 'cao-data/cao2020_' + current_time + '.csv'

In [27]:
# Save pandas data frame to disk.
df2020.to_csv(path2020)

## 2019 Level 8 Points

In [28]:
# Request CAO website for 2019 level 8 url
url2019_l8 = 'http://www2.cao.ie/points/lvl8_19.pdf'

In [29]:
# Create a file path for the original data.
path2019_l8_pdf = 'cao-data/cao2019_l8_' + current_time + '.pdf'

In [30]:
# Save pdf to disk
urlrq.urlretrieve(url2019_l8, path2019_l8_pdf)

('cao-data/cao2019_l8_20211113_145558.pdf',
 <http.client.HTTPMessage at 0x254f56e4220>)

In [31]:
# Read pdf into panda using tabula
# Using concat as tabula creates a new dataframe for each page of the pdf
# Takes pandas options to remove header as tabula assigns first row on each new page as the header 
df2019_l8_load = pd.concat(tabula.read_pdf("http://www2.cao.ie/points/lvl8_19.pdf", 
                                      pages = 'all', pandas_options={'header': None}))

In [32]:
# Have a look at the imported data
df2019_l8_load

Unnamed: 0,0,1,2,3
0,Course Code,INSTITUTION and COURSE,EOS,Mid
1,,Athlone Institute of Technology,,
2,AL801,Software Design with Virtual Reality and Gaming,304,328
3,AL802,Software Design with Cloud Computing,301,306
4,AL803,Software Design with Mobile Apps and Connected...,309,337
5,AL805,Network Management and Cloud Infrastructure,329,442
6,AL810,Quantity Surveying,307,349
7,AL820,Mechanical and Polymer Engineering,300,358
8,AL830,General Nursing,410,429
9,AL832,Psychiatric Nursing,387,403


In [33]:
# Set the first row of new dataframe as header
# https://www.codegrepper.com/code-examples/python/frameworks/django/change+header+to+first+row+pandas
df2019_l8_load.columns = df2019_l8_load.iloc[0]
df2019_l8 = df2019_l8_load[1:]

In [34]:
df2019_l8

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid
1,,Athlone Institute of Technology,,
2,AL801,Software Design with Virtual Reality and Gaming,304,328
3,AL802,Software Design with Cloud Computing,301,306
4,AL803,Software Design with Mobile Apps and Connected...,309,337
5,AL805,Network Management and Cloud Infrastructure,329,442
6,AL810,Quantity Surveying,307,349
7,AL820,Mechanical and Polymer Engineering,300,358
8,AL830,General Nursing,410,429
9,AL832,Psychiatric Nursing,387,403
10,AL836,Nutrition and Health Science,352,383


In [35]:
# Reset index as tabula had a new index for each page of the pdf
df2019_l8 = df2019_l8.reset_index()

In [36]:
# Delete rows that contain NaN in the Course Code column
df2019_l8 = df2019_l8.dropna(subset = ['Course Code'])

In [37]:
df2019_l8

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
1,2,AL801,Software Design with Virtual Reality and Gaming,304,328
2,3,AL802,Software Design with Cloud Computing,301,306
3,4,AL803,Software Design with Mobile Apps and Connected...,309,337
4,5,AL805,Network Management and Cloud Infrastructure,329,442
5,6,AL810,Quantity Surveying,307,349
6,7,AL820,Mechanical and Polymer Engineering,300,358
7,8,AL830,General Nursing,410,429
8,9,AL832,Psychiatric Nursing,387,403
9,10,AL836,Nutrition and Health Science,352,383
10,11,AL837,Sports Science with Exercise Physiology,351,392


In [38]:
# Create a file path for the pandas data.
path2019_l8 = 'cao-data/cao2019_l8_' + current_time + '.csv'

In [39]:
# Save pandas data frame to disk.
df2019_l8.to_csv(path2019_l8)

## 2019 Level 7/6 Points

In [40]:
# Request CAO website for 2019 level 7 and 6 url
url2019_l76 = 'http://www2.cao.ie/points/lvl76_19.pdf'

In [41]:
# Create a file path for the original data.
path2019_l76_pdf = 'cao-data/cao2019_l76_' + current_time + '.pdf'

In [42]:
# Save pdf to disk
urlrq.urlretrieve(url2019_l76, path2019_l76_pdf)

('cao-data/cao2019_l76_20211113_145558.pdf',
 <http.client.HTTPMessage at 0x254f6214310>)

In [43]:
# Read pdf into panda using tabula
# Using concat as tabula creates a new dataframe for each page of the pdf
# Takes pandas options to remove header as tabula assigns first row on each new page as the header 
df2019_l76_load = pd.concat(tabula.read_pdf("http://www2.cao.ie/points/lvl76_19.pdf", 
                                      pages = 'all', pandas_options={'header': None}))

In [44]:
# Have a look at the imported data
df2019_l76_load

Unnamed: 0,0,1,2,3
0,,ADMISSION DATA 2019,,
1,,End of Season,,
2,,"Level 6, 7",,
3,,The details given are for general information...,,
4,*,Not all on this points score were offered places,,
5,#,Test / Interview / Portfolio / Audition,,
6,AQA,All qualified applicants,,
7,,,,
8,Course Code,INSTITUTION and COURSE,EOS,Mid
9,,Athlone Institute of Technology,,


In [45]:
# Rename the columns to match the 2019 level 8 data
df2019_l76_load.columns = ['Course Code', 'INSTITUTION and COURSE', 'EOS', 'Mid']

In [46]:
# With the 2019 level 8 data, tabula ignored the introductory information 
# But it is included in the 2019 level 7 and 6 data so we have to remove it
# Reusing regular expressions to extract rows with the course codes we want
df2019_l76 = df2019_l76_load[df2019_l76_load['Course Code'].str.match(re_course, na=False)]

In [47]:
# Reset index as tabula had a new index for each page of the pdf
df2019_l76 = df2019_l76.reset_index()

In [48]:
df2019_l76

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
0,10,AL600,Software Design,205,306.0
1,11,AL601,Computer Engineering,196,272.0
2,12,AL602,Mechanical Engineering,258,424.0
3,13,AL604,Civil Engineering,252,360.0
4,14,AL630,Pharmacy Technician,306,366.0
5,15,AL631,Dental Nursing,326,379.0
6,16,AL632,Applied Science,243,372.0
7,17,AL650,Business,210,317.0
8,18,AL651,Music and Instrument Technology,AQA,296.0
9,19,AL660,Culinary Arts,AQA,216.0


In [49]:
# Create a file path for the pandas data.
path2019_l76 = 'cao-data/cao2019_l76_' + current_time + '.csv'

In [50]:
# Save pandas data frame to disk.
df2019_l76.to_csv(path2019_l76)

## 2018 Level 8 Points

In [51]:
# Request CAO website for 2018 level 8 url
url2018_l8 = 'http://www2.cao.ie/points/lvl8_18.pdf'

In [52]:
# Create a file path for the original data.
path2018_l8_pdf = 'cao-data/cao2018_l8_' + current_time + '.pdf'

In [53]:
# Save pdf to disk
urlrq.urlretrieve(url2018_l8, path2018_l8_pdf)

('cao-data/cao2018_l8_20211113_145558.pdf',
 <http.client.HTTPMessage at 0x254f621bb20>)

In [54]:
# Read pdf into panda using tabula
# Using concat as tabula creates a new dataframe for each page of the pdf
# Takes pandas options to remove header as tabula assigns first row on each new page as the header 
df2018_l8_load = pd.concat(tabula.read_pdf("http://www2.cao.ie/points/lvl8_18.pdf", 
                                      pages = 'all', pandas_options={'header': None}))

In [56]:
df2018_l8_load

Unnamed: 0,0,1,2,3
0,,ADMISSION DATA 2018,,
1,,End of Season,,
2,,Level 8,,
3,,The details given are for general information...,,
4,*,Not all on this points score were offered places,,
5,#,Test / Interview / Portfolio / Audition,,
6,AQA,All qualified applicants,,
7,,,,
8,Course Code,INSTITUTION and COURSE,EOS,Mid
9,,Athlone Institute of Technology,,


In [59]:
# Rename the columns to match the 2019 level 8 data
df2018_l8_load.columns = ['Course Code', 'INSTITUTION and COURSE', 'EOS', 'Mid']

In [60]:
# Need to filter out the introductory blurb that isn't needed in the dataframe
# Reusing regular expressions to extract rows with the course codes we want
df2018_l8 = df2018_l8_load[df2018_l8_load['Course Code'].str.match(re_course, na=False)]

In [63]:
# Reset index as tabula had a new index for each page of the pdf
df2018_l8 = df2018_l8.reset_index()

In [64]:
df2018_l8

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
0,10,AL801,Software Design (Game Development or Cloud Com...,295,326.0
1,11,AL810,Quantity Surveying,300,340.0
2,12,AL820,Mechanical and Polymer Engineering,299,371.0
3,13,AL830,General Nursing,418,440.0
4,14,AL832,Psychiatric Nursing,377,388.0
5,15,AL836,Health Science with Nutrition,354,389.0
6,16,AL837,Sports Science with Exercise Physiology,346,369.0
7,17,AL838,Biotechnology,301,338.0
8,18,AL839,Microbiology,318,380.0
9,19,AL840,Pharmaceutical Science,312,388.0


In [65]:
# Create a file path for the pandas data.
path2018_l8 = 'cao-data/cao2018_l8_' + current_time + '.csv'

In [66]:
# Save pandas data frame to disk.
df2018_l8.to_csv(path2018_l8)

## 2018 Level 7/6 Points

In [67]:
# Request CAO website for 2018 level 7 and 6 url
url2018_l76 = 'http://www2.cao.ie/points/lvl76_18.pdf'

In [68]:
# Create a file path for the original data.
path2018_l76_pdf = 'cao-data/cao2018_l76_' + current_time + '.pdf'

In [69]:
# Save pdf to disk
urlrq.urlretrieve(url2019_l76, path2019_l76_pdf)

('cao-data/cao2019_l76_20211113_145558.pdf',
 <http.client.HTTPMessage at 0x254f6004670>)

In [71]:
# Read pdf into panda using tabula
# Using concat as tabula creates a new dataframe for each page of the pdf
# Takes pandas options to remove header as tabula assigns first row on each new page as the header 
df2018_l76_load = pd.concat(tabula.read_pdf("http://www2.cao.ie/points/lvl76_18.pdf", 
                                      pages = 'all', pandas_options={'header': None}))

In [72]:
# Have a look at the imported data
df2018_l76_load

Unnamed: 0,0,1,2,3
0,,ADMISSION DATA 2018,,
1,,End of Season,,
2,,"Level 6, 7",,
3,,The details given are for general information...,,
4,*,Not all on this points score were offered places,,
5,#,Test / Interview / Portfolio / Audition,,
6,AQA,All qualified applicants,,
7,,,,
8,Course Code,INSTITUTION and COURSE,,
9,,Athlone Institute of Technology,EOS,Mid


In [75]:
# Rename the columns to match the 2019 level 8 data
df2018_l76_load.columns = ['Course Code', 'INSTITUTION and COURSE', 'EOS', 'Mid']

In [76]:
# Reusing regular expressions to extract rows with the course codes we want
df2018_l76 = df2018_l76_load[df2018_l76_load['Course Code'].str.match(re_course, na=False)]

In [77]:
# Reset index as tabula had a new index for each page of the pdf
df2018_l76 = df2018_l76.reset_index()

In [78]:
df2018_l76

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
0,10,AL601,Electronics and Computer Engineering,240,321.0
1,11,AL602,Mechanical Engineering,201,299.0
2,12,AL604,Civil Engineering,243,320.0
3,13,AL630,Pharmacy Technician,306,388.0
4,14,AL631,Dental Nursing,307,348.0
5,15,AL632,Science (Bioscience/Chemistry),216,444.0
6,16,AL650,Business,200,357.0
7,17,AL651,Music and Instrument Technology,AQA,308.0
8,18,AL652,Business in Equine,AQA,281.0
9,19,AL660,Culinary Arts,AQA,208.0


In [79]:
# Create a file path for the pandas data.
path2018_l76 = 'cao-data/cao2018_l76_' + current_time + '.csv'

In [80]:
# Save pandas data frame to disk.
df2018_l76.to_csv(path2018_l76)

## References