# CAO Points

In [1]:
# For writing CSV files
import csv

# For accessing dates and times
import datetime as dt

# For regular expressions
import re

# For dataframes
import pandas as pd

# For HTTP requests
import requests as rq

### Datetime

In [2]:
"""Build a timestamp string from the current date and time.
The string is embedded in the filenames of the files saved by this notebook."""

# Capture the moment this cell runs.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS using the datetime format specification.
current_time = f'{now:%Y%m%d_%H%M%S}'

## 2021 Level 8 Points

In [3]:
# Fetch the CAO 2021 level 8 points page.
# Without a timeout, requests waits indefinitely for a server that never
# answers, so give up after 30 seconds instead of hanging the notebook.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Fail immediately on a 4xx/5xx status rather than saving and parsing an
# error page further down the notebook.
resp.raise_for_status()

In [4]:
# Display the response object so the cell output shows the HTTP status code.
resp

<Response [200]>

In [5]:
"""Have to change the encoding as the following error is returned:
'charmap' codec can't encode character '\x96' in position 25767: character maps to <undefined>"""

# The server reports the wrong encoding, so keep a copy of the value
# currently set on the response before it is overridden below.
original_encoding = resp.encoding

# Override the response encoding with cp1252, which recognises the '\x96'
# character named in the error message above.
resp.encoding = 'cp1252'

In [6]:
# Build the timestamped path where the untouched level 8 HTML page is stored.
path = f'cao-data/cao2021_level8_{current_time}.html'

In [7]:
# Save the original html file.
# Write the text with the same encoding the response was decoded with.
# Otherwise open() falls back to a platform-dependent default, which may not
# be able to encode every character in the page (the 'charmap' error).
with open(path, 'w', encoding=resp.encoding) as f:
    f.write(resp.text)

## NB: This regular expression excludes courses where the points are listed
## as AQA (all qualified applicants), i.e. courses where everyone who applied
## was accepted. Might need to come back to this to include these courses.

In [8]:
# Regular expression adapted from:
# https://github.com/ianmcloughlin/cao-points/blob/main/cao-points-analysis.ipynb

# Pattern for a single course line. Its groups are, in order:
#   1. the course code: two capital letters followed by three digits,
#   2. everything between the code and the points (after two spaces),
#   3. a three-digit points value,
#   4. an optional '*' directly after the points.
# Trailing spaces are permitted after the last group.
re_course = re.compile(
    r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *'
)

In [9]:
# Code adapted from: https://github.com/ianmcloughlin/cao-points/blob/main/cao-points-analysis.ipynb


# The file path for the csv file.
path = 'cao-data/cao2021_level8_csv_' + current_time + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
# - encoding='utf-8' makes the output identical on every platform instead of
#   depending on the locale default.
# - newline='' hands line-ending control to the csv writer, as the csv module
#   documentation requires.
with open(path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that itself contains a comma, so a course
    # title with a comma no longer spills into extra columns.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through lines of the response.
    for line in resp.iter_lines():
        # iter_lines() yields raw bytes; decode them as cp1252, the codec this
        # notebook uses for the CAO pages in place of the one the server reports.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # Split the line on two or more spaces and write the pieces as one row.
            writer.writerow(re.split('  +', dline))

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.


## 2021 Level 7/6 Points

In [10]:
# Fetch the CAO 2021 level 7/6 points page.
# Without a timeout, requests waits indefinitely for a server that never
# answers, so give up after 30 seconds instead of hanging the notebook.
resp = rq.get('http://www2.cao.ie/points/l76.php', timeout=30)

# Fail immediately on a 4xx/5xx status rather than saving and parsing an
# error page further down the notebook.
resp.raise_for_status()

# Apply the same cp1252 override that the level 8 section of this notebook
# uses, so resp.text is decoded consistently for both pages.
resp.encoding = 'cp1252'

In [11]:
# Display the response object so the cell output shows the HTTP status code.
resp

<Response [200]>

In [12]:
# Build the timestamped path where the untouched level 7/6 HTML page is stored.
path = f'cao-data/cao2021_level7_6_{current_time}.html'

In [13]:
# Save the original html file.
# Write the text with the same encoding the response was decoded with.
# Otherwise open() falls back to a platform-dependent default, which may not
# be able to encode every character in the page.
with open(path, 'w', encoding=resp.encoding) as f:
    f.write(resp.text)

In [14]:
# Pattern for a single course line. Its groups are, in order:
#   1. the course code: two capital letters followed by three digits,
#   2. everything between the code and the points (after two spaces),
#   3. a three-digit points value,
#   4. an optional '*' directly after the points.
# Trailing spaces are permitted after the last group.
re_course = re.compile(
    r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *'
)

In [15]:
# The file path for the csv file.
path = 'cao-data/cao2021_level7_6_csv_' + current_time + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
# - encoding='utf-8' makes the output identical on every platform instead of
#   depending on the locale default.
# - newline='' hands line-ending control to the csv writer, as the csv module
#   documentation requires.
with open(path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that itself contains a comma, so a course
    # title with a comma no longer spills into extra columns.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through lines of the response.
    for line in resp.iter_lines():
        # iter_lines() yields raw bytes; decode them as cp1252, the codec this
        # notebook uses for the CAO pages in place of the one the server reports.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # Split the line on two or more spaces and write the pieces as one row.
            writer.writerow(re.split('  +', dline))

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 390.


## 2020 Level 8 Points

## 2020 Level 7/6 Points

## 2019 Level 8 Points

## 2019 Level 7/6 Points

## References