# CAO points analysis in 2019 - 2021  

http://www2.cao.ie/points/l8.php

The Central Applications Office processes applications for undergraduate courses in Irish Higher Education Institutions (HEIs).

In [1]:
# Convenient HTTP requests.
import requests as rq

# Regular expressions.
import re

# Dates and times.
import datetime as dt

In [2]:
# Capture the moment this notebook run started.
now = dt.datetime.now()

# Compact, sortable timestamp (YYYYMMDD_HHMMSS) used to tag output file names.
nowstr = f'{now:%Y%m%d_%H%M%S}'

<br>

## 2021 Points

Taken from [cao.ie](http://www2.cao.ie/points/l8.php)
***

In [3]:
# Fetch the CAO points URL.
# A timeout is passed so the cell fails fast instead of hanging forever
# when the server does not answer.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Check that we got the expected response (200 means OK).
resp

<Response [200]>

<br>

## Save original data set
***

In [4]:
# Timestamped location where the downloaded html page will be stored.
pathhtml = f'data/cao2021_{nowstr}.html'

### Fixing encoding error

If we inspect the page in the browser, the server says we should decode as per:  

```
Content-Type: text/html; charset=iso-8859-1
```
     
However, one line uses the \x96 character, which isn't defined in iso-8859-1.
Therefore, to fix this issue, we are going to use the very similar standard cp1252,
which includes the \x96 character.

In [5]:
# Remember which encoding requests picked for this response
# before it is overridden below.
original_encoding = resp.encoding

# Override it with cp1252 so that resp.text is decoded with that codec.
resp.encoding = 'cp1252'

In [6]:
# Save the original html file.
# The encoding is given explicitly so the written file does not depend on the
# platform's default text encoding (which may be unable to represent some of
# the characters in the page).
with open(pathhtml, 'w', encoding='utf-8') as f:
    f.write(resp.text)

<br>

## Loop through the lines of the response
***

In [7]:
# Timestamped destination for the csv version of the 2021 points.
path2021 = f'data/cao2021_csv_{nowstr}.csv'

In [8]:
# Regular expression selecting only the lines that describe a course:
# a course code (two capital letters + three digits), exactly two spaces,
# the free-text remainder, a three-digit number, an optional '*' and any
# trailing spaces.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')

# Counter for the number of course lines written to the csv file.
no_lines = 0

# Open the csv file for writing. The encoding is explicit so the output does
# not depend on the platform's default text encoding.
with open(path2021, 'w', encoding='utf-8') as f:
    # Loop through the lines of the response content (raw bytes).
    for line in resp.iter_lines():
        # Decode the bytes as cp1252 rather than the charset the server
        # declared (see the note on the encoding fix).
        dline = line.decode('cp1252')
        # Keep only the lines that match the course pattern in full.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # Split the line on two or more spaces.
            linesplit = re.split('  +', dline)
            # Rejoin the substrings with commas in between.
            # NOTE(review): fields are not quoted, so a field that itself
            # contains a comma would produce an extra column - consider
            # csv.writer if that can occur.
            f.write(','.join(linesplit) + '\n')

print(f'Total number of lines is {no_lines}.')

Total number of lines is 922.
