# CAO points analysis in 2019 - 2021  


The Central Applications Office processes applications for undergraduate courses in Irish Higher Education Institutions (HEIs).

In [1]:
# Writing correctly quoted CSV rows.
import csv

# Dates and times.
import datetime as dt

# Regular expressions.
import re

# For downloading.
import urllib.request as urlrq

# For loading PDFs into pandas
import camelot

# Data frames.
import pandas as pd

# Convenient HTTP requests.
import requests as rq

In [2]:
# Capture the moment this notebook run started.
now = dt.datetime.now()

# Render it as a compact YYYYMMDD_HHMMSS stamp for use in file names.
nowstr = f'{now:%Y%m%d_%H%M%S}'

<br>

## 2021 Points

Taken from [cao.ie](http://www2.cao.ie/points/l8.php)
***

In [3]:
# Fetch the CAO points URL.
# A timeout is given so a stalled connection cannot hang the notebook.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Stop here on an HTTP error status, so that later cells never parse an
# error page as if it were the points data.
resp.raise_for_status()

# Show the response to confirm we got what we expected (200 means OK).
resp

<Response [200]>

<br>

## Save original data set
***

In [4]:
# Time-stamped location where the untouched 2021 page will be stored.
pathhtml = f'data/cao2021_{nowstr}.html'

### Fixing encoding error

If we inspect the page in the browser, the server says we should decode it as per:  

```
Content-Type: text/html; charset=iso-8859-1
```
     
However, one line uses the #x96 character, which isn't defined in iso-8859-1.
Therefore, to fix this issue, we are going to use the very similar standard cp1252,
which includes the #x96 character.

In [5]:
# The server declares an encoding that does not fit the page content.
# Keep a note of what it declared before overriding it.
original_encoding = resp.encoding

# Change to cp1252, so that resp.text is decoded with that codec instead.
resp.encoding = 'cp1252'

In [6]:
# Save the original html file.
# The encoding is stated explicitly: without it, open() falls back to the
# platform's locale encoding, so the saved file would differ between
# machines and writing could fail on characters the locale cannot encode.
with open(pathhtml, 'w', encoding='utf-8') as f:
    f.write(resp.text)

<br>

## Loop through the lines of the response
***

In [7]:
# Time-stamped destination of the csv extracted from the 2021 page.
path2021 = f'data/cao2021_csv_{nowstr}.csv'

In [8]:
# Regular expression selecting the lines we need: a course code (two capital
# letters and three digits), two spaces, free text, three digits, an optional
# '*' and any trailing spaces.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')
# Number of course lines written to the csv file.
no_lines = 0

# Open the csv file for writing. The encoding is explicit so the output does
# not depend on the platform locale; newline='' hands line-ending control to
# the csv writer.
with open(path2021, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that itself contains a comma; joining the
    # fields with ',' by hand would silently add extra columns to such rows.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through the lines of the response content.
    for line in resp.iter_lines():
        # iter_lines() yields bytes; decode with cp1252, the corrected
        # encoding for this page (not the one the server declares).
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # Split the line on two or more spaces.
            linesplit = re.split('  +', dline)
            # Write the substrings as one properly quoted csv row.
            writer.writerow(linesplit)
print(f'Total number of lines is {no_lines}.')

Total number of lines is 922.


<br>

## 2020 Points

https://www.cao.ie/index.php?page=points&p=2020
***

<br>

### Save Original File
***

In [9]:
# Address of the CAO points spreadsheet for 2020.
url2020 = (
    'http://www2.cao.ie/points/'
    'CAOPointsCharts2020.xlsx'
)

In [10]:
# Time-stamped location for the untouched 2020 spreadsheet.
pathxlsx = f'data/cao2020_{nowstr}.xlsx'

# Download the spreadsheet straight into that file.
urlrq.urlretrieve(url2020, pathxlsx)

('data/cao2020_20211220_192927.xlsx',
 <http.client.HTTPMessage at 0x7ff0e89ee670>)

<br>

### Load Spreadsheet using pandas
***

In [11]:
# Parse the excel spreadsheet from the copy saved to disk above rather than
# downloading it a second time: this guarantees the data frame reflects
# exactly the archived original and avoids a redundant network request.
# The first 10 rows of the sheet are skipped before the header row is read.
df2020 = pd.read_excel(pathxlsx, skiprows=10)

In [12]:
# Display the parsed spreadsheet to eyeball its columns and rows.
df2020

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [13]:
# Spot check an arbitrarily chosen row (fixed position 753, not a random draw).
df2020.iloc[753]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Road Transport Technology and Management
COURSE CODE2                                                           LC286
R1 POINTS                                                                264
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      264
EOS Random *                                                             NaN
EOS Mid-point                                                            360
LEVEL                                                                      7
HEI                                         Limerick Institute of Technology
Test/Interview #                                                         NaN

In [14]:
# Spot check the last row (a negative position counts back from the end).
df2020.iloc[-1]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [15]:
# Time-stamped destination for the csv copy of the 2020 data frame.
path2020 = f'data/cao2020_{nowstr}.csv'

# Persist the data frame to that location.
df2020.to_csv(path2020)

<br>

## 2019 Points

https://www.cao.ie/index.php?page=points&p=2019
***

In [16]:
# Parse the 2019 points PDF with camelot, without giving a page range.
tables = camelot.read_pdf('data/cao2019.pdf')

In [17]:
# Show the list of tables camelot found.
tables

<TableList n=1>

In [18]:
# Export the parsed tables in csv format with camelot's compress option on.
# NOTE(review): the exact output file names are chosen by camelot — confirm.
tables.export('data/cao2019.csv', f='csv', compress=True)

In [19]:
# Look at the first extracted table.
tables[0]

<Table shape=(44, 4)>

In [20]:
# Inspect camelot's parsing report for the first table.
tables[0].parsing_report

{'accuracy': 99.61, 'whitespace': 3.98, 'order': 1, 'page': 1}

In [21]:
# Write only the first table to a time-stamped csv file.
tables[0].to_csv(f'data/cao2019_{nowstr}.csv')

In [23]:
# Export every table in the list as csv under a time-stamped name.
tables.export(f'data/cao2019_{nowstr}.csv', f='csv')

In [25]:
# export() does not accept a `pages` argument, so this call raises TypeError.
# The attempt is kept as a record of that finding, but the error is caught
# and reported so that "Restart Kernel -> Run All" is not halted here.
# The page range has to be given to camelot.read_pdf instead.
try:
    tables.export('data/cao2019_test.csv', f='csv', pages='1-end')
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: export() got an unexpected keyword argument 'pages'

In [26]:
# Re-read the PDF, this time asking camelot for the page range '1-end'.
cao_tables = camelot.read_pdf('data/cao2019.pdf', pages='1-end')

In [27]:
# Show how many tables were found this time.
cao_tables

<TableList n=18>

In [28]:
# Export all tables from the full read as csv under a time-stamped name.
cao_tables.export(f'data/cao2019_{nowstr}.csv', f='csv')

In [51]:
# Collect the data frame of every table camelot extracted.
table_frames = [table.df for table in cao_tables]

# Stack them with a single concat. DataFrame.append (used previously) was
# removed in pandas 2.0, and growing a frame inside a loop copies all the
# accumulated rows on every iteration. Each table keeps its own row index,
# exactly as append did. An empty table list yields an empty data frame,
# because concat refuses an empty sequence.
comb_df = pd.concat(table_frames) if table_frames else pd.DataFrame()
comb_df

Unnamed: 0,0,1,2,3
0,Course Code INSTITUTION and COURSE,,EOS,Mid
1,,Athlone Institute of Technology,,
2,AL801,Software Design with Virtual Reality and Gaming,304,328
3,AL802,Software Design with Cloud Computing,301,306
4,AL803,Software Design with Mobile Apps and Connected...,309,337
...,...,...,...,...
37,WD200,Arts (options),221,296
38,WD210,Software Systems Development,271,329
39,WD211,Creative Computing,275,322
40,WD212,Recreation and Sport Management,274,311


In [52]:
# Save the combined data frame to data/TEST.csv.
comb_df.to_csv('data/TEST.csv')