# CAO Points

In [1]:
# For dataframes
import pandas as pd

# For HTTP requests
import requests as rq

# For regular expressions
import re

# For accessing dates and times
import datetime as dt

# Accessing and downloading using urls
import urllib.request as urlrq

# For reading pdfs 
import tabula

### Datetime

In [2]:
"""Using datetime to create a variable that will refer to a string stating the current time.
This will be used throughout this notebook to save files with the current time in the filename."""

# Access the current date and time.
now = dt.datetime.now()

# Formatting the current date and time as a string and saving to a variable.
current_time = now.strftime('%Y%m%d_%H%M%S')

In [3]:
# Make dataframe scrollable to eyeball any obvious errors in dataframes
# Commenting out for now as it's slowing down the browser too much
# pd.set_option("display.max_rows", None)

### Functions

In [4]:
# A function that adds courses with specific symbols in the points columns to lists
# Then returns the points without the symbols
# * - Not all on this points score were offered places
# # - Test / Interview / Portfolio / Audition
# AQA - All qualified applicants
# Commented out for now until it's tested fully
"""
def points_to_array(s):
    portfolio = ''
    if s[0] == '#':
        portfolio = '#'
    random = ''
    if s[-1] == '*':
        random = '*'
    aqa = ''
    if s == 'AQA':
        aqa = 'AQA'
    points = ''
    for i in s:
        if i.isdigit():
            points = points + i
    return[points, portfolio, random, aqa]
"""

"\ndef points_to_array(s):\n    portfolio = ''\n    if s[0] == '#':\n        portfolio = '#'\n    random = ''\n    if s[-1] == '*':\n        random = '*'\n    aqa = ''\n    if s == 'AQA':\n        aqa = 'AQA'\n    points = ''\n    for i in s:\n        if i.isdigit():\n            points = points + i\n    return[points, portfolio, random, aqa]\n"

In [5]:
# A function that finds courses that required Test / Interview / Portfolio / Audition
# As indicated by the '#' character
def portfolio(df):
    # Find '#' in pointsR1 column
    df_portfolio1 = df[df['pointsR1'].str.match("#", na=False)]
    # find '#' in pointsR2 column
    df_portfolio2 = df[df['pointsR2'].str.match("#", na=False)]
    # Concat the dataframes together
    df_portfolio = pd.concat([df_portfolio1, df_portfolio2], ignore_index=True)
    # Remove duplicates
    df_portfolio = df_portfolio.drop_duplicates()
    return df_portfolio

In [6]:
# A function that finds courses where all on this points score were offered places
# As indicated by the '*' character
def random(df):
    # Find '*' in pointsR1 column
    df_random1 = df[df['pointsR1'].str[-1] == '*']
    # find '#' in pointsR2 column
    df_random2 = df[df['pointsR2'].str[-1] == '*']
    # Concat the dataframes together
    df_random = pd.concat([df_random1, df_random2], ignore_index=True)
    # Remove duplicates
    df_random = df_random.drop_duplicates()
    return df_random

In [7]:
# A function that finds courses where all qualified applicants earned places
# As indicated by AQA
def aqa(df):
    # Find 'AQA' in pointsR1 column
    df_aqa1 = df[df['pointsR1'].str.match("AQA", na=False)]
    # find 'AQA' in pointsR2 column
    df_aqa2 = df[df['pointsR2'].str.match("AQA", na=False)]
    # Concat the dataframes together
    df_aqa = pd.concat([df_aqa1, df_aqa2], ignore_index=True)
    # Remove duplicates
    df_aqa = df_aqa.drop_duplicates()
    return df_aqa

In [8]:
# A function that finds courses where there's new competition for available places
# As indicated by the 'v' character
def comp(df):
    # Find 'v' in pointsR1 column
    df_comp1 = df[df['pointsR1'].str[-1] == 'v']
    # find '#' in pointsR2 column
    df_comp2 = df[df['pointsR2'].str[-1] == 'v']
    # Concat the dataframes together
    df_comp = pd.concat([df_comp1, df_comp2], ignore_index=True)
    # Remove duplicates
    df_comp = df_comp.drop_duplicates()
    return df_comp

In [9]:
# Add new columns with point totals sans additional characters
# https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column/22238380
def points(df):
    df2 = df.assign(Round_1_Points=df['pointsR1'].str.replace(r'\D', '', regex=True))
    df3 = df2.assign(Round_2_Points=df['pointsR2'].str.replace(r'\D', '', regex=True))
    return df3

## 2021 Level 8 Points

In [10]:
resp2021_l8 = rq.get('http://www2.cao.ie/points/l8.php')

In [11]:
resp2021_l8

<Response [200]>

In [12]:
"""Have to change the encoding as the following error is returned:
'charmap' codec can't encode character '\x96' in position 25767: character maps to <undefined>"""

# The server uses the wrong encoding.
original_encoding = resp2021_l8.encoding

# Change to cp1252, which recognises the '\x96' character.
resp2021_l8.encoding = 'cp1252'

In [13]:
# Create a file path for the original data.
path2021_l8_html = 'cao-data/cao2021_level8_' + current_time + '.html'

In [14]:
# Save the original html file.
with open(path2021_l8_html, 'w') as f:
    f.write(resp2021_l8.text)

In [15]:
# re adapted from:https://github.com/ianmcloughlin/cao-points/blob/main/cao-points-analysis.ipynb

# Using regular expression to extract the lines of data we want.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')

In [16]:
# Code adapted from: https://github.com/ianmcloughlin/cao-points/blob/main/cao-points-analysis.ipynb


# The file path for the csv file.
path2021_l8 = 'cao-data/cao2021_level8_csv_' + current_time + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
with open(path2021_l8, 'w') as f:
    # Write a header row.
    f.write(','.join(['code', 'title', 'pointsR1', 'pointsR2']) + '\n')
    # Loop through lines of the response.
    for line in resp2021_l8.iter_lines():
        # Decode the line, using the wrong encoding!
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # The course code.
            course_code = dline[:5]
            # The course title.
            course_title = dline[7:57].strip()
            # Round one points.
            course_points = re.split(' +', dline[60:])
            if len(course_points) != 2:
                course_points = course_points[:2]
            # Join the fields using a comma.
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
            # Rejoin the substrings with commas in between.
            f.write(','.join(linesplit) + '\n')

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 949.


In [17]:
# Load the 2021 level 8 data to a pandas dataframe
df2021_l8 = pd.read_csv(path2021_l8, encoding='cp1252')

In [18]:
# Extracting all Portfolio/Test/Interview/Audition courses 
df2021_l8_portfolio = portfolio(df2021_l8)

In [19]:
df2021_l8_portfolio

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL861,Animation and Illustration (portfolio),#575,
1,AL863,Graphic and Digital Design (portfolio),#747,
2,CW038,Art (portfolio),#700,
3,CW858,Sports Management and Coaching (options portf...,#700,
4,CR121,Music at CIT Cork School of Music,#904,#904
...,...,...,...,...
78,SG244,Fine Art,#501,
79,TL801,Animation Visual Effects and Motion Design (L...,#718,
80,WD027,Music,#321,
96,MH802,Community and Youth Work (part-time in service),,#


In [20]:
# Extracting courses where not everyone with those points were offered places
df2021_l8_random = random(df2021_l8)

In [21]:
df2021_l8_random

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL830,General Nursing,451*,444
1,AL832,Mental Health Nursing,440*,431
2,AL870,Applied Psychology,484*,467*
3,CR320,Biomedical Science - Offered jointly by CIT an...,590*,
4,CR930,Home Economics and Business,518*,
...,...,...,...,...
118,DN600,Law (options),576,567*
119,DN660,Commerce International (options),555,554*
120,DN710,Economics,534,532*
123,LM090,Physical Education with concurrent Teacher Edu...,552,544*


In [22]:
# Extracting courses where all qualified applicants earned places
df2021_l8_aqa = aqa(df2021_l8)

In [23]:
df2021_l8_aqa

Unnamed: 0,code,title,pointsR1,pointsR2
0,MH801,Early Childhood - Teaching and Learning (part-...,AQA,AQA


In [24]:
# Add extra columns for points only using previously defined function
df2021_l8 = points(df2021_l8)

In [25]:
df2021_l8

Unnamed: 0,code,title,pointsR1,pointsR2,Round_1_Points,Round_2_Points
0,AL801,Software Design for Virtual Reality and Gaming,300,,300,
1,AL802,Software Design in Artificial Intelligence for...,313,,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,,350,
3,AL805,Computer Engineering for Network Infrastructure,321,,321,
4,AL810,Quantity Surveying,328,,328,
...,...,...,...,...,...,...
944,WD211,Creative Computing,270,,270,
945,WD212,Recreation and Sport Management,262,,262,
946,WD230,Mechanical and Manufacturing Engineering,230,230,230,230
947,WD231,Early Childhood Care and Education,266,,266,


In [26]:
# Rename columns to match later dataframes
df2021_l8.rename(columns={'code': 'Course Code', 'title': 'INSTITUTION and COURSE'}, inplace=True)

In [27]:
df2021_l8

Unnamed: 0,Course Code,INSTITUTION and COURSE,pointsR1,pointsR2,Round_1_Points,Round_2_Points
0,AL801,Software Design for Virtual Reality and Gaming,300,,300,
1,AL802,Software Design in Artificial Intelligence for...,313,,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,,350,
3,AL805,Computer Engineering for Network Infrastructure,321,,321,
4,AL810,Quantity Surveying,328,,328,
...,...,...,...,...,...,...
944,WD211,Creative Computing,270,,270,
945,WD212,Recreation and Sport Management,262,,262,
946,WD230,Mechanical and Manufacturing Engineering,230,230,230,230
947,WD231,Early Childhood Care and Education,266,,266,


In [28]:
# Save pandas data frame to disk.
df2021_l8.to_csv(path2021_l8)

## 2021 Level 7/6 Points

In [29]:
resp2021_l76 = rq.get('http://www2.cao.ie/points/l76.php')

In [30]:
resp2021_l76

<Response [200]>

In [31]:
# Create a file path for the original data.
path2021_l76_html = 'cao-data/cao2021_level7_6_' + current_time + '.html'

In [32]:
# Save the original html file.
with open(path2021_l76_html, 'w') as f:
    f.write(resp2021_l76.text)

In [33]:
# The file path for the csv file.
path2021_l76 = 'cao-data/cao2021_level7_6_csv_' + current_time + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
with open(path2021_l76, 'w') as f:
    # Write a header row.
    f.write(','.join(['code', 'title', 'pointsR1', 'pointsR2']) + '\n')
    # Loop through lines of the response.
    for line in resp2021_l76.iter_lines():
        # Decode the line, using the wrong encoding!
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # The course code.
            course_code = dline[:5]
            # The course title.
            course_title = dline[7:57].strip()
            # Round one points.
            course_points = re.split(' +', dline[60:])
            if len(course_points) != 2:
                course_points = course_points[:2]
            # Join the fields using a comma.
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
            # Rejoin the substrings with commas in between.
            f.write(','.join(linesplit) + '\n')

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 416.


In [34]:
# Load the 2021 level 7 and level 6 data to a pandas dataframe
df2021_l76 = pd.read_csv(path2021_l76, encoding='cp1252')

In [35]:
df2021_l76

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL605,Music and Instrument Technology,211,
1,AL630,Pharmacy Technician,308,
2,AL631,Dental Nursing,311,
3,AL632,Applied Science,297,
4,AL650,Business,AQA,AQA
...,...,...,...,...
411,WD188,Applied Health Care,220,
412,WD205,Molecular Biology with Biopharmaceutical Science,AQA,262v
413,WD206,Electronic Engineering,180,
414,WD207,Mechanical Engineering,172,


In [36]:
# Extracting all Portfolio/Test/Interview/Audition courses 
df2021_l76_portfolio = portfolio(df2021_l76)

In [37]:
df2021_l76_portfolio

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL763,Graphic Design (portfolio),#642,
1,CW057,Art (portfolio),#700,
2,CW807,Sport Coaching and Business Management - GAA (...,#700,
3,CW817,Sport Coaching and Business Management - Rugby...,#700,
4,CW827,Sport Coaching and Business Management - Socce...,#700,
5,TU655,Computing (Networking Technologies) Learn and ...,#502,
6,TU722,Process Instrumentation and Automation Learn a...,#346,#346
7,TU795,Visual Merchandising and Display,#618,
8,TR802,Dental Hygiene,#577,#565
9,TR803,Dental Technology,#543,#498*


In [38]:
# Extracting courses where not everyone with those points were offered places
df2021_l76_random = random(df2021_l76)

In [39]:
df2021_l76_random

Unnamed: 0,code,title,pointsR1,pointsR2
0,LY847,Veterinary Nursing,389*,
1,TR803,Dental Technology,#543,#498*


In [40]:
# Extracting courses where all qualified applicants earned places
df2021_l76_aqa = aqa(df2021_l76)

In [41]:
df2021_l76_aqa

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL650,Business,AQA,AQA
1,AL660,Culinary Arts,AQA,AQA
2,AL661,Bar Supervision,AQA,
3,AL663,Business (Sport and Recreation),AQA,AQA
4,AL761,Hotel and Leisure Management,AQA,AQA
5,TU772,Business (options),AQA,250v
6,TU792,Community and Youth Development,AQA,242v
7,WD003,Business,AQA,AQA
8,WD013,Legal Studies,AQA,206v
9,WD019,Recreation and Sport Management,AQA,AQA


In [42]:
# Extracting courses where there's new competition for available places
df2021_l76_comp = comp(df2021_l76)

In [43]:
df2021_l76_comp

Unnamed: 0,code,title,pointsR1,pointsR2
0,TU708,Engineering (Common Entry with Award options),117,263v
1,TU772,Business (options),AQA,250v
2,TU792,Community and Youth Development,AQA,242v
3,WD013,Legal Studies,AQA,206v
4,WD177,Science (Mol. Biology with Biopharm. Food Science,205,455v
5,WD205,Molecular Biology with Biopharmaceutical Science,AQA,262v


In [44]:
df2021_l76 = points(df2021_l76)

In [45]:
df2021_l76

Unnamed: 0,code,title,pointsR1,pointsR2,Round_1_Points,Round_2_Points
0,AL605,Music and Instrument Technology,211,,211,
1,AL630,Pharmacy Technician,308,,308,
2,AL631,Dental Nursing,311,,311,
3,AL632,Applied Science,297,,297,
4,AL650,Business,AQA,AQA,,
...,...,...,...,...,...,...
411,WD188,Applied Health Care,220,,220,
412,WD205,Molecular Biology with Biopharmaceutical Science,AQA,262v,,262
413,WD206,Electronic Engineering,180,,180,
414,WD207,Mechanical Engineering,172,,172,


In [46]:
# Rename columns to match later dataframes
df2021_l76.rename(columns={'code': 'Course Code', 'title': 'INSTITUTION and COURSE'}, inplace=True)

In [47]:
df2021_l76

Unnamed: 0,Course Code,INSTITUTION and COURSE,pointsR1,pointsR2,Round_1_Points,Round_2_Points
0,AL605,Music and Instrument Technology,211,,211,
1,AL630,Pharmacy Technician,308,,308,
2,AL631,Dental Nursing,311,,311,
3,AL632,Applied Science,297,,297,
4,AL650,Business,AQA,AQA,,
...,...,...,...,...,...,...
411,WD188,Applied Health Care,220,,220,
412,WD205,Molecular Biology with Biopharmaceutical Science,AQA,262v,,262
413,WD206,Electronic Engineering,180,,180,
414,WD207,Mechanical Engineering,172,,172,


In [48]:
# Save pandas data frame to disk.
df2021_l76.to_csv(path2021_l76)

## 2020 Level 8/7/6 Points

In [49]:
# The points for levels 8, 7, and 6 were included in one spreadsheet on the CAO website
url2020 = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

In [50]:
# Create a file path for the original data.
path2020_xlsx = 'cao-data/cao2020_' + current_time + '.xlsx'

In [51]:
urlrq.urlretrieve(url2020, path2020_xlsx)

('cao-data/cao2020_20211123_130330.xlsx',
 <http.client.HTTPMessage at 0x233cbe21850>)

In [52]:
# Load the 2020 data to a pandas dataframe
df2020 = pd.read_excel(url2020, skiprows=10)

In [53]:
df2020

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [54]:
# Rename columns to match later dataframes
df2020.rename(columns={'COURSE CODE2': 'Course Code', 'COURSE TITLE': 'INSTITUTION and COURSE',
                      'R1 POINTS': 'pointsR1', 'R2 POINTS': 'pointsR2'}, inplace=True)


In [55]:
df2020

Unnamed: 0,CATEGORY (i.e.ISCED description),INSTITUTION and COURSE,Course Code,pointsR1,R1 Random *,pointsR2,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [56]:
# Extract courses that required portfolio/test/interview/audition
df2020_portfolio = df2020[df2020['Test/Interview #'].str.match("#", na=False)]

In [57]:
df2020_portfolio

Unnamed: 0,CATEGORY (i.e.ISCED description),INSTITUTION and COURSE,Course Code,pointsR1,R1 Random *,pointsR2,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
5,Education,Education & Design or Fine Art (Second Level T...,AD202,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
6,Arts,Fine Art (portfolio),AD204,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,Arts,Design - Visual Communication,TU973,717,,,,717,,865,...,,,,,,,,,,
1383,Arts,Fine Art,TU974,709,,563,,563,,826,...,,,,,,,,,,
1384,Arts,Interior Design,TU975,632,,,,632,,879,...,,,,,,,,,,
1385,Arts,Photography,TU976,695,,,,695,,926,...,,,,,,,,,,


In [58]:
# Extract courses where not everyone with that point total was offered a place
# Due to the different formatting, the previous function for this task cannot be reused
# Filters out the rows where the Random columns contain NaN values, leaving only '*' rows
df_random1 = df2020[~df2020['R1 Random *'].isnull()]
df_random2 = df2020[~df2020['R2 Random*'].isnull()]
df_random = pd.concat([df_random1, df_random2], ignore_index=True)
df2020_random = df_random.drop_duplicates()

In [59]:
df2020_random

Unnamed: 0,CATEGORY (i.e.ISCED description),INSTITUTION and COURSE,Course Code,pointsR1,R1 Random *,pointsR2,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Health,Psychiatric Nursing,AL832,387,*,384,,384,,399,...,,,,,,,,,,
1,Education,"Education, Home Economics and Religious Ed - w...",AS002,441,*,434,*,434,*,466,...,,,,,,,,,,
2,Education,"Education, Home Economics and Irish - with con...",AS003,440,*,440,,440,,484,...,,,,,,,,,,
3,Health,General Nursing,AS110,419,*,,,419,*,431,...,,,,,,,,,,
4,Health,Intellectual Disability Nursing,AS130,336,*,328,,317,,339,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,Law,Law,TR004,566,,565,*,565,*,578,...,,,,,,,,,,
80,Social and behavioural sciences,Psychology,TR006,567,,555,*,555,*,578,...,,,,,,,,,,
81,Engineering and engineering trades,Engineering,TR032,510,,509,*,508,,533,...,,,,,,,,,,
86,Business and administration,Business Studies and German,TR086,509,,499,*,499,,518,...,,,,,,,,,,


In [60]:
# Reuse function to find courses where all qualified applicants received offers
df2020_aqa = aqa(df2020)

In [61]:
df2020_aqa

Unnamed: 0,CATEGORY (i.e.ISCED description),INSTITUTION and COURSE,Course Code,pointsR1,R1 Random *,pointsR2,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Information and Communication Technologies (ICTs),Software Design,AL600,AQA,,,,104,,279,...,,,,,,,,,,
1,Engineering and engineering trades,Computer Engineering,AL601,AQA,,219,,112,,192,...,avp,v,,,,,,,,
2,Engineering and engineering trades,Mechanical Engineering,AL602,AQA,,,,260,,424,...,,,,,,,,,,
3,Architecture and construction,Civil Engineering,AL604,AQA,,AQA,,AQA,,244,...,,,,,,,,,,
4,Engineering and engineering trades,Music and Instrument Technology,AL605,AQA,,AQA,,154,,306,...,avp,,,,,,,,,
5,Personal services,Culinary Arts,AL660,AQA,,AQA,,112,,270,...,avp,,,,,,,,,
6,Personal services,Bar Supervision,AL661,AQA,,AQA,,101,,241,...,avp,v,,,,,,,,
7,Personal services,Business (Sport and Recreation),AL663,AQA,,AQA,,120,,262,...,avp,,,,,,,,,
8,Business and administration,Business,CW006,AQA,,AQA,,AQA,,254,...,avp,,,,,,,,,
9,Social and behavioural sciences,Applied Social Care,DB528,AQA,,AQA,,AQA,,337,...,avp,,,,,,,,,


In [62]:
# Create a file path for the pandas data.
path2020 = 'cao-data/cao2020_' + current_time + '.csv'

In [63]:
# Save pandas data frame to disk.
df2020.to_csv(path2020)

## 2019 Level 8 Points

In [64]:
# Request CAO website for 2019 level 8 url
url2019_l8 = 'http://www2.cao.ie/points/lvl8_19.pdf'

In [65]:
# Create a file path for the original data.
path2019_l8_pdf = 'cao-data/cao2019_l8_' + current_time + '.pdf'

In [66]:
# Save pdf to disk
urlrq.urlretrieve(url2019_l8, path2019_l8_pdf)

('cao-data/cao2019_l8_20211123_130330.pdf',
 <http.client.HTTPMessage at 0x233cc608760>)

In [67]:
# Read pdf into panda using tabula
# Using concat as tabula creates a new dataframe for each page of the pdf
# Takes pandas options to remove header as tabula assigns first row on each new page as the header 
df2019_l8_load = pd.concat(tabula.read_pdf("http://www2.cao.ie/points/lvl8_19.pdf", 
                                      pages = 'all', pandas_options={'header': None}))

In [68]:
# Have a look at the imported data
df2019_l8_load

Unnamed: 0,0,1,2,3
0,Course Code,INSTITUTION and COURSE,EOS,Mid
1,,Athlone Institute of Technology,,
2,AL801,Software Design with Virtual Reality and Gaming,304,328
3,AL802,Software Design with Cloud Computing,301,306
4,AL803,Software Design with Mobile Apps and Connected...,309,337
...,...,...,...,...
37,WD200,Arts (options),221,296.0
38,WD210,Software Systems Development,271,329.0
39,WD211,Creative Computing,275,322.0
40,WD212,Recreation and Sport Management,274,311.0


In [69]:
# Set the first row of new dataframe as header
# https://www.codegrepper.com/code-examples/python/frameworks/django/change+header+to+first+row+pandas
df2019_l8_load.columns = df2019_l8_load.iloc[0]
df2019_l8 = df2019_l8_load[1:]

In [70]:
df2019_l8

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid
1,,Athlone Institute of Technology,,
2,AL801,Software Design with Virtual Reality and Gaming,304,328
3,AL802,Software Design with Cloud Computing,301,306
4,AL803,Software Design with Mobile Apps and Connected...,309,337
5,AL805,Network Management and Cloud Infrastructure,329,442
...,...,...,...,...
37,WD200,Arts (options),221,296.0
38,WD210,Software Systems Development,271,329.0
39,WD211,Creative Computing,275,322.0
40,WD212,Recreation and Sport Management,274,311.0


In [71]:
# Reset index as tabula had a new index for each page of the pdf
df2019_l8 = df2019_l8.reset_index()

In [72]:
# Delete rows that contain NaN in the Course Code column
df2019_l8 = df2019_l8.dropna(subset = ['Course Code'])

In [73]:
# Extract rows where points contain a symbol and include them in new df
# All courses with # required Test / Interview / Portfolio / Audition
df2019_l8_portfolio = df2019_l8[df2019_l8['EOS'].str.match("#", na=False)]

In [74]:
df2019_l8_portfolio

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
24,25,AL861,Animation and Illustration (portfolio),#615,899
25,26,AL863,Graphic and Digital Design (portfolio),#703,898
31,32,CW038,"Art (portfolio, Wexford)",#700,700
58,15,CW858,"Sports Management and Coaching (options, portf...",#700,700.0
74,31,CR121,Music at CIT Cork School of Music,#633,1052.0
...,...,...,...,...,...
858,45,MH802,"Community and Youth Work (part-time, in service",#,
859,46,MH803,"Local Studies or Community Studies (part-time,...",#,
879,11,SG244,Fine Art,#355,496.0
924,1,TL889,Counselling with Addiction - Mature applicants...,#,


In [75]:
# Extracting courses where not everyone with those points were offered places
df2019_l8_random = df2019_l8[df2019_l8['EOS'].str[-1] == '*']

In [76]:
df2019_l8_random

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
129,31,CK201,Commerce,465*,489.0
163,10,CK704,Occupational Therapy,532*,554.0
166,13,CK707,Medical and Health Sciences,510*,543.0
172,19,CK791,Medicine - Graduate Entry (GAMSAT required),#58*,59.0
179,26,CM001,Education - Primary Teaching,452*,462.0
211,3,DC002,Education - Primary Teaching,462*,485.0
452,24,RC004,Physiotherapy,532*,542.0
460,32,TR006,Psychology,555*,577.0
463,35,TR012,History and Political Science,532*,557.0
467,39,TR018,Law and French,532*,554.0


In [77]:
# Extracting courses where all qualified applicants earned places
df2019_l8_aqa = df2019_l8[df2019_l8['EOS'].str.match("AQA", na=False)]

In [78]:
df2019_l8_aqa

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid


In [79]:
# Add new column with point numbers removing non-digit characters
df2019_l8 = df2019_l8.assign(Points=df2019_l8['EOS'].str.replace(r'\D', '', regex=True))

In [80]:
df2019_l8

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid,Points
1,2,AL801,Software Design with Virtual Reality and Gaming,304,328,304
2,3,AL802,Software Design with Cloud Computing,301,306,301
3,4,AL803,Software Design with Mobile Apps and Connected...,309,337,309
4,5,AL805,Network Management and Cloud Infrastructure,329,442,329
5,6,AL810,Quantity Surveying,307,349,307
...,...,...,...,...,...,...
960,37,WD200,Arts (options),221,296.0,221
961,38,WD210,Software Systems Development,271,329.0,271
962,39,WD211,Creative Computing,275,322.0,275
963,40,WD212,Recreation and Sport Management,274,311.0,274


In [81]:
# Create a file path for the pandas data.
path2019_l8 = 'cao-data/cao2019_l8_' + current_time + '.csv'

In [82]:
# Save pandas data frame to disk.
df2019_l8.to_csv(path2019_l8)

## 2019 Level 7/6 Points

In [83]:
# Request CAO website for 2019 level 7 and 6 url
url2019_l76 = 'http://www2.cao.ie/points/lvl76_19.pdf'

In [84]:
# Create a file path for the original data.
path2019_l76_pdf = 'cao-data/cao2019_l76_' + current_time + '.pdf'

In [85]:
# Save pdf to disk
urlrq.urlretrieve(url2019_l76, path2019_l76_pdf)

('cao-data/cao2019_l76_20211123_130330.pdf',
 <http.client.HTTPMessage at 0x233cc518f40>)

In [86]:
# Read pdf into panda using tabula
# Using concat as tabula creates a new dataframe for each page of the pdf
# Takes pandas options to remove header as tabula assigns first row on each new page as the header 
df2019_l76_load = pd.concat(tabula.read_pdf("http://www2.cao.ie/points/lvl76_19.pdf", 
                                      pages = 'all', pandas_options={'header': None}))

In [87]:
# Have a look at the imported data
df2019_l76_load

Unnamed: 0,0,1,2,3
0,,ADMISSION DATA 2019,,
1,,End of Season,,
2,,"Level 6, 7",,
3,,The details given are for general information...,,
4,*,Not all on this points score were offered places,,
...,...,...,...,...
1,WD188,Applied Health Care,206,339
2,WD205,Molecular Biology with Biopharmaceutical Science,208,441
3,WD206,Electronic Engineering,191,322
4,WD207,Mechanical Engineering,179,330


In [88]:
# Rename the columns to match the 2019 level 8 data
df2019_l76_load.columns = ['Course Code', 'INSTITUTION and COURSE', 'EOS', 'Mid']

In [89]:
# With the 2019 level 8 data, tabula ignored the introductory information 
# But it is included in the 2019 level 7 and 6 data so we have to remove it
# Reusing regular expressions to extract rows with the course codes we want
df2019_l76 = df2019_l76_load[df2019_l76_load['Course Code'].str.match(re_course, na=False)]

In [90]:
# Reset index as tabula had a new index for each page of the pdf
df2019_l76 = df2019_l76.reset_index()

In [91]:
df2019_l76

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
0,10,AL600,Software Design,205,306
1,11,AL601,Computer Engineering,196,272
2,12,AL602,Mechanical Engineering,258,424
3,13,AL604,Civil Engineering,252,360
4,14,AL630,Pharmacy Technician,306,366
...,...,...,...,...,...
456,1,WD188,Applied Health Care,206,339
457,2,WD205,Molecular Biology with Biopharmaceutical Science,208,441
458,3,WD206,Electronic Engineering,191,322
459,4,WD207,Mechanical Engineering,179,330


In [92]:
# Extract rows where points contain a symbol and include them in new df
# All courses with # required Test / Interview / Portfolio / Audition
df2019_l76_portfolio = df2019_l76[df2019_l76['EOS'].str.match("#", na=False)]

In [93]:
df2019_l76_portfolio

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
27,37,AL763,Graphic Design (portfolio),#626,820.0
35,46,CW057,"Art (portfolio, Wexford)",#700,700.0
57,16,CW807,Sport Coaching and Business Management - GAA (...,#700,700.0
58,17,CW817,Sport Coaching and Business Management - Rugby...,#700,700.0
59,18,CW827,Sport Coaching and Business Management - Socce...,#700,700.0
155,11,DT598,Visual Merchandising and Display,#463,873.0
195,1,TA014,Advertising and Marketing Communications,#288,676.0
225,34,TR802,Dental Hygiene,#520,531.0
226,35,TR803,Dental Technology,#367,477.0
241,51,DK767,Community Youth Work,#280,360.0


In [94]:
# Extracting courses where not everyone with those points were offered places
df2019_l76_random = df2019_l76[df2019_l76['EOS'].str[-1] == '*']

In [95]:
df2019_l76_random

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
224,33,TR801,Dental Nursing,379*,430.0
322,26,LY847,Veterinary Nursing,347*,378.0


In [96]:
# Extracting courses where all qualified applicants earned places
df2019_l76_aqa = df2019_l76[df2019_l76['EOS'].str.match("AQA", na=False)]

In [97]:
df2019_l76_aqa

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
8,18,AL651,Music and Instrument Technology,AQA,296.0
9,19,AL660,Culinary Arts,AQA,216.0
10,20,AL661,Bar Supervision,AQA,216.0
11,21,AL663,Business (Sport and Recreation),AQA,270.0
67,27,PC404,Applied Social Studies - Professional Social Care,AQA,245.0
105,13,DS701,Business,AQA,
109,18,DB528,Applied Social Care,AQA,307.0
114,23,DB572,Business in Accounting,AQA,403.0
116,25,DB574,Business in Information Technology,AQA,367.0
162,20,GC411,Business (Dublin),AQA,349.0


In [98]:
# Add new column with point numbers removing non-digit characters
df2019_l76 = df2019_l76.assign(Points=df2019_l76['EOS'].str.replace(r'\D', '', regex=True))

In [99]:
#### NEED TO FIX, LAST FEW VALUES NOT OUTPUT CORRECTLY
df2019_l76

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid,Points
0,10,AL600,Software Design,205,306,205
1,11,AL601,Computer Engineering,196,272,196
2,12,AL602,Mechanical Engineering,258,424,258
3,13,AL604,Civil Engineering,252,360,252
4,14,AL630,Pharmacy Technician,306,366,306
...,...,...,...,...,...,...
456,1,WD188,Applied Health Care,206,339,
457,2,WD205,Molecular Biology with Biopharmaceutical Science,208,441,
458,3,WD206,Electronic Engineering,191,322,
459,4,WD207,Mechanical Engineering,179,330,


In [100]:
# Create a file path for the pandas data.
path2019_l76 = 'cao-data/cao2019_l76_' + current_time + '.csv'

In [101]:
# Save pandas data frame to disk.
df2019_l76.to_csv(path2019_l76)

## 2018 Level 8 Points

In [102]:
# Request CAO website for 2018 level 8 url
url2018_l8 = 'http://www2.cao.ie/points/lvl8_18.pdf'

In [103]:
# Create a file path for the original data.
path2018_l8_pdf = 'cao-data/cao2018_l8_' + current_time + '.pdf'

In [104]:
# Save pdf to disk
urlrq.urlretrieve(url2018_l8, path2018_l8_pdf)

('cao-data/cao2018_l8_20211123_130330.pdf',
 <http.client.HTTPMessage at 0x233cc608ca0>)

In [105]:
# Read pdf into panda using tabula
# Using concat as tabula creates a new dataframe for each page of the pdf
# Takes pandas options to remove header as tabula assigns first row on each new page as the header 
df2018_l8_load = pd.concat(tabula.read_pdf("http://www2.cao.ie/points/lvl8_18.pdf", 
                                      pages = 'all', pandas_options={'header': None}))

In [106]:
df2018_l8_load

Unnamed: 0,0,1,2,3
0,,ADMISSION DATA 2018,,
1,,End of Season,,
2,,Level 8,,
3,,The details given are for general information...,,
4,*,Not all on this points score were offered places,,
...,...,...,...,...
7,WD197,The Internet of Things,260,329
8,WD200,Arts,220,299
9,WD210,Software Systems Development,289,327
10,WD211,Creative Computing,265,326


In [107]:
# Rename the columns to match the 2019 level 8 data
df2018_l8_load.columns = ['Course Code', 'INSTITUTION and COURSE', 'EOS', 'Mid']

In [108]:
# Need to filter out the introductory blurb that isn't needed in the dataframe
# Reusing regular expressions to extract rows with the course codes we want
df2018_l8 = df2018_l8_load[df2018_l8_load['Course Code'].str.match(re_course, na=False)]

In [109]:
# Reset index as tabula had a new index for each page of the pdf
df2018_l8 = df2018_l8.reset_index()

In [110]:
df2018_l8

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
0,10,AL801,Software Design (Game Development or Cloud Com...,295,326
1,11,AL810,Quantity Surveying,300,340
2,12,AL820,Mechanical and Polymer Engineering,299,371
3,13,AL830,General Nursing,418,440
4,14,AL832,Psychiatric Nursing,377,388
...,...,...,...,...,...
898,7,WD197,The Internet of Things,260,329
899,8,WD200,Arts,220,299
900,9,WD210,Software Systems Development,289,327
901,10,WD211,Creative Computing,265,326


In [111]:
# Extract rows where points contain a symbol and include them in new df
# All courses with # required Test / Interview / Portfolio / Audition
df2018_l8_portfolio = df2018_l8[df2018_l8['EOS'].str.match("#", na=False)]

In [112]:
df2018_l8_portfolio

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
20,30,AL861,Animation and Illustration (Portfolio /Intervi...,#789,948
24,36,CW038,Art (Portfolio - Wexford),#700,700
47,8,CW858,Sport Management and Coaching (with options - ...,#700,700
60,25,CR121,Music at CIT Cork School of Music,#628,1016
61,26,CR125,Popular Music: Electric Bass Guitar at CIT Cor...,#836,875
...,...,...,...,...,...
799,8,MH802,"Community and Youth Work (part-time, in service)",#,
800,9,MH803,"Local Studies or Community Studies (part-time,...",#,
818,31,SG244,Fine Art,#446,446
863,24,TL889,Counselling with Addiction (mature applicants ...,#,


In [113]:
# Extracting courses where not everyone with those points were offered places
df2018_l8_random = df2018_l8[df2018_l8['EOS'].str[-1] == '*']

In [114]:
df2018_l8_random

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
72,37,CR320,Biomedical Science - Offered jointly by CIT an...,521*,542
146,5,CK705,Speech and Language Therapy,509*,530
149,8,CK710,General Nursing,440*,473
152,11,CK730,Intellectual Disability Nursing,409*,434
154,13,CK791,Medicine (Graduate Entry),#57*,58
414,31,RC001,Medicine - Undergraduate Entry,#730*,731
415,32,RC004,Physiotherapy,532*,543
417,34,RC101,Medicine - Graduate Entry,#58*,61
430,49,TR018,Law and French,566*,589
436,1,TR024,European Studies,532*,555


In [115]:
# Extracting courses where all qualified applicants earned places
df2018_l8_aqa = df2018_l8[df2018_l8['EOS'].str.match("AQA", na=False)]

In [116]:
df2018_l8_aqa

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
488,1,DN272,"Horticulture, Landscape and Sportsturf Management",AQA,321
504,17,DN530,Humanities,AQA,358
514,27,DN750,Social Policy and Sociology,AQA,359


In [117]:
# Add new column with point numbers removing non-digit characters
df2018_l8 = df2018_l8.assign(Points=df2018_l8['EOS'].str.replace(r'\D', '', regex=True))

In [118]:
df2018_l8

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid,Points
0,10,AL801,Software Design (Game Development or Cloud Com...,295,326,295
1,11,AL810,Quantity Surveying,300,340,300
2,12,AL820,Mechanical and Polymer Engineering,299,371,299
3,13,AL830,General Nursing,418,440,418
4,14,AL832,Psychiatric Nursing,377,388,377
...,...,...,...,...,...,...
898,7,WD197,The Internet of Things,260,329,260
899,8,WD200,Arts,220,299,220
900,9,WD210,Software Systems Development,289,327,289
901,10,WD211,Creative Computing,265,326,265


In [119]:
# Create a file path for the pandas data.
path2018_l8 = 'cao-data/cao2018_l8_' + current_time + '.csv'

In [120]:
# Save pandas data frame to disk.
df2018_l8.to_csv(path2018_l8)

## 2018 Level 7/6 Points

In [121]:
# Request CAO website for 2018 level 7 and 6 url
url2018_l76 = 'http://www2.cao.ie/points/lvl76_18.pdf'

In [122]:
# Create a file path for the original data.
path2018_l76_pdf = 'cao-data/cao2018_l76_' + current_time + '.pdf'

In [123]:
# Save pdf to disk
urlrq.urlretrieve(url2019_l76, path2019_l76_pdf)

('cao-data/cao2019_l76_20211123_130330.pdf',
 <http.client.HTTPMessage at 0x233cbe1a400>)

In [124]:
# Read pdf into panda using tabula
# Using concat as tabula creates a new dataframe for each page of the pdf
# Takes pandas options to remove header as tabula assigns first row on each new page as the header 
df2018_l76_load = pd.concat(tabula.read_pdf("http://www2.cao.ie/points/lvl76_18.pdf", 
                                      pages = 'all', pandas_options={'header': None}))

In [125]:
# Have a look at the imported data
df2018_l76_load

Unnamed: 0,0,1,2,3
0,,ADMISSION DATA 2018,,
1,,End of Season,,
2,,"Level 6, 7",,
3,,The details given are for general information...,,
4,*,Not all on this points score were offered places,,
...,...,...,...,...
41,WD205,Molecular Biology with Biopharmaceutical Science,217,398.0
42,WD206,Electronic Engineering,175,330.0
43,WD207,Mechanical Engineering,182,362.0
44,WD208,Manufacturing Engineering,180,298.0


In [126]:
# Rename the columns to match the 2019 level 8 data
df2018_l76_load.columns = ['Course Code', 'INSTITUTION and COURSE', 'EOS', 'Mid']

In [127]:
# Reusing regular expressions to extract rows with the course codes we want
df2018_l76 = df2018_l76_load[df2018_l76_load['Course Code'].str.match(re_course, na=False)]

In [128]:
# Reset index as tabula had a new index for each page of the pdf
df2018_l76 = df2018_l76.reset_index()

In [129]:
df2018_l76

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
0,10,AL601,Electronics and Computer Engineering,240,321
1,11,AL602,Mechanical Engineering,201,299
2,12,AL604,Civil Engineering,243,320
3,13,AL630,Pharmacy Technician,306,388
4,14,AL631,Dental Nursing,307,348
...,...,...,...,...,...
466,41,WD205,Molecular Biology with Biopharmaceutical Science,217,398.0
467,42,WD206,Electronic Engineering,175,330.0
468,43,WD207,Mechanical Engineering,182,362.0
469,44,WD208,Manufacturing Engineering,180,298.0


In [130]:
# Extract rows where points contain a symbol and include them in new df
# All courses with # required Test / Interview / Portfolio / Audition
df2018_l76_portfolio = df2018_l76[df2018_l76['EOS'].str.match("#", na=False)]

In [131]:
df2018_l76_portfolio

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
26,36,AL763,Graphic Design (Portfolio Required),#783,842.0
35,47,CW057,Art (Portfolio - Wexford),#700,700.0
55,16,CW807,Sports Coaching and Business Management - GAA ...,#700,700.0
56,17,CW817,Sports Coaching and Business Management - Rugb...,#700,700.0
57,18,CW827,Sports Coaching and Business Management - Socc...,#700,700.0
150,15,DT598,Visual Merchandising and Display,#517,789.0
194,13,TA014,Advertising and Marketing Communications,#620,712.0
222,47,TR802,Dental Hygiene,#424,462.0
223,48,TR803,Dental Technology,#336,388.0
242,15,DK767,Community Youth Work,#240,300.0


In [132]:
# Extracting courses where not everyone with those points were offered places
df2018_l76_random = df2018_l76[df2018_l76['EOS'].str[-1] == '*']

In [133]:
df2018_l76_random

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
221,46,TR801,Dental Nursing,357*,434.0


In [134]:
# Extracting courses where all qualified applicants earned places
df2018_l76_aqa = df2018_l76[df2018_l76['EOS'].str.match("AQA", na=False)]

In [135]:
df2018_l76_aqa

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid
7,17,AL651,Music and Instrument Technology,AQA,308.0
8,18,AL652,Business in Equine,AQA,281.0
9,19,AL660,Culinary Arts,AQA,208.0
63,26,PC402,Humanities,AQA,318.0
64,27,PC404,Applied Social Studies in Social Care,AQA,227.0
99,14,DS602,Computing and Multimedia,AQA,92.0
114,31,DB576,Film and Media,AQA,298.0
162,31,GC416,Business (Dublin),AQA,198.0
168,37,GC466,Film and TV Production (Dublin),AQA,320.0
169,38,GC470,Photographic Media (Dublin),AQA,349.0


In [136]:
# Add new column with point numbers removing non-digit characters
df2018_l76 = df2018_l76.assign(Points=df2018_l76['EOS'].str.replace(r'\D', '', regex=True))

In [137]:
df2018_l76

Unnamed: 0,index,Course Code,INSTITUTION and COURSE,EOS,Mid,Points
0,10,AL601,Electronics and Computer Engineering,240,321,240
1,11,AL602,Mechanical Engineering,201,299,201
2,12,AL604,Civil Engineering,243,320,243
3,13,AL630,Pharmacy Technician,306,388,306
4,14,AL631,Dental Nursing,307,348,307
...,...,...,...,...,...,...
466,41,WD205,Molecular Biology with Biopharmaceutical Science,217,398.0,217
467,42,WD206,Electronic Engineering,175,330.0,175
468,43,WD207,Mechanical Engineering,182,362.0,182
469,44,WD208,Manufacturing Engineering,180,298.0,180


In [138]:
# Create a file path for the pandas data.
path2018_l76 = 'cao-data/cao2018_l76_' + current_time + '.csv'

In [139]:
# Save pandas data frame to disk.
df2018_l76.to_csv(path2018_l76)

## References