In [1]:
import os
import sqlite3 as sq

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
# Raise pandas' display limits so wide and long frames are shown in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

### Loading data

#### Reading the SQLite database - amend `dbfile` below to point to your local copy

In [17]:
# Location of the SQLite database. Set the SQA_DB_PATH environment variable to
# point at your own copy; otherwise the original author's path is used.
dbfile = os.environ.get(
    'SQA_DB_PATH',
    'C:/Users/emitc/OneDrive/Documents/STA/Dresscode/sta-it402-dresscode/code/data/sqa-data-db.sqlite',
)

# sqlite3.connect() silently creates a new, empty database when the file is
# missing, which only surfaces later as "no such table" errors - fail early.
if not os.path.isfile(dbfile):
    raise FileNotFoundError(f"SQLite database not found: {dbfile}")

# Create a SQL connection to our SQLite database
conn = sq.connect(dbfile)

In [18]:
# Open a cursor on the connection for running raw SQL
cur = conn.cursor()

# Ask SQLite's catalogue for the name of every table, then show the list
table_list = cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'").fetchall()
print(table_list)

[('school_rolls',), ('sqa_qualification_list',), ('sqa_data',), ('subject_groups',)]


In [5]:
# Read the whole school_rolls table into a DataFrame and preview the first rows
df1 = pd.read_sql_query(sql='SELECT * FROM school_rolls', con=conn)
df1.head()

Unnamed: 0,year,secondary_schools,secondary_pupils,base_year_census,index_factor,predicted
0,1966.0,,283592.0,316594.0,0.895759,0
1,1967.0,665.0,288054.0,316594.0,0.909853,0
2,1968.0,640.0,295625.0,316594.0,0.933767,0
3,1969.0,616.0,307185.0,316594.0,0.970281,0
4,1970.0,441.0,317027.0,316594.0,1.001368,0


In [6]:
# Column dtypes and non-null counts for the school_rolls frame
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               56 non-null     float64
 1   secondary_schools  54 non-null     float64
 2   secondary_pupils   56 non-null     float64
 3   base_year_census   56 non-null     float64
 4   index_factor       56 non-null     float64
 5   predicted          56 non-null     int64  
dtypes: float64(5), int64(1)
memory usage: 2.8 KB


In [7]:
# Read the whole sqa_qualification_list table into a DataFrame and preview it
df2 = pd.read_sql_query(sql='SELECT * FROM sqa_qualification_list', con=conn)
df2.head()

Unnamed: 0,QualificationId,QualificationTitle,SCQFLevel,SecondaryYear,AdditionalYears,DataStartYear,DataEndYear
0,Higher,Higher,6,5.0,6,1986,
1,AdvancedHigher,Advanced Higher,7,6.0,,2001,
2,OrdinaryGrade,Ordinary Grade,5,4.0,5;6,1986,1994
3,NewHigher,New Higher,6,5.0,6,2000;2015,2002;2015
4,StandardGrade,Standard Grade,5,4.0,5;6,1986,2013


In [8]:
# Column dtypes and non-null counts for the sqa_qualification_list frame
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   QualificationId     19 non-null     object 
 1   QualificationTitle  19 non-null     object 
 2   SCQFLevel           17 non-null     object 
 3   SecondaryYear       12 non-null     float64
 4   AdditionalYears     6 non-null      object 
 5   DataStartYear       14 non-null     object 
 6   DataEndYear         6 non-null      object 
dtypes: float64(1), object(6)
memory usage: 1.2+ KB


In [9]:
# Read the whole sqa_data table into a DataFrame and preview the first rows
df3 = pd.read_sql_query(sql='SELECT * FROM sqa_data', con=conn)
df3.head()

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents
0,CSYS,1986,Art and Design (Enquiry),127.0,male,Entries,86.0
1,CSYS,1986,Art and Design (Enquiry),127.0,male,percentage,0.0
2,CSYS,1986,Art and Design (Enquiry),127.0,male,A,7.0
3,CSYS,1986,Art and Design (Enquiry),127.0,male,B,13.0
4,CSYS,1986,Art and Design (Enquiry),127.0,male,C,25.0


In [10]:
# Column dtypes and non-null counts for the sqa_data frame
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159668 entries, 0 to 159667
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   qualification    159668 non-null  object 
 1   year             159668 non-null  object 
 2   Subject          159668 non-null  object 
 3   NumberOfCentres  118152 non-null  float64
 4   gender           159668 non-null  object 
 5   grade            159668 non-null  object 
 6   NoOfStudents     133842 non-null  float64
dtypes: float64(2), object(5)
memory usage: 8.5+ MB


In [11]:
# Read the whole subject_groups table into a DataFrame and preview the first rows
df4 = pd.read_sql_query(sql='SELECT * FROM subject_groups', con=conn)
df4.head()

Unnamed: 0,Subject,SubjectGroup,CommonSubjectLabel,qualification,year,SubjectTaken,SCQFLevel
0,English,English,English,National3,2014,Yes,3
1,English,English,English,National3,2015,Yes,3
2,English,English,English,National3,2016,Yes,3
3,English,English,English,National3,2017,Yes,3
4,English,English,English,National3,2018,Yes,3


In [12]:
# Column dtypes and non-null counts for the subject_groups frame
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6176 entries, 0 to 6175
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Subject             6176 non-null   object
 1   SubjectGroup        6176 non-null   object
 2   CommonSubjectLabel  2582 non-null   object
 3   qualification       6176 non-null   object
 4   year                6176 non-null   int64 
 5   SubjectTaken        6176 non-null   object
 6   SCQFLevel           6176 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 337.9+ KB


#### Finished reading SQLite database

##### How many different grades are there?

In [10]:
# List the distinct labels that appear in the grade column of sqa_data
df3['grade'].unique()

array(['Entries', 'percentage', 'A', 'B', 'C', 'D', 'E', 'Passes', '1',
       '2', '3', '4', '5', '6', '7', 'Comp', 'COMP',
       'PassesesInUngradedCourses', 'PassesUngraded',
       'PassesUngradedCourses', 'A-C', 'Pass', 'Attainment', 'SCQF3',
       'SCQF4', 'SCQF5', 'SCQF6', 'SCQF2', 'NoAward', 'A-B', 'A-D*',
       'A-D'], dtype=object)

In [11]:
# Count the distinct grade labels
df3['grade'].nunique()

32

#### Create and read in grade types table

In [14]:
# Load the grade -> grade_type lookup; the relative path means the CSV is
# looked up in the notebook's current working directory
df_grade_groups = pd.read_csv("grade_groups.csv")

In [15]:
# Preview the first rows of the grade lookup
df_grade_groups.head()

Unnamed: 0,grade,grade_type
0,1,numerical
1,2,numerical
2,3,numerical
3,4,numerical
4,5,numerical


In [16]:
# Column dtypes and non-null counts for the grade lookup
df_grade_groups.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   grade       32 non-null     object
 1   grade_type  32 non-null     object
dtypes: object(2)
memory usage: 640.0+ bytes


In [21]:
# Save the grade lookup as the grade_groups table in the SQLite database,
# replacing any previous copy. index=False stops pandas writing the frame's
# RangeIndex as an extra "index" column, so the table holds only the
# grade and grade_type columns.
df_grade_groups.to_sql("grade_groups", conn, if_exists="replace", index=False)

In [22]:
# List the tables again so the newly written table can be seen in the catalogue
table_list = cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'").fetchall()
print(table_list)

[('school_rolls',), ('sqa_qualification_list',), ('sqa_data',), ('subject_groups',), ('grade_groups',)]


In [23]:
# Be sure to close the connection; `conn` and `cur` cannot be used for
# further SQL after this point
conn.close()

#### Join grade groups dataframe to grades dataframe

In [27]:
# Attach grade_type to every sqa_data row by matching on the grade label.
# validate='many_to_one' makes pandas raise if the lookup lists a grade more
# than once, which would otherwise silently duplicate sqa_data rows.
df5 = pd.merge(df3, df_grade_groups, on='grade', how='inner', validate='many_to_one')

# An inner join drops rows whose grade has no entry in the lookup - make that
# loud instead of silently losing data.
if len(df5) != len(df3):
    raise ValueError(
        f"{len(df3) - len(df5)} sqa_data rows have a grade missing from grade_groups"
    )

In [28]:
# Column dtypes and non-null counts for the merged frame
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159668 entries, 0 to 159667
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   qualification    159668 non-null  object 
 1   year             159668 non-null  object 
 2   Subject          159668 non-null  object 
 3   NumberOfCentres  118152 non-null  float64
 4   gender           159668 non-null  object 
 5   grade            159668 non-null  object 
 6   NoOfStudents     133842 non-null  float64
 7   grade_type       159668 non-null  object 
dtypes: float64(2), object(6)
memory usage: 11.0+ MB


In [31]:
# Sanity check on the merge: show the rows labelled with the 'scqf' grade type
is_scqf = df5['grade_type'] == 'scqf'
df5.loc[is_scqf]

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents,grade_type
143741,NationalCertificate,2014,Accounting,,male,SCQF3,,scqf
143742,NationalCertificate,2014,Accounting,,female,SCQF3,,scqf
143743,NationalCertificate,2014,Accounting,,all,SCQF3,,scqf
143744,NationalCertificate,2014,Acting and Theatre Performance,,male,SCQF3,,scqf
143745,NationalCertificate,2014,Acting and Theatre Performance,,female,SCQF3,,scqf
...,...,...,...,...,...,...,...,...
157164,NationalProgressionAward,2021,Web Design Fundamentals,,male,SCQF2,,scqf
157165,NationalProgressionAward,2021,Web Design Fundamentals,,female,SCQF2,,scqf
157166,NationalProgressionAward,2021,Zoo Animal Behaviour and Welfare,,all,SCQF2,,scqf
157167,NationalProgressionAward,2021,Zoo Animal Behaviour and Welfare,,male,SCQF2,,scqf


In [43]:
# Pull out the National5 Urdu rows for 2020 (year is compared as a string)
dummy = df5.loc[
    df5['qualification'].eq('National5')
    & df5['Subject'].eq('Urdu')
    & df5['year'].eq('2020')
]

#### Sort the merged dataframe

In [47]:
# Order rows by qualification, then year, subject, gender and grade,
# and look at the last 40 rows of the result
sort_columns = ['qualification', 'year', 'Subject', 'gender', 'grade']
df5 = df5.sort_values(by=sort_columns)
df5.tail(40)

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents,grade_type
126346,StandardGrade,2013,Technological Studies,79.0,female,5,6.0,numerical
129229,StandardGrade,2013,Technological Studies,79.0,female,6,2.0,numerical
132112,StandardGrade,2013,Technological Studies,79.0,female,7,0.0,numerical
17353,StandardGrade,2013,Technological Studies,79.0,female,Entries,76.0,entries
34595,StandardGrade,2013,Technological Studies,79.0,female,percentage,6.0,percentage
109838,StandardGrade,2013,Technological Studies,79.0,male,1,439.0,numerical
113951,StandardGrade,2013,Technological Studies,79.0,male,2,299.0,numerical
118064,StandardGrade,2013,Technological Studies,79.0,male,3,187.0,numerical
122232,StandardGrade,2013,Technological Studies,79.0,male,4,143.0,numerical
126345,StandardGrade,2013,Technological Studies,79.0,male,5,91.0,numerical


### Create a subroutine to deal with letter_groups

In [49]:
#no_change = ['Entries', 'A', 'B', 'C', 'NoAward', 'Pass', 'Passes']

def calculate_grades(x):
    """Print every 'letter_group' row of a group and hand the group back.

    Parameters
    ----------
    x : pandas.DataFrame
        One group of rows; must carry a ``grade_type`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, unmodified.
    """
    letter_group_rows = x[x['grade_type'] == 'letter_group']
    # Each matching row is printed as a Series, in the group's row order
    for _, row in letter_group_rows.iterrows():
        print(row)
    return x


In [50]:
# Run calculate_grades once per (qualification, year, Subject, gender) group;
# it prints each letter_group row it finds and returns the group unchanged,
# so expect a lot of printed output from this cell.
df_em = df5.groupby(['qualification', 'year', 'Subject', 'gender']).apply(calculate_grades)

qualification      AdvancedHigher
year                         2014
Subject                Accounting
NumberOfCentres               NaN
gender                     female
grade                         A-C
NoOfStudents                   11
grade_type           letter_group
Name: 138367, dtype: object
qualification      AdvancedHigher
year                         2014
Subject                Accounting
NumberOfCentres               NaN
gender                       male
grade                         A-C
NoOfStudents                    8
grade_type           letter_group
Name: 138366, dtype: object
qualification           AdvancedHigher
year                              2014
Subject            Applied Mathematics
NumberOfCentres                    NaN
gender                          female
grade                              A-C
NoOfStudents                        86
grade_type                letter_group
Name: 138369, dtype: object
qualification           AdvancedHigher
year                 

Name: 139087, dtype: object
qualification      AdvancedHigher
year                         2016
Subject                   Biology
NumberOfCentres               NaN
gender                       male
grade                         A-C
NoOfStudents                  610
grade_type           letter_group
Name: 139086, dtype: object
qualification           AdvancedHigher
year                              2016
Subject            Business Management
NumberOfCentres                    NaN
gender                          female
grade                              A-C
NoOfStudents                       162
grade_type                letter_group
Name: 139089, dtype: object
qualification           AdvancedHigher
year                              2016
Subject            Business Management
NumberOfCentres                    NaN
gender                            male
grade                              A-C
NoOfStudents                       127
grade_type                letter_group
Name: 139088, dtype:

Name: 139427, dtype: object
qualification      AdvancedHigher
year                         2018
Subject                   English
NumberOfCentres               NaN
gender                       male
grade                         A-C
NoOfStudents                  529
grade_type           letter_group
Name: 139426, dtype: object
qualification      AdvancedHigher
year                         2018
Subject                    French
NumberOfCentres               NaN
gender                     female
grade                         A-C
NoOfStudents                  422
grade_type           letter_group
Name: 139429, dtype: object
qualification      AdvancedHigher
year                         2018
Subject                    French
NumberOfCentres               NaN
gender                       male
grade                         A-C
NoOfStudents                  132
grade_type           letter_group
Name: 139428, dtype: object
qualification         AdvancedHigher
year                            201

Name: 139695, dtype: object
qualification      AdvancedHigher
year                         2019
Subject                Statistics
NumberOfCentres               NaN
gender                   NotKnown
grade                         A-C
NoOfStudents                  NaN
grade_type           letter_group
Name: 139694, dtype: object
qualification      AdvancedHigher
year                         2019
Subject                Statistics
NumberOfCentres               NaN
gender                     female
grade                         A-C
NoOfStudents                   68
grade_type           letter_group
Name: 139693, dtype: object
qualification      AdvancedHigher
year                         2019
Subject                Statistics
NumberOfCentres               NaN
gender                       male
grade                         A-C
NoOfStudents                   98
grade_type           letter_group
Name: 139692, dtype: object
qualification      AdvancedHigher
year                         2020
Subj

Name: 158215, dtype: object
qualification      AdvancedHigher
year                         2020
Subject                   Physics
NumberOfCentres               NaN
gender                        all
grade                         A-C
NoOfStudents                 1386
grade_type           letter_group
Name: 139975, dtype: object
qualification      AdvancedHigher
year                         2020
Subject                   Physics
NumberOfCentres               NaN
gender                        all
grade                        A-D*
NoOfStudents                 1609
grade_type           letter_group
Name: 158989, dtype: object
qualification      AdvancedHigher
year                         2020
Subject                   Physics
NumberOfCentres               NaN
gender                     female
grade                         A-B
NoOfStudents                  321
grade_type           letter_group
Name: 158214, dtype: object
qualification      AdvancedHigher
year                         2020
Subj

Name: 158582, dtype: object
qualification                AdvancedHigher
year                                   2021
Subject            Mathematics of Mechanics
NumberOfCentres                         NaN
gender                                 male
grade                                   A-C
NoOfStudents                            217
grade_type                     letter_group
Name: 140342, dtype: object
qualification                AdvancedHigher
year                                   2021
Subject            Mathematics of Mechanics
NumberOfCentres                         NaN
gender                                 male
grade                                  A-D*
NoOfStudents                            233
grade_type                     letter_group
Name: 159074, dtype: object
qualification      AdvancedHigher
year                         2021
Subject            Modern Studies
NumberOfCentres               NaN
gender                        all
grade                         A-B
NoOfStud

qualification            Higher
year                       2014
Subject              Accounting
NumberOfCentres             NaN
gender                   female
grade                       A-C
NoOfStudents                447
grade_type         letter_group
Name: 138441, dtype: object
qualification            Higher
year                       2014
Subject              Accounting
NumberOfCentres             NaN
gender                     male
grade                       A-C
NoOfStudents                447
grade_type         letter_group
Name: 138440, dtype: object
qualification              Higher
year                         2014
Subject            Administration
NumberOfCentres               NaN
gender                     female
grade                         A-C
NoOfStudents                 1592
grade_type           letter_group
Name: 138443, dtype: object
qualification              Higher
year                         2014
Subject            Administration
NumberOfCentres               

Name: 138781, dtype: object
qualification               Higher
year                          2015
Subject            Classical Greek
NumberOfCentres                NaN
gender                        male
grade                          A-C
NoOfStudents                   NaN
grade_type            letter_group
Name: 138780, dtype: object
qualification                 Higher
year                            2015
Subject            Classical Studies
NumberOfCentres                  NaN
gender                        female
grade                            A-C
NoOfStudents                     169
grade_type              letter_group
Name: 138783, dtype: object
qualification                 Higher
year                            2015
Subject            Classical Studies
NumberOfCentres                  NaN
gender                          male
grade                            A-C
NoOfStudents                     106
grade_type              letter_group
Name: 138782, dtype: object
qualification   

Name: 139209, dtype: object
qualification            Higher
year                       2016
Subject                   Latin
NumberOfCentres             NaN
gender                     male
grade                       A-C
NoOfStudents                117
grade_type         letter_group
Name: 139208, dtype: object
qualification            Higher
year                       2016
Subject             Mathematics
NumberOfCentres             NaN
gender                   female
grade                       A-C
NoOfStudents               6794
grade_type         letter_group
Name: 139211, dtype: object
qualification            Higher
year                       2016
Subject             Mathematics
NumberOfCentres             NaN
gender                     male
grade                       A-C
NoOfStudents               7163
grade_type         letter_group
Name: 139210, dtype: object
qualification            Higher
year                       2016
Subject                   Media
NumberOfCentres         

Name: 139496, dtype: object
qualification                   Higher
year                              2018
Subject            Engineering Science
NumberOfCentres                    NaN
gender                          female
grade                              A-C
NoOfStudents                        77
grade_type                letter_group
Name: 139499, dtype: object
qualification                   Higher
year                              2018
Subject            Engineering Science
NumberOfCentres                    NaN
gender                            male
grade                              A-C
NoOfStudents                       630
grade_type                letter_group
Name: 139498, dtype: object
qualification            Higher
year                       2018
Subject                 English
NumberOfCentres             NaN
gender                   female
grade                       A-C
NoOfStudents              16998
grade_type         letter_group
Name: 139501, dtype: object
qualific

Name: 139803, dtype: object
qualification                          Higher
year                                     2019
Subject            Health and Food Technology
NumberOfCentres                           NaN
gender                               NotKnown
grade                                     A-C
NoOfStudents                              NaN
grade_type                       letter_group
Name: 139802, dtype: object
qualification                          Higher
year                                     2019
Subject            Health and Food Technology
NumberOfCentres                           NaN
gender                                 female
grade                                     A-C
NoOfStudents                              625
grade_type                       letter_group
Name: 139801, dtype: object
qualification                          Higher
year                                     2019
Subject            Health and Food Technology
NumberOfCentres                           

Name: 158268, dtype: object
qualification            Higher
year                       2020
Subject               Economics
NumberOfCentres             NaN
gender                   female
grade                       A-C
NoOfStudents                189
grade_type         letter_group
Name: 140028, dtype: object
qualification            Higher
year                       2020
Subject               Economics
NumberOfCentres             NaN
gender                   female
grade                       A-D
NoOfStudents                197
grade_type         letter_group
Name: 159147, dtype: object
qualification            Higher
year                       2020
Subject               Economics
NumberOfCentres             NaN
gender                     male
grade                       A-B
NoOfStudents                271
grade_type         letter_group
Name: 158267, dtype: object
qualification            Higher
year                       2020
Subject               Economics
NumberOfCentres         

Name: 158357, dtype: object
qualification            Higher
year                       2020
Subject                 Spanish
NumberOfCentres             NaN
gender                     male
grade                       A-C
NoOfStudents                765
grade_type         letter_group
Name: 140117, dtype: object
qualification            Higher
year                       2020
Subject                 Spanish
NumberOfCentres             NaN
gender                     male
grade                       A-D
NoOfStudents                790
grade_type         letter_group
Name: 159236, dtype: object
qualification            Higher
year                       2020
Subject                    Urdu
NumberOfCentres             NaN
gender                      all
grade                       A-B
NoOfStudents                117
grade_type         letter_group
Name: 158362, dtype: object
qualification            Higher
year                       2020
Subject                    Urdu
NumberOfCentres         

Name: 158702, dtype: object
qualification            Higher
year                       2021
Subject                   Latin
NumberOfCentres             NaN
gender                     male
grade                       A-C
NoOfStudents                113
grade_type         letter_group
Name: 140462, dtype: object
qualification            Higher
year                       2021
Subject                   Latin
NumberOfCentres             NaN
gender                     male
grade                       A-D
NoOfStudents                114
grade_type         letter_group
Name: 159476, dtype: object
qualification            Higher
year                       2021
Subject             Mathematics
NumberOfCentres             NaN
gender                      all
grade                       A-B
NoOfStudents              12541
grade_type         letter_group
Name: 158707, dtype: object
qualification            Higher
year                       2021
Subject             Mathematics
NumberOfCentres         

qualification      Intermediate2
year                        2014
Subject               Accounting
NumberOfCentres              NaN
gender                    female
grade                        A-C
NoOfStudents                  87
grade_type          letter_group
Name: 138567, dtype: object
qualification      Intermediate2
year                        2014
Subject               Accounting
NumberOfCentres              NaN
gender                      male
grade                        A-C
NoOfStudents                  62
grade_type          letter_group
Name: 138566, dtype: object
qualification       Intermediate2
year                         2014
Subject            Administration
NumberOfCentres               NaN
gender                     female
grade                         A-C
NoOfStudents                 1056
grade_type           letter_group
Name: 138569, dtype: object
qualification       Intermediate2
year                         2014
Subject            Administration
NumberOfCentre

qualification      Intermediate2
year                        2015
Subject                  English
NumberOfCentres              NaN
gender                    female
grade                        A-C
NoOfStudents                 673
grade_type          letter_group
Name: 138919, dtype: object
qualification      Intermediate2
year                        2015
Subject                  English
NumberOfCentres              NaN
gender                      male
grade                        A-C
NoOfStudents                 706
grade_type          letter_group
Name: 138918, dtype: object
qualification                                Intermediate2
year                                                  2015
Subject            English for Speakers of Other Languages
NumberOfCentres                                        NaN
gender                                              female
grade                                                  A-C
NoOfStudents                                            59
gra

qualification         National5
year                       2020
Subject              Accounting
NumberOfCentres             NaN
gender                      all
grade                       A-B
NoOfStudents                674
grade_type         letter_group
Name: 158365, dtype: object
qualification         National5
year                       2020
Subject              Accounting
NumberOfCentres             NaN
gender                      all
grade                       A-C
NoOfStudents                794
grade_type         letter_group
Name: 140125, dtype: object
qualification         National5
year                       2020
Subject              Accounting
NumberOfCentres             NaN
gender                      all
grade                       A-D
NoOfStudents                849
grade_type         letter_group
Name: 159244, dtype: object
qualification         National5
year                       2020
Subject              Accounting
NumberOfCentres             NaN
gender              

Name: 158458, dtype: object
qualification         National5
year                       2020
Subject                   Media
NumberOfCentres             NaN
gender                      all
grade                       A-C
NoOfStudents                703
grade_type         letter_group
Name: 140218, dtype: object
qualification         National5
year                       2020
Subject                   Media
NumberOfCentres             NaN
gender                      all
grade                       A-D
NoOfStudents                858
grade_type         letter_group
Name: 159337, dtype: object
qualification         National5
year                       2020
Subject                   Media
NumberOfCentres             NaN
gender                   female
grade                       A-B
NoOfStudents                361
grade_type         letter_group
Name: 158457, dtype: object
qualification         National5
year                       2020
Subject                   Media
NumberOfCentres         

Name: 159570, dtype: object
qualification                National5
year                              2021
Subject            Engineering Science
NumberOfCentres                    NaN
gender                            male
grade                              A-B
NoOfStudents                      1131
grade_type                letter_group
Name: 158795, dtype: object
qualification                National5
year                              2021
Subject            Engineering Science
NumberOfCentres                    NaN
gender                            male
grade                              A-C
NoOfStudents                      1407
grade_type                letter_group
Name: 140555, dtype: object
qualification                National5
year                              2021
Subject            Engineering Science
NumberOfCentres                    NaN
gender                            male
grade                              A-D
NoOfStudents                      1549
grade_type         

qualification         NewHigher
year                       2015
Subject              Accounting
NumberOfCentres             NaN
gender                   female
grade                       A-C
NoOfStudents                238
grade_type         letter_group
Name: 138989, dtype: object
qualification         NewHigher
year                       2015
Subject              Accounting
NumberOfCentres             NaN
gender                     male
grade                       A-C
NoOfStudents                207
grade_type         letter_group
Name: 138988, dtype: object
qualification                  NewHigher
year                                2015
Subject            Administration and IT
NumberOfCentres                      NaN
gender                            female
grade                                A-C
NoOfStudents                        1899
grade_type                  letter_group
Name: 138991, dtype: object
qualification                  NewHigher
year                               

#### Exploring the grades further below

In [32]:
# Inspect every row recorded for Advanced Higher Music in 2004
ah_music_2004 = (
    (df3['qualification'] == 'AdvancedHigher')
    & (df3['year'] == '2004')
    & (df3['Subject'] == 'Music')
)
df3.loc[ah_music_2004]

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents
64071,AdvancedHigher,2004,Music,266.0,male,Entries,384.0
64072,AdvancedHigher,2004,Music,266.0,male,percentage,43.0
64073,AdvancedHigher,2004,Music,266.0,male,A,182.0
64074,AdvancedHigher,2004,Music,266.0,male,B,112.0
64075,AdvancedHigher,2004,Music,266.0,male,C,60.0
64076,AdvancedHigher,2004,Music,266.0,male,Passes,354.0
64077,AdvancedHigher,2004,Music,266.0,male,COMP,7.0
64078,AdvancedHigher,2004,Music,266.0,female,Entries,508.0
64079,AdvancedHigher,2004,Music,266.0,female,percentage,57.0
64080,AdvancedHigher,2004,Music,266.0,female,A,270.0


In [14]:
# Inspect every row recorded for Ordinary Grade Biology in 1993
ograde_biology_1993 = (
    (df3['qualification'] == 'OrdinaryGrade')
    & (df3['Subject'] == 'Biology')
    & (df3['year'] == '1993')
)
df3.loc[ograde_biology_1993]

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents
24486,OrdinaryGrade,1993,Biology,12.0,male,Entries,9.0
24487,OrdinaryGrade,1993,Biology,12.0,male,percentage,0.0
24488,OrdinaryGrade,1993,Biology,12.0,male,1,
24489,OrdinaryGrade,1993,Biology,12.0,male,2,
24490,OrdinaryGrade,1993,Biology,12.0,male,3,2.0
24491,OrdinaryGrade,1993,Biology,12.0,male,4,2.0
24492,OrdinaryGrade,1993,Biology,12.0,male,5,3.0
24493,OrdinaryGrade,1993,Biology,12.0,female,Entries,9.0
24494,OrdinaryGrade,1993,Biology,12.0,female,percentage,0.0
24495,OrdinaryGrade,1993,Biology,12.0,female,1,


In [12]:
# All National5 rows that have a student count recorded (NoOfStudents not null)
has_student_count = df3['NoOfStudents'].notna()
df3.loc[(df3['qualification'] == 'National5') & has_student_count]

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents
124203,National5,2014,Accounting,,all,Entries,777.0
124204,National5,2014,Accounting,,male,Entries,350.0
124205,National5,2014,Accounting,,female,Entries,427.0
124206,National5,2014,Accounting,,all,A,381.0
124207,National5,2014,Accounting,,all,B,148.0
...,...,...,...,...,...,...,...
155880,National5,2021,Urdu,,all,A,52.0
155881,National5,2021,Urdu,,all,A-B,64.0
155882,National5,2021,Urdu,,all,A-C,66.0
155883,National5,2021,Urdu,,all,A-D,66.0


In [13]:
# Inspect every National5 Urdu row, across all years
national5_urdu = (df3['qualification'] == 'National5') & (df3['Subject'] == 'Urdu')
df3.loc[national5_urdu]

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents
124720,National5,2014,Urdu,,all,Entries,42.0
124721,National5,2014,Urdu,,male,Entries,17.0
124722,National5,2014,Urdu,,female,Entries,25.0
124723,National5,2014,Urdu,,all,A,34.0
124724,National5,2014,Urdu,,all,B,3.0
124725,National5,2014,Urdu,,all,C,5.0
124726,National5,2014,Urdu,,all,D,0.0
124727,National5,2014,Urdu,,male,A,12.0
124728,National5,2014,Urdu,,female,A,22.0
124729,National5,2014,Urdu,,male,Pass,17.0


In [14]:
# Higher rows for 2017 (the year is matched as the string '2017').
df3.loc[
    df3['year'].eq('2017')
    & df3['qualification'].eq('Higher')
]

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents
135843,Higher,2017,Accounting,,all,Entries,1319.0
135844,Higher,2017,Accounting,,male,Entries,725.0
135845,Higher,2017,Accounting,,female,Entries,594.0
135846,Higher,2017,Accounting,,male,A,276.0
135847,Higher,2017,Accounting,,female,A,220.0
...,...,...,...,...,...,...,...
136344,Higher,2017,Urdu,,female,A-C,68.0
136345,Higher,2017,Urdu,,all,A,84.0
136346,Higher,2017,Urdu,,all,B,11.0
136347,Higher,2017,Urdu,,all,C,5.0


#### Create a subset of sqa_data with qualifications from 2000 onwards

In [15]:
# Subset of sqa_data restricted to the years 2000-2021 inclusive.
# The 'year' column holds string labels, so the range is converted to strings
# before matching; range(2000, 2022) stops at 2021.
years_2000_21 = [str(year) for year in range(2000, 2022)]

# A single membership test replaces the 22 chained `==` comparisons: it selects
# the same rows but cannot silently skip or mistype an individual year.
df_subset2000_21 = df3.loc[df3['year'].isin(years_2000_21)]

In [19]:
# Subset of sqa_data restricted to a fixed set of qualification types.
# Keeping the labels in one list makes the selection easy to audit and extend.
qualifications_of_interest = [
    'Higher',
    'Intermediate1',
    'Intermediate2',
    'NewHigher',
    'AdvancedHigher',
    'National2',
    'National3',
    'National4',
    'National5',
    'NationalCertificate',
    'NationalProgressionAward',
]

# A single membership test replaces the 11 chained `==` comparisons and
# selects the same rows.
df_subset_quali = df3.loc[df3['qualification'].isin(qualifications_of_interest)]

In [20]:
# Summarise the qualification subset: row count, per-column non-null counts and dtypes.
df_subset_quali.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117932 entries, 500 to 159667
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   qualification    117932 non-null  object 
 1   year             117932 non-null  object 
 2   Subject          117932 non-null  object 
 3   NumberOfCentres  78342 non-null   float64
 4   gender           117932 non-null  object 
 5   grade            117932 non-null  object 
 6   NoOfStudents     96274 non-null   float64
dtypes: float64(2), object(5)
memory usage: 7.2+ MB


In [22]:
# Sanity check: list the distinct year labels present in the 2000-2021 subset.
df_subset2000_21['year'].unique()

array(['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021'], dtype=object)

In [21]:
# List the distinct grade labels present in the qualification subset.
# NOTE(review): the recorded output contains near-duplicate labels
# (e.g. 'Comp'/'COMP', 'PassesUngraded'/'PassesUngradedCourses') — presumably
# these need normalising before any aggregation by grade; TODO confirm.
df_subset_quali['grade'].unique()

array(['Entries', 'A', 'B', 'C', 'Passes', 'D', 'percentage', 'Comp',
       'COMP', 'PassesesInUngradedCourses', 'PassesUngraded',
       'PassesUngradedCourses', 'A-C', 'Pass', 'Attainment', 'SCQF3',
       'SCQF4', 'SCQF5', 'SCQF6', 'SCQF2', '3', 'NoAward', 'A-B', 'A-D*',
       'A-D'], dtype=object)

In [37]:
# Higher Accounting rows for 2014 within the qualification subset.
df_subset_quali.loc[
    df_subset_quali['qualification'].eq('Higher')
    & df_subset_quali['Subject'].eq('Accounting')
    & df_subset_quali['year'].eq('2014')
]

Unnamed: 0,qualification,year,Subject,NumberOfCentres,gender,grade,NoOfStudents
121816,Higher,2014,Accounting,,all,Entries,1277.0
121817,Higher,2014,Accounting,,male,Entries,638.0
121818,Higher,2014,Accounting,,female,Entries,638.0
121819,Higher,2014,Accounting,,male,A,217.0
121820,Higher,2014,Accounting,,female,A,211.0
121821,Higher,2014,Accounting,,male,A-C,447.0
121822,Higher,2014,Accounting,,female,A-C,447.0
121823,Higher,2014,Accounting,,all,A,434.0
121824,Higher,2014,Accounting,,all,B,255.0
121825,Higher,2014,Accounting,,all,C,217.0
