In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%pip install tueplots
from tueplots import bundles

Note: you may need to restart the kernel to use updated packages.


In [4]:
def load_results(year):
    """Load one year of exam results from ``../data/bac-results-<year>.csv``.

    Cells that can be parsed as numbers are converted to numeric values,
    cells that cannot keep their original content, and cells that are still
    missing afterwards become empty strings.

    Parameters
    ----------
    year : int or str
        Year used to build the CSV file name.

    Returns
    -------
    pandas.DataFrame
        The cleaned results, indexed by the first CSV column.
    """
    # low_memory=False lets pandas infer every column's type from the whole
    # file instead of chunk by chunk.
    # NOTE(review): presumably the warning this cell used to emit was the
    # mixed-type DtypeWarning -- confirm on a fresh run.
    raw = pd.read_csv("../data/bac-results-{}.csv".format(year), index_col=0, low_memory=False)

    # convert to numeric what is possible; where conversion fails, fall back
    # to the value that was read from the file
    converted = raw.apply(pd.to_numeric, errors='coerce').fillna(raw)

    # replace NaNs with empty strings for easier searchability
    return converted.fillna('')


results_2019 = load_results(2019)
results_2020 = load_results(2020)
results_2021 = load_results(2021)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# lift the column limit so that wide frames are rendered without truncation
pd.set_option('display.max_columns', None)

# first rows of the 2019 data, followed by the full list of column names
display(results_2019.head(5))

print("\nColumns: " + repr(results_2019.columns.tolist()))

Unnamed: 0,id,county_rank,country_rank,high_school,county,prev_promotion,school_type,specialization,romanian_competency,romanian_written,romanian_written_appeal,romanian_written_final,mother_tongue,mother_tongue_competency,mother_tongue_written,mother_tongue_written_appeal,mother_tongue_written_final,modern_language,modern_language_results,mandatory_subject,mandatory_subject_written,mandatory_subject_written_appeal,mandatory_subject_written_final,elective_subject,elective_subject_written,elective_subject_written_appeal,elective_subject_written_final,digital_competencies,final_grade,successful
0,AB110322,1843,96667,"LICEUL TEHNOLOGIC ""ALEXANDRU DOMSA"" ALBA IULIA",AB,DA,Zi,TEHNICIAN PROIECTANT CAD,Utilizator experimentat,5.0,,5.0,,,,,,LIMBA ENGLEZA,B2-B1-A2-B2-B2,MATEMATICA TEHN,7.25,,7.25,BIOLOGIE VEGETALA SI ANIMALA,3.7,,3.7,Utilizator experimentat,,Respins
1,AB110406,1844,96668,LICEUL TEHNOLOGIC AIUD,AB,DA,Zi,TEHNICIAN ELECTROTEHNIST,Utilizator experimentat,5.0,,5.0,,,,,,LIMBA ENGLEZA,B1-A2-A2-B2-B2,MATEMATICA TEHN,2.7,,2.7,BIOLOGIE VEGETALA SI ANIMALA,7.2,,7.2,Utilizator avansat,,Respins
2,AB112567,1845,96669,LICEUL TEHNOLOGIC SILVIC CIMPENI,AB,DA,Seral,TEHNICIAN MECATRONIST,Utilizator experimentat,6.0,,6.0,,,,,,LIMBA FRANCEZA,A1-A1-B2-A1-A1,MATEMATICA TEHN,1.0,,1.0,BIOLOGIE VEGETALA SI ANIMALA,7.45,,7.45,Utilizator nivel mediu,,Respins
3,AB115225,1846,96670,LICEUL TEHNOLOGIC SILVIC CIMPENI,AB,DA,Zi,TEHNICIAN IN SILVICULTURA SI EXPLOATARI FOREST...,Utilizator experimentat,5.25,,5.25,,,,,,LIMBA ENGLEZA,B1-B1--A2-A2,MATEMATICA TEHN,5.95,,5.95,BIOLOGIE VEGETALA SI ANIMALA,2.3,,2.3,Utilizator nivel mediu,,Respins
4,AB116350,2513,136007,LICEUL TEHNOLOGIC SILVIC CIMPENI,AB,DA,Zi,TEHNICIAN DESIGNER MOBILA SI AMENAJARI INTERIOARE,Utilizator experimentat,Eliminat din examen,,Eliminat din examen,,,,,,LIMBA ENGLEZA,A2-A2--A2-A2,MATEMATICA TEHN,5.15,,5.15,BIOLOGIE VEGETALA SI ANIMALA,5.85,,5.85,Utilizator avansat,,Eliminat din examen



Columns: ['id', 'county_rank', 'country_rank', 'high_school', 'county', 'prev_promotion', 'school_type', 'specialization', 'romanian_competency', 'romanian_written', 'romanian_written_appeal', 'romanian_written_final', 'mother_tongue', 'mother_tongue_competency', 'mother_tongue_written', 'mother_tongue_written_appeal', 'mother_tongue_written_final', 'modern_language', 'modern_language_results', 'mandatory_subject', 'mandatory_subject_written', 'mandatory_subject_written_appeal', 'mandatory_subject_written_final', 'elective_subject', 'elective_subject_written', 'elective_subject_written_appeal', 'elective_subject_written_final', 'digital_competencies', 'final_grade', 'successful']


In [6]:
# final-grade columns that must hold a number (or be blank) for a 2019 row to be kept
final_grade_cols = ['romanian_written_final', 'mother_tongue_written_final',
                    'mandatory_subject_written_final', 'elective_subject_written_final']


def _is_number_or_blank(cell):
    """Return a truthy value for numeric cells and for empty strings.

    Any other text (the grade cell contains "DISQUALIFIED" etc. strings) is
    meant to be rejected; np.isreal is relied on for that, and the explicit
    comparison lets blank cells through.
    """
    return np.isreal(cell) or cell == ''


def _outcome_counts(frame):
    """Count rows per lower-cased `successful` value, skipping rows where it is blank."""
    with_outcome = frame[frame['successful'] != '']
    return (with_outcome
            .groupby(with_outcome['successful'].str.lower())
            .size()
            .reset_index(name='count'))


# remove invalid rows from the 2019 data (grade cell contains "DISQUALIFIED" etc. strings)
# Series.map per column is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases.
grade_cells_ok = results_2019[final_grade_cols].apply(lambda col: col.map(_is_number_or_blank))
results_2019_2 = results_2019[grade_cells_ok.all(axis=1)]

# combine the three years
results_all = pd.concat([results_2019_2, results_2020, results_2021], ignore_index=True, sort=False)

# separate the combined dataset into romanians and minorities
# (a blank mother_tongue means no mother-tongue exam was registered)
romanians_only = results_all[results_all['mother_tongue'] == '']
romanians_plus = results_all[results_all['mother_tongue'] != '']

print("\nRomanians' overall exam results:")
display(_outcome_counts(romanians_only))

print("\nminorities' overall exam results:")
display(_outcome_counts(romanians_plus))


Romanians' overall exam results:


Unnamed: 0,successful,count
0,eliminat din examen,410
1,neprezentat,14397
2,respins,122645
3,reusit,259460



minorities' overall exam results:


Unnamed: 0,successful,count
0,eliminat din examen,5
1,neprezentat,365
2,respins,7362
3,reusit,13189


In [7]:
# column name abbreviations for final grades
ro = 'romanian_written_final'
mo = 'mother_tongue_written_final'
ma = 'mandatory_subject_written_final'
el = 'elective_subject_written_final'

# prevent string errors
romanians_only = romanians_only.replace('', np.nan)
romanians_plus = romanians_plus.replace('', np.nan)


def _rows_with_valid_grades(frame, grade_cols):
    """Return a copy of the rows of `frame` that have a grade >= 1.0 in every column of `grade_cols`.

    The grade columns are converted to numeric dtype first, so that the
    filter, the row-wise means and the group-wise means below all operate on
    the same numeric values. Rows with a missing grade are dropped; a grade
    cell that cannot be converted raises instead of being silently skipped.
    """
    numeric = frame.assign(**{col: pd.to_numeric(frame[col]) for col in grade_cols})
    # a missing grade compares as False and therefore excludes the row
    all_grades_valid = numeric[grade_cols].ge(1.0).all(axis=1)
    return numeric[all_grades_valid].copy()


# filter both datasets for valid grades
romanians_only_f = _rows_with_valid_grades(romanians_only, [ma, el, ro])
romanians_plus_f = _rows_with_valid_grades(romanians_plus, [ma, el, mo, ro])


# calculate the total average
romanians_only_f['total_average'] = romanians_only_f[[ro, ma, el]].mean(axis=1)
romanians_plus_f['total_average'] = romanians_plus_f[[ro, ma, el, mo]].mean(axis=1)

print("Romanians' average grade: {:0.3f}".format(romanians_only_f['total_average'].mean()))
print("Minorities' average grade: {:0.3f}".format(romanians_plus_f['total_average'].mean()))
# -> Romanians' overall grades are worse (by only 0.01), so everything is right, right?


print("\nRomanians' average Romanian: {:0.3f}".format(romanians_only_f[ro].mean()))
print("Minorities' average Romanian: {:0.3f}".format(romanians_plus_f[ro].mean()))
display(romanians_plus_f.groupby(['mother_tongue'])[ro].mean())

print("\nMinorities' average mother tongue: {:0.3f}".format(romanians_plus_f[mo].mean()))
display(romanians_plus_f.groupby(['mother_tongue'])[mo].mean())
# -> But the Romanian grades of the minorities are much worse (1.22),
# -> and minorities are getting better grades in their mother tongues (0.57)


# calculate the average grade of their specialization (mandatory and elective subjects)
romanians_only_f['subject_average'] = romanians_only_f[[ma, el]].mean(axis=1)
romanians_plus_f['subject_average'] = romanians_plus_f[[ma, el]].mean(axis=1)

print("\nRomanians' average subject grade: {:0.3f}".format(romanians_only_f['subject_average'].mean()))
print("Minorities' average subject grade: {:0.3f}".format(romanians_plus_f['subject_average'].mean()))
display(romanians_plus_f.groupby(['mother_tongue'])['subject_average'].mean())
# -> The specialization's grades are also 0.294 better for minorities (which is a clear win)
# -> The specialization's grades are also 0.294 better for minorities (which is a clear win)

Romanians' average grade: 6.946
Minorities' average grade: 6.958

Romanians' average Romanian: 7.061
Minorities' average Romanian: 5.841


mother_tongue
LIMBA CROATA             6.230952
LIMBA GERMANA            8.298246
LIMBA ITALIANA           7.546067
LIMBA MAGHIARA (REAL)    5.471993
LIMBA MAGHIARA (UMAN)    5.666270
LIMBA SARBA              5.591803
LIMBA SLOVACA            5.655682
LIMBA TURCA              4.856667
LIMBA UCRAINEANA         5.495455
Name: romanian_written_final, dtype: float64


Minorities' average mother tongue: 7.630


mother_tongue
LIMBA CROATA             8.457143
LIMBA GERMANA            7.886808
LIMBA ITALIANA           8.360112
LIMBA MAGHIARA (REAL)    7.584354
LIMBA MAGHIARA (UMAN)    7.627776
LIMBA SARBA              7.751639
LIMBA SLOVACA            8.035227
LIMBA TURCA              8.166667
LIMBA UCRAINEANA         7.262762
Name: mother_tongue_written_final, dtype: float64


Romanians' average subject grade: 6.888
Minorities' average subject grade: 7.182


mother_tongue
LIMBA CROATA             6.194048
LIMBA GERMANA            8.269606
LIMBA ITALIANA           6.661236
LIMBA MAGHIARA (REAL)    6.941918
LIMBA MAGHIARA (UMAN)    7.603987
LIMBA SARBA              5.987705
LIMBA SLOVACA            6.516477
LIMBA TURCA              5.555000
LIMBA UCRAINEANA         4.930070
Name: subject_average, dtype: float64

Even with this, more minorities are passing the exams (final avg >= 6.0).

None of this implies that minorities are smarter; possible explanations include:
- only the smarter minority students attempt the exam (and if that is true, the fact that they still end up with the same average results is itself the disadvantage)
  - this is also supported by the fact that only 5% of the results come from minorities, although minorities make up at least 11% of Romania's population
  - but many minority students go to Romanian schools (by choice or due to lack of other options)
- minority schools might be better
- minority exam graders might be less severe

In [15]:
# share (in percent) of all combined results per mother tongue;
# the blank key collects the rows without a mother-tongue entry
(
    results_all
    .groupby(['mother_tongue'])
    .size()
    .div(len(results_all))
    .mul(100)
)

mother_tongue
                         94.992976
LIMBA CROATA              0.005505
LIMBA GERMANA             0.564340
LIMBA ITALIANA            0.021540
LIMBA MAGHIARA (REAL)     3.370485
LIMBA MAGHIARA (UMAN)     0.935063
LIMBA SARBA               0.014838
LIMBA SLOVACA             0.021300
LIMBA TURCA               0.003829
LIMBA UCRAINEANA          0.070124
dtype: float64