# Tests for the OEA framework

In [None]:
%run OEA_py

In [None]:
# Create the framework object (the OEA class is brought into scope by `%run OEA_py` above)
# and point it at the 'sandbox1' workspace before any test below runs.
oea = OEA()
oea.set_workspace('sandbox1')

In [None]:
def reset_additive_data_tests():
    """Remove the stage1 and stage2 studentattendance paths, if present, before the additive tests run."""
    for stale_path in (
        'stage1/Transactional/contoso_sis/v0.1/studentattendance',
        'stage2/Ingested/contoso_sis/v0.1/studentattendance',
    ):
        oea.rm_if_exists(stale_path)

def land_studentattendance_day1(expected_record_count: int) -> None:
    """Download the day1 studentattendance test CSV, land it, and verify the landed record count.

    The CSV text is fetched over HTTP, passed to ``oea.land`` as an additive batch,
    and the path returned by ``oea.land`` is read back with ``oea.load_csv``.

    Args:
        expected_record_count: Number of records the landed data must contain.

    Raises:
        requests.HTTPError: If the test data download returns an error status.
        AssertionError: If the landed record count differs from the expectation.
    """
    response = requests.get('https://raw.githubusercontent.com/aanseacorelearn/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day1/studentattendance/part1.csv')
    # Fail fast on 4xx/5xx so an HTTP error page is never landed as if it were CSV data.
    response.raise_for_status()
    sink_path = oea.land(response.text, 'contoso_sis/v0.1/studentattendance', 'studentattendance.csv', batch_data_type=oea.ADDITIVE_BATCH_DATA)
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load_csv(sink_path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def land_studentattendance_day2(expected_record_count: int) -> None:
    """Download the day2 studentattendance test CSV, land it, and verify the landed record count.

    The CSV text is fetched over HTTP, passed to ``oea.land`` as an additive batch,
    and the path returned by ``oea.land`` is read back with ``oea.load_csv``.

    Args:
        expected_record_count: Number of records the landed data must contain.

    Raises:
        requests.HTTPError: If the test data download returns an error status.
        AssertionError: If the landed record count differs from the expectation.
    """
    response = requests.get('https://raw.githubusercontent.com/aanseacorelearn/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day2/studentattendance/part1.csv')
    # Fail fast on 4xx/5xx so an HTTP error page is never landed as if it were CSV data.
    response.raise_for_status()
    sink_path = oea.land(response.text, 'contoso_sis/v0.1/studentattendance', 'studentattendance.csv', batch_data_type=oea.ADDITIVE_BATCH_DATA)
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load_csv(sink_path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def ingest_studentattendance(expected_record_count: int) -> None:
    """Run ingestion for studentattendance and verify the record count of the ingested table.

    Calls ``oea.ingest`` for 'contoso_sis/v0.1/studentattendance', then loads
    'stage2/Ingested/contoso_sis/v0.1/studentattendance' and checks its count.

    Args:
        expected_record_count: Number of records the ingested table must contain.

    Raises:
        AssertionError: If the ingested record count differs from the expectation.
    """
    oea.ingest('contoso_sis/v0.1/studentattendance')
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load('stage2/Ingested/contoso_sis/v0.1/studentattendance').count()
    # The message names the ingested (stage2) data, which is what is actually being checked here.
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in ingested data, but found {actual_record_count}'

# Additive batch scenario: start from a clean slate, then land and ingest two batches in order.
reset_additive_data_tests()
# test1 - Land the first batch of studentattendance data
land_studentattendance_day1(1464)
# test2 - Ingest the data from stage1 into stage2
ingest_studentattendance(1464)
# test3 - Run the same ingestion a second time and verify that it doesn't change what was ingested (ingestion is idempotent via use of _checkpoints)
ingest_studentattendance(1464)
# test4 - Land the second batch of studentattendance data
land_studentattendance_day2(2928)
# test5 - Ingest the second batch from stage1 into stage2; the expected total is day1 + day2 (1464 + 2928 = 4392)
ingest_studentattendance(4392)



In [None]:
def reset_delta_data_tests():
    """Remove the stage1 and stage2 student paths, if present, before the delta tests run."""
    for stale_path in (
        'stage1/Transactional/contoso_sis/v0.1/student',
        'stage2/Ingested/contoso_sis/v0.1/student',
    ):
        oea.rm_if_exists(stale_path)

def land_students_day1(expected_record_count: int) -> None:
    """Download the day1 students test CSV, land it, and verify the landed record count.

    The CSV text is fetched over HTTP, passed to ``oea.land`` as a delta batch,
    and the path returned by ``oea.land`` is read back with ``oea.load_csv``.

    Args:
        expected_record_count: Number of records the landed data must contain.

    Raises:
        requests.HTTPError: If the test data download returns an error status.
        AssertionError: If the landed record count differs from the expectation.
    """
    response = requests.get('https://raw.githubusercontent.com/aanseacorelearn/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day1/students/part1.csv')
    # Fail fast on 4xx/5xx so an HTTP error page is never landed as if it were CSV data.
    response.raise_for_status()
    sink_path = oea.land(response.text, 'contoso_sis/v0.1/student', 'student.csv', batch_data_type=oea.DELTA_BATCH_DATA)
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load_csv(sink_path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def land_students_day2(expected_record_count: int) -> None:
    """Download the day2 students test CSV, land it, and verify the landed record count.

    The CSV text is fetched over HTTP, passed to ``oea.land`` as a delta batch,
    and the path returned by ``oea.land`` is read back with ``oea.load_csv``.

    Args:
        expected_record_count: Number of records the landed data must contain.

    Raises:
        requests.HTTPError: If the test data download returns an error status.
        AssertionError: If the landed record count differs from the expectation.
    """
    response = requests.get('https://raw.githubusercontent.com/aanseacorelearn/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day2/students/part1.csv')
    # Fail fast on 4xx/5xx so an HTTP error page is never landed as if it were CSV data.
    response.raise_for_status()
    sink_path = oea.land(response.text, 'contoso_sis/v0.1/student', 'student.csv', batch_data_type=oea.DELTA_BATCH_DATA)
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load_csv(sink_path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def ingest_students(expected_record_count: int) -> None:
    """Run ingestion for student data and verify the record count of the ingested table.

    Calls ``oea.ingest`` for 'contoso_sis/v0.1/student' with 'SIS_ID' as the second
    argument (presumably the key column used to merge delta batches - confirm against
    OEA_py), then loads 'stage2/Ingested/contoso_sis/v0.1/student' and checks its count.

    Args:
        expected_record_count: Number of records the ingested table must contain.

    Raises:
        AssertionError: If the ingested record count differs from the expectation.
    """
    oea.ingest('contoso_sis/v0.1/student', 'SIS_ID')
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load('stage2/Ingested/contoso_sis/v0.1/student').count()
    # The message names the ingested (stage2) data, which is what is actually being checked here.
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in ingested data, but found {actual_record_count}'

# Delta batch scenario: start from a clean slate, then land and ingest two batches in order.
reset_delta_data_tests()
# test1 - Land the first batch of student data
land_students_day1(2)
# test2 - Ingest the data from stage1 into stage2
ingest_students(2)
# test3 - Run the same ingestion a second time and verify that it doesn't change what was ingested (ingestion is idempotent via use of _checkpoints)
ingest_students(2)
# test4 - Land the second batch of student data
land_students_day2(2)
# test5 - Ingest the second batch from stage1 into stage2; 3 records are expected after 2 + 2 landed
# (presumably one day2 record shares a SIS_ID with a day1 record and updates it - verify against the test data)
ingest_students(3)

In [None]:
def reset_snapshot_data_tests():
    """Remove the stage1 and stage2 studentsectionmark paths, if present, before the snapshot tests run."""
    for stale_path in (
        'stage1/Transactional/contoso_sis/v0.1/studentsectionmark',
        'stage2/Ingested/contoso_sis/v0.1/studentsectionmark',
    ):
        oea.rm_if_exists(stale_path)

def land_studentsectionmark_day1(expected_record_count: int) -> None:
    """Download the day1 studentsectionmark test CSV, land it, and verify the landed record count.

    The CSV text is fetched over HTTP, passed to ``oea.land``, and the path
    returned by ``oea.land`` is read back with ``oea.load_csv``.

    Args:
        expected_record_count: Number of records the landed data must contain.

    Raises:
        requests.HTTPError: If the test data download returns an error status.
        AssertionError: If the landed record count differs from the expectation.
    """
    response = requests.get('https://raw.githubusercontent.com/aanseacorelearn/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day1/studentsectionmark/part1.csv')
    # Fail fast on 4xx/5xx so an HTTP error page is never landed as if it were CSV data.
    response.raise_for_status()
    # NOTE(review): this cell is reset by reset_snapshot_data_tests, yet the batch is landed as
    # DELTA_BATCH_DATA - confirm whether a snapshot batch type was intended here.
    sink_path = oea.land(response.text, 'contoso_sis/v0.1/studentsectionmark', 'studentsectionmark.csv', batch_data_type=oea.DELTA_BATCH_DATA)
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load_csv(sink_path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def land_studentsectionmark_day2(expected_record_count: int) -> None:
    """Download the studentsectionmark test CSV for the second landing, land it, and verify the count.

    The CSV text is fetched over HTTP, passed to ``oea.land``, and the path
    returned by ``oea.land`` is read back with ``oea.load_csv``.

    Args:
        expected_record_count: Number of records the landed data must contain.

    Raises:
        requests.HTTPError: If the test data download returns an error status.
        AssertionError: If the landed record count differs from the expectation.
    """
    # NOTE(review): despite the "day2" name, this URL points at the day1 file (same URL as
    # land_studentsectionmark_day1). Looks like a copy-paste slip - confirm whether a
    # day2/studentsectionmark file exists and what record counts the tests should then expect.
    response = requests.get('https://raw.githubusercontent.com/aanseacorelearn/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day1/studentsectionmark/part1.csv')
    # Fail fast on 4xx/5xx so an HTTP error page is never landed as if it were CSV data.
    response.raise_for_status()
    # NOTE(review): this cell is reset by reset_snapshot_data_tests, yet the batch is landed as
    # DELTA_BATCH_DATA - confirm whether a snapshot batch type was intended here.
    sink_path = oea.land(response.text, 'contoso_sis/v0.1/studentsectionmark', 'studentsectionmark.csv', batch_data_type=oea.DELTA_BATCH_DATA)
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load_csv(sink_path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def ingest_studentsectionmark(expected_record_count: int) -> None:
    """Run ingestion for studentsectionmark and verify the record count of the ingested table.

    Calls ``oea.ingest`` for 'contoso_sis/v0.1/studentsectionmark', then loads
    'stage2/Ingested/contoso_sis/v0.1/studentsectionmark' and checks its count.

    Args:
        expected_record_count: Number of records the ingested table must contain.

    Raises:
        AssertionError: If the ingested record count differs from the expectation.
    """
    oea.ingest('contoso_sis/v0.1/studentsectionmark')
    # Count once and reuse the value in both the check and the failure message.
    actual_record_count = oea.load('stage2/Ingested/contoso_sis/v0.1/studentsectionmark').count()
    # The message names the ingested (stage2) data, which is what is actually being checked here.
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in ingested data, but found {actual_record_count}'

# Snapshot scenario: start from a clean slate, then land and ingest two batches in order.
reset_snapshot_data_tests()
# test1 - Land the first batch of studentsectionmark data
land_studentsectionmark_day1(12)
# test2 - Ingest the data from stage1 into stage2
ingest_studentsectionmark(12)
# test3 - Run the same ingestion a second time and verify that it doesn't change what was ingested (ingestion is idempotent via use of _checkpoints)
ingest_studentsectionmark(12)
# test4 - Land the second batch of studentsectionmark data
# NOTE(review): land_studentsectionmark_day2 currently downloads the day1 file, so this lands the same 12 records again - confirm intended
land_studentsectionmark_day2(12)
# test5 - Ingest the second batch from stage1 into stage2; the ingested count is expected to stay at 12
ingest_studentsectionmark(12)

In [None]:
def refine_contoso_sis(df_source):
    """Pseudonymize a batch of ingested studentattendance data and upsert the results into stage2/Refined.

    The metadata is fetched from the module catalog URL on each call. Its
    'studentattendance' entry drives ``oea.pseudonymize``, which yields two frames:
    the pseudonymized data (upserted under .../general/) and the lookup data
    (upserted under .../sensitive/).

    Args:
        df_source: The data to refine (passed in by ``oea.process`` - presumably a
            Spark DataFrame; confirm against OEA_py).
    """
    catalog_metadata = oea.get_metadata_from_url('https://raw.githubusercontent.com/aanseacorelearn/OpenEduAnalytics/gene/v0.7dev/modules/module_catalog/Student_and_School_Data_Systems/metadata.csv')
    attendance_metadata = catalog_metadata['studentattendance']
    pseudonymized_df, lookup_df = oea.pseudonymize(df_source, attendance_metadata)
    # General (pseudonymized) data is written first, then the sensitive lookup data.
    oea.upsert(pseudonymized_df, 'stage2/Refined/contoso_sis/v0.1/general/studentattendance')
    oea.upsert(lookup_df, 'stage2/Refined/contoso_sis/v0.1/sensitive/studentattendance')

# Run refine_contoso_sis over the ingested studentattendance data
# (depends on the additive tests above having populated stage2/Ingested).
oea.process('stage2/Ingested/contoso_sis/v0.1/studentattendance', refine_contoso_sis)

# Query a sample of the data refined into stage2/Refined (both the general and the sensitive output)
oea.display('stage2/Refined/contoso_sis/v0.1/general/studentattendance')
oea.display('stage2/Refined/contoso_sis/v0.1/sensitive/studentattendance')