In [113]:
import pandas as pd
import requests
import json
import time
import numpy as np
import os

cwd = os.getcwd() # working directory at start-up; later cells prepend it to the data_download/ input paths

# Get school location data

## Location data

### kommunekoder

In [47]:
# Read spreadsheet columns G:H of the municipality code file
kom_koder_raw = pd.read_excel(io = cwd+'/data_download/kom_koder.xls', usecols = 'G:H')

# Discard the first three rows (presumably title/header rows - confirm against the file)
# and renumber the remaining rows from zero
kom_df = kom_koder_raw.iloc[3:].reset_index(drop = True)
kom_df.columns = ['kom_kode','kom_navn']

# Plain list of the municipality codes, used to drive the download loop
kom_koder = list(kom_df['kom_kode'])

### School data

In [48]:
def get_school_data(kom_kode, timeout = 30):
    """
    Fetches data for schools by given kommune code and returns the decoded json response.

    Parameters
    ----------
    kom_kode : value sent as the `kommunenr` query parameter.
    timeout : seconds to wait for the server before giving up (default 30).
        Without a timeout a single stalled request would block the whole
        download loop indefinitely.

    Raises
    ------
    requests.HTTPError if the server answers with a 4xx/5xx status, so a failed
    request is reported here instead of surfacing later as a confusing parsing
    or KeyError problem.
    """
    url = 'https://dingeologi.appspot.com/_ah/api/skoledistriktendpoint/v1/getSkoler'
    # let requests build (and escape) the query string
    response = requests.get(url, params = {'kommunenr': kom_kode}, timeout = timeout)
    response.raise_for_status()
    return response.json()


In [49]:
def process_school_data(school_json):
    """
    Takes a json response (dict) as input and returns a dataframe.

    The dataframe has one row per entry of school_json['items'] and exactly the
    columns listed below (object dtype). Keys of an entry that are not in the
    column list are ignored; columns an entry does not provide are left as NaN.
    A response without an 'items' key (or with an empty one) gives an empty
    dataframe with the same columns instead of raising a KeyError.
    The institutionsnummer column is converted to str.
    """
    columns = ['navn', 'urlfriendly', 'leder', 'adresse', 'postnr', 'postnrby', 'telefon', 'email', 'www', 'introtekst', 'cvrnr', 'beliggenhedskommune', 'beliggenhedskommunenr', 'administrativkommune', 'opdateret', 'lat', 'lon', 'institutionstype2', 'institutionstype3', 'maxklassetrin', 'ejerforhold', 'oprettet', 'elevtal1516', 'elevtal1415', 'elevtal1314', 'elevtal1213', 'elevtal1112', 'insertdate', 'image', 'active', 'institutionsnummer']

    records = school_json.get('items') or []

    # Build the frame in one go from the list of dicts. Filling it cell by cell
    # through `df.loc[i][key] = val` is chained assignment, which writes to an
    # intermediate object and is not guaranteed to reach the frame.
    schools_df = pd.DataFrame(records, columns = columns, dtype = object)

    schools_df['institutionsnummer'] = schools_df['institutionsnummer'].astype(str)

    return schools_df

### From API

In [50]:
# MAIN DOWNLOADING PROCESS
# One request per municipality code; every response is turned into its own dataframe
school_data_list = []
for kom in kom_koder:
    school_data_list.append(process_school_data(get_school_data(kom)))

    time.sleep(0.5) # pause half a second between requests

In [51]:
# Columns kept in the location file
cols = ['navn', 'postnr', 'beliggenhedskommune', 'beliggenhedskommunenr', 'lat', 'lon', 'institutionstype2', 'institutionstype3', 'institutionsnummer']

# Stack the per-municipality frames, renumber the rows and keep the selected columns only
all_schools = pd.concat(school_data_list).reset_index(drop = True)[cols]

# Written with the default index column; get_all_data() reads this file back
all_schools.to_csv('school_location_dirty.csv')

0       101001
1       101003
2       101005
3       101007
4       101008
         ...  
2383    851110
2384    851111
2385    851112
2386    851114
2387    851221
Name: institutionsnummer, Length: 2388, dtype: object

## School student count

In [136]:
"""
TODO: integrate formatting function into data cleaning process 
"""

# Row 8 of the sheet (header = 7, zero-based) supplies the column names
student_count_raw = pd.read_excel(
    io = cwd+'/data_download/student_count.xls',
    header = 7,
)

def format_student_count(student_count_raw):
    """
    Prepares the raw student count sheet for merging onto grade and location data.

    Steps: drop the first two columns, rename 'Institutionsnummer' to
    'institutionsnummer', reduce that column to the first six characters of its
    string form, remove rows whose institution number is missing and replace
    '/' by '-' in all column names. The input frame is left untouched; the
    formatted dataframe is returned.
    """

    without_lead_cols = student_count_raw.iloc[:, 2:]
    student_count = without_lead_cols.rename(columns = {'Institutionsnummer': 'institutionsnummer'})

    # e.g. 101001.0 -> '101001.0' -> '101001'; a missing value becomes the text 'nan'
    inst_as_text = student_count.institutionsnummer.astype(str)
    student_count.institutionsnummer = inst_as_text.str[:6].astype(str)

    has_inst_num = student_count['institutionsnummer'] != 'nan'
    student_count = student_count[has_inst_num]

    # '2009/2010' -> '2009-2010', the spelling used for the school years elsewhere
    student_count.columns = student_count.columns.str.replace('/', '-')

    return student_count



In [28]:
"""student_count_raw = pd.read_excel(cwd+'/data_download/student_count.xls',header = 7)
student_count = student_count_raw.iloc[:, 2:]\
    .rename(columns = {'Institutionsnummer': 'institutionsnummer'})

student_count.institutionsnummer = student_count.institutionsnummer.astype(str).str[:6].astype(str)

student_count = student_count[student_count['institutionsnummer'] != 'nan']
student_count.columns = student_count.columns.str.replace('/','-')
student_count.to_csv('student_count_formatted.csv')"""

In [137]:
student_count = format_student_count(student_count_raw)

# get_all_data() loads the formatted table from 'student_count_formatted.csv'.
# The only code that wrote that file lives in a disabled (string-quoted) cell,
# so write it here to keep the notebook runnable from a fresh kernel.
student_count.to_csv('student_count_formatted.csv')

## School grades

In [30]:
def add_inst_list(raw_df):
    """
    The raw data from excel contains a column ('Rækkenavne') with both
    institution numbers and row labels: a row holding an institution number is
    followed by the data rows that belong to it. This function removes the
    institution number rows and adds a column 'institutionsnummer' that tells
    which institution each remaining row belongs to.

    Parameters
    ----------
    raw_df : dataframe with a 'Rækkenavne' column. It is not modified.

    Returns
    -------
    A new dataframe with the data rows only (original index labels kept) and
    'institutionsnummer' appended as the last column.

    Notes
    -----
    * A label counts as an institution number when its string form consists of
      decimal digits only; anything else (including a missing label) is
      treated as a data row.
    * Data rows that appear before the first institution number get a missing
      'institutionsnummer'.
    * Each data row takes the nearest institution number above it, so an
      institution number that occurs more than once is handled correctly.
    """

    labels = raw_df['Rækkenavne'].astype(str)
    is_inst_num = labels.str.isdecimal().astype(bool)

    # keep the number on its own row, blank out every other row, then carry
    # each number downwards over the data rows that follow it
    inst_num = labels.where(is_inst_num).ffill()

    df_out = raw_df.loc[~is_inst_num].copy()
    # positional assignment: both sides were selected with the same mask
    df_out['institutionsnummer'] = inst_num.loc[~is_inst_num].to_numpy()

    return df_out

# Merge and clean

## Merge grades, location and student count

In [44]:
# MERGE ALL DATA


def get_all_data(year):
    """
    This function loads raw grade data and formatted student count data for the
    specified year as well as dirty school location data and merges them into one
    combined dataframe ready for data cleaning.

    Parameters
    ----------
    year : str such as '2009-2010'. It selects the grade file
        data_download/grades_{year}.xls and the column of the student count
        table that is merged on (renamed to 'student_count').

    Returns
    -------
    Dataframe with the grade rows, left-joined with the student count of that
    year and outer-joined with the school location data, all on
    'institutionsnummer'. Because of the outer join, schools that only occur
    in the location data are included as well.

    Reads 'student_count_formatted.csv' and 'school_location_dirty.csv' from the
    current directory; both must have been written beforehand.

    TODO: Update, so function is not dependent on global variables.
    """

    grades_raw = pd.read_excel(cwd+f'/data_download/grades_{year}.xls', header = 6)
    grades = add_inst_list(grades_raw)

    # Read the merge key as text. With dtype inference a single missing value
    # turns the whole column into floats, and the string conversion below would
    # then produce keys like '101001.0' that never match the grade data.
    key_as_text = {'institutionsnummer': str}
    student_count = pd.read_csv('student_count_formatted.csv', dtype = key_as_text) # maybe create nested formatting function
    school_loc = pd.read_csv('school_location_dirty.csv', dtype = key_as_text) # maybe nest data fetching function here

    # prepare format for merge
    for df in [grades, student_count, school_loc]:
        df['institutionsnummer'] = df['institutionsnummer'].astype(str)

    # merge grades to student count
    school_grade_count = pd.merge(left = grades, right = student_count[['institutionsnummer', year]], how = 'left', on = 'institutionsnummer')\
                            .rename(columns = {year: 'student_count'})
    # merge to location data
    school_grade_count_loc = pd.merge(left = school_grade_count, right = school_loc, on = 'institutionsnummer', how = 'outer')

    return school_grade_count_loc

In [32]:
"""years = ['2009-2010', '2010-2011', '2012-2013', '2013-2014', '2014-2015', '2015-2016', '2016-2017', '2017-2018', '2018-2019']

schools_dirty = {}
for year in years:
    schools_data = get_all_data(year)
    schools_dirty[year] = schools_data"""

## Clean data

In [126]:
def clean_school_data(dirty_df):
    """
    Extracts overall grade point average (true value as well as the socioeconomic reference) for
    schools in input dataframe. Reshapes dataframe to one school/location point pr. row. Saves
    grade data, location data, student count as well as type of institution for further sorting.

    Parameters
    ----------
    dirty_df : merged dataframe containing 'Rækkenavne', 'institutionsnummer' and the
        numeric and text variables listed below.

    Returns
    -------
    Dataframe with one row per institutionsnummer: the mean of each numeric variable,
    the first available text values, a 'coordinates' column holding (lat, lon) tuples
    and 'postnr' / 'beliggenhedskommunenr' as integers. Schools with a missing value
    in any of these columns are dropped.
    """

    # Only the overall average rows (might update later to extract grades for all subjects)
    avg_df = dirty_df[dirty_df['Rækkenavne'] == 'Gennemsnit']

    # Define variables of interest, split into text and numeric
    num_vars = ['Karakter', 'Soc_ref', 'student_count', 'postnr', 'beliggenhedskommunenr', 'lat', 'lon']
    text_vars = ['navn', 'institutionstype2', 'institutionstype3']

    # Extract text variables: first row per school that has any text value.
    # Columns are picked by name; relabelling the result of a pivot/stack/unstack
    # chain by position would depend on the column order pandas happens to produce.
    text_df = avg_df[['institutionsnummer'] + text_vars]\
                .dropna(subset = text_vars, how = 'all')\
                .drop_duplicates('institutionsnummer')

    # Extract and clean numeric variables (pivot_table averages per school)
    num_df = pd.pivot_table(data = avg_df, index = 'institutionsnummer', values = num_vars)\
                .reset_index()

    # Merge together and drop missing values
    # OBS: for missing values we have only grade data, no location data.
    tidy = pd.merge(left = num_df, right = text_df, on = 'institutionsnummer', how = 'left')\
            .dropna()

    # Add geo coordinates and format for merge
    tidy['coordinates'] = list(zip(tidy['lat'], tidy['lon']))
    tidy.postnr = tidy.postnr.astype(int)
    tidy.beliggenhedskommunenr = tidy.beliggenhedskommunenr.astype(int)

    return tidy



In [138]:
# Process every school year, keep the result in a dict and save it as csv
# NOTE(review): '2011-2012' does not appear in this list - confirm that is intentional
years = [
    '2009-2010', '2010-2011', '2012-2013', '2013-2014', '2014-2015',
    '2015-2016', '2016-2017', '2017-2018', '2018-2019',
]

schools_clean = {}
for year in years:
    dirty = get_all_data(year)
    schools_clean[year] = clean = clean_school_data(dirty)
    clean.to_csv(f'school_clean_{year}.csv')

## Code for inspecting data etc - to be deleted

In [35]:
# schools_dirty is only assigned inside a disabled (string-quoted) cell, so looking
# it up fails on a fresh kernel; load the 2009-2010 data directly instead
test_df = get_all_data('2009-2010')
test_df_avg = test_df[test_df['Rækkenavne'] == 'Gennemsnit']

In [36]:
# Variables of interest for the inspection cells, split into numeric and text
num_vars = ['Soc_ref', 'Karakter',  'student_count', 'postnr', 'beliggenhedskommunenr', 'lat', 'lon',]
text_vars = [ 'institutionstype2', 'institutionstype3', 'navn']

# One row per institution number with the mean of every numeric variable
num_vars_df = test_df_avg.pivot_table(index = ['institutionsnummer'], values = num_vars).reset_index()

In [37]:
#pivot_transpose = test_df.pivot(columns = 'institutionsnummer', values = text_vars).transpose()

In [38]:
#pivot_transpose_stack = pd.DataFrame(pivot_transpose.stack())

In [39]:
# Inspection version of the text-variable reshaping:
#   pivot      -> columns become (text variable, institutionsnummer) pairs
#   transpose  -> those pairs become the row index
#   stack      -> the original row labels join the index as innermost level
#   unstack(0) -> the text variables move back out to the columns
#   reset_index-> the remaining index levels become ordinary columns
temp = test_df_avg.pivot(columns = 'institutionsnummer', values = text_vars)\
                .transpose()\
                .stack()\
                .unstack(level = 0)\
                .reset_index(col_level = -1)
"""pivot_transpose_stack\.unstack(level=0)\.reset_index(col_level = -1)"""

# NOTE(review): the names are assigned by position and assume the reshaped columns
# come out in the same order as text_vars - verify on the pandas version in use
temp.columns = ['institutionsnummer', 'level_1', 'institutionstype2', 'institutionstype3', 'navn']
# keep the first row per institution and drop the leftover row-label column
text_df = temp.drop_duplicates('institutionsnummer')\
            .drop('level_1', axis = 1)

In [40]:
# why do i have so many institution numbers?
"""
text_nums = list(text_df.institutionsnummer)
num_nums = list(num_vars_df.institutionsnummer)

len(text_nums), len(num_nums)

check_list = [num in num_nums for num in text_nums]
check_list

text_df['in_nums'] = check_list
"""

"\ntext_nums = list(text_df.institutionsnummer)\nnum_nums = list(num_vars_df.institutionsnummer)\n\nlen(text_nums), len(num_nums)\n\ncheck_list = [num in num_nums for num in text_nums]\ncheck_list\n\ntext_df['in_nums'] = check_list\n"

In [41]:
# not_in_num_df = text_df[text_df['in_nums'] == False].reset_index(drop = True)

In [42]:
"""
test_num = not_in_num_df.reset_index()\
    .loc[800, 'institutionsnummer']

not_in_num_df # har ikke data på karaktersnit eller lukket 
"""

"\ntest_num = not_in_num_df.reset_index()    .loc[800, 'institutionsnummer']\n\nnot_in_num_df # har ikke data på karaktersnit eller lukket \n"

In [43]:
# Attach the text variables to the numeric ones, one row per institution number
tidy = num_vars_df.merge(text_df, on = 'institutionsnummer', how = 'left')

# (lat, lon) pair for every row
tidy['coordinates'] = tuple(zip(tidy['lat'], tidy['lon']))

In [97]:
# Run the full load + clean pipeline for a single year as a spot check
test_dirty = get_all_data(year = '2018-2019')
test_clean = clean_school_data(dirty_df = test_dirty)

In [98]:
# Display the postal codes as integers
test_clean['postnr'].astype(int)

0       1552
1       1350
2       1307
3       2200
4       2200
        ... 
1285    9600
1286    9600
1287    9600
1288    9600
1289    9600
Name: postnr, Length: 1241, dtype: int32