In [37]:
import pandas as pd
import numpy as np

In [38]:
# Directory that holds the per-branch CSV files read by load_branch.
DATA_DIR = 'docs'
# Directory where the generated ASP fact files (*.lp) are written.
OUT_DIR = '../abau_proj/asp_data'

#### Loading (and standardizing) data

In [39]:
# Maps the accented vowels handled by this notebook to their plain counterparts.
_ACCENT_TABLE = str.maketrans('áéíóú', 'aeiou')


def _standardize_label(text):
    """Lowercase, trim, replace spaces by '_' and strip accents from one label."""
    return text.lower().strip().replace(' ', '_').translate(_ACCENT_TABLE)


def load_branch(branch_name):
    """Load ``{DATA_DIR}/{branch_name}.csv`` and return it in standardized form.

    Steps applied:
      * drop the empty columns that pandas names ``Unnamed: N``;
      * strip ``*`` markers and turn decimal commas into dots in every text cell;
      * standardize column names (lowercase, trimmed, spaces -> ``_``, no accents);
      * make sure the degree-name column is called ``grao`` and standardize its
        values the same way as the column names;
      * cast every other column to ``float32`` rounded to one decimal.

    Parameters
    ----------
    branch_name : str
        File name (without ``.csv``) of the branch table inside ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        One row per degree: a ``grao`` text column plus one float32 column per subject.

    Raises
    ------
    KeyError
        If the table has neither a ``grao`` nor a ``dobres_graos_e_abertos`` column.
    """
    df = pd.read_csv(f'{DATA_DIR}/{branch_name}.csv')

    # Remove 'unnamed' empty columns
    empty_cols = [colname for colname in df.columns if 'Unnamed' in colname]
    df = df.drop(columns=empty_cols)

    # The symbol '*' means that the ponderation will be removed after some years;
    # we drop the marker and keep the original value. Decimal commas become dots.
    # Raw strings avoid the invalid '\*' / '\,' escape sequences.
    df = df.replace({r'\*': '', ',': '.'}, regex=True)

    # Standardize the column names (lowercase, remove extra spaces, spaces -> '_', remove accents)
    df.columns = [_standardize_label(c) for c in df.columns]

    # Some tables name the degree column differently; normalize it to 'grao'.
    if 'grao' not in df.columns:
        if 'dobres_graos_e_abertos' not in df.columns:
            raise KeyError(f"no 'grao' column found in branch '{branch_name}'")
        df = df.rename(columns={'dobres_graos_e_abertos': 'grao'})

    # Standardize the grao values like the column names. The .str accessor is
    # required at every step: plain Series.replace only matches whole cell values,
    # so it would leave spaces and accents inside the names untouched.
    df['grao'] = (
        df['grao']
        .str.lower()
        .str.strip()
        .str.replace(' ', '_', regex=False)
        .str.translate(_ACCENT_TABLE)
    )

    # Ensure the weight columns are float and rounded to 1 decimal
    for c in df.columns:
        if c == 'grao':
            continue
        df[c] = df[c].astype('float32').round(1)
    return df


In [40]:
# File names (without extension) of the branch tables, in loading order.
branch_files = [
    'artes-e-humanidades',
    'ciencias',
    'ciencias-da-saúde',
    'ciencias-xurídicas-e-sociais',
    'enxeñaría-e-arquitectura',
    'simultaneidades-dobres-graos-e-graos-abertos',
]
(
    artes_df,
    ciencias_df,
    ciencias_da_saude_df,
    ciencias_sociais_df,
    enxeñaria_df,
    dobres_graos_df,
) = [load_branch(file_name) for file_name in branch_files]

# Manual check: list every distinct column name across the branches so that
# near-duplicate spellings of the same subject can be spotted by eye.
all_columns = set()
for frame in (ciencias_df, artes_df, ciencias_da_saude_df, ciencias_sociais_df, enxeñaria_df, dobres_graos_df):
    all_columns.update(frame.columns)
sorted(all_columns)

['2ª_lingua_estranxeira_ii',
 'analise_musical_ii',
 'artes_escenicas_ii',
 'bioloxia',
 'ciencias_xerais',
 'coro_e_tecnica_vocal_ii',
 'debuxo_artistico_ii',
 'debuxo_tecnico_aplicado_as_artes_plasticas_e_ao_deseño',
 'debuxo_tecnico_aplicado_as_artes_plasticas_e_ao_deseño_ii',
 'debuxo_tecnico_ii',
 'deseño',
 'empresa_e_deseño_de_modelos_de_negocio',
 'fisica',
 'fundamentos_artisticos',
 'grao',
 'grego_ii',
 'historia_da_arte',
 'historia_da_musica_e_da_danza',
 'latin_ii',
 'literatura_dramatica',
 'matematicas_aplicadas_as_ciencias_sociais_ii',
 'matematicas_ii',
 'movementos_culturais_e_artisticos',
 'quimica',
 'tecnicas_de_expresion_grafico-plastica',
 'tecnoloxia_e_enxeñaria_ii',
 'xeografia',
 'xeoloxia_e_ciencias_ambientais']

In [41]:
## Merge all the dataframes
# Branch tables do not share the same subject columns; cells that end up
# missing (NaN) after stacking are stored as a weight of 0.
_df = pd.concat([
    artes_df,
    ciencias_df,
    ciencias_da_saude_df,
    ciencias_sociais_df,
    enxeñaria_df,
    dobres_graos_df,
]).fillna(0)

# Handle accents in grao column (one translation pass instead of five replaces)
_df['grao'] = _df['grao'].str.translate(str.maketrans('áéíóú', 'aeiou'))

# Write next to the input tables; follows DATA_DIR instead of a hard-coded 'docs'.
_df.to_excel(f'{DATA_DIR}/ponderacións_standarizados.xlsx', index=False)

#### Encoding data as ASP facts

In [42]:
## All degrees
# One `degree("<name>").` fact per distinct value of the grao column.
# Explicit UTF-8: names may contain non-ASCII characters (e.g. 'ñ') and the
# default encoding of open() depends on the platform locale.
with open(f'{OUT_DIR}/degrees.lp', 'w', encoding='utf-8') as f:
    for degree in _df['grao'].unique():
        f.write(f'degree("{degree}").\n')

In [43]:
## All subjects
# One `subject("<name>").` fact per subject column. 'grao' holds the degree
# names, not a subject, so it is excluded (the weights cell skips it too).
# Explicit UTF-8: column names contain non-ASCII characters (e.g. 'deseño').
subject_names = [col for col in _df.columns if col != 'grao']
with open(f'{OUT_DIR}/subjects.lp', 'w', encoding='utf-8') as f:
    for subject in subject_names:
        f.write(f'subject("{subject}").\n')

In [44]:
# All weights
# One `weight("<degree>", "<subject>", <w>).` fact per row/subject pair, where
# <w> is the weight multiplied by 10 so it can be written as an integer.
weight_columns = [col for col in _df.columns if col != 'grao']
# Explicit UTF-8: degree and subject names may contain non-ASCII characters.
with open(f'{OUT_DIR}/weights.lp', 'w', encoding='utf-8') as f:
    for _, row in _df.iterrows():
        for subject in weight_columns:
            # Round before converting: int() alone truncates, so a float that
            # lands slightly below the intended integer would lose a point.
            scaled_weight = int(round(row[subject] * 10))
            f.write(f'weight("{row["grao"]}", "{subject}", {scaled_weight}).\n')