In [None]:
import nltk
import glob
import pandas as pd
import unicodedata

def norm(string):
    """Strip accents and other non-ASCII characters from ``string``.

    The text is decomposed with Unicode NFKD so accented letters split into a
    base letter plus combining marks; every code point outside ASCII is then
    discarded.

    Parameters
    ----------
    string : object
        Value to normalise. Anything that is not a ``str`` (for example the
        ``0`` placeholder used for missing values, ``None`` or ``NaN``) is
        returned unchanged.

    Returns
    -------
    object
        The ASCII-only string, or the original value when it is not a ``str``.
    """
    # Explicit type guard instead of a bare ``except``: non-text values pass
    # through, while unrelated errors (e.g. KeyboardInterrupt) are not hidden.
    if not isinstance(string, str):
        return string
    return unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('ascii')

def format_text(df, cols_names=None, stopwords=None):
    """Title-case text columns while keeping stopwords in their given case.

    Each column in ``cols_names`` is title-cased and stripped; then every
    stopword that appears as a space-delimited word in its title-cased form is
    replaced by the stopword exactly as supplied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to format. It is modified in place.
    cols_names : iterable of str, optional
        Columns to format. ``None`` means no column is touched.
    stopwords : iterable of str, optional
        Words to restore to their supplied spelling. ``None`` means none.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # ``or ()`` makes the documented ``None`` defaults usable instead of
    # failing with "NoneType is not iterable".
    for ele in cols_names or ():
        formatted = df[ele].str.title().str.strip()
        for ene in stopwords or ():
            # regex=False: stopwords are literal text, so characters such as
            # '.' must not be interpreted as regular-expression syntax.
            formatted = formatted.str.replace(' ' + ene.title() + ' ',
                                              ' ' + ene + ' ',
                                              regex=False)
        df[ele] = formatted

    return df

# Sorted so the files are always read in the same order: glob returns matches
# in filesystem order, and the keep-first de-duplication applied while merging
# the workbooks depends on which file is read first.
data = sorted(glob.glob('*.xlsx'))

In [None]:
# Column headers differ between workbook layouts; map both spellings onto the
# same internal names.
column_aliases = {'CLAVE CAMPO UNITARIO': 'speciality_id',
                  'CAMPO UNITARIO DE FORMACIÓN': 'speciality_name_es',
                  'NOMBRE PROGRAMA EDUCATIVO': 'name_es',
                  'CVE CAMPO UNITARIO': 'speciality_id',
                  'CAMPO UNITARIO': 'speciality_name_es',
                  'NOMBRE CARRERA SEP': 'name_es'}

# Collect one cleaned frame per workbook and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame on
# every iteration).
frames = []
for ele in data:
    print('Current file:', ele)
    temp = pd.read_excel(ele, header=1).rename(columns=column_aliases)
    temp = temp[['name_es', 'speciality_id', 'speciality_name_es']].copy()
    temp['version'] = ele
    # Upper-cased raw name, used later as the de-duplication key.
    temp['backup'] = temp['name_es'].str.upper()
    # Blank speciality ids inherit the last id seen above them.
    temp['speciality_id'] = temp['speciality_id'].ffill()
    # Remaining missing values become the 0 sentinel.
    temp = temp.drop_duplicates(subset=['name_es', 'speciality_id']).fillna(0)
    frames.append(temp)

if not frames:
    raise FileNotFoundError("No '*.xlsx' workbooks were found to load")

# A single keep-first de-duplication over the concatenation keeps the same
# rows as de-duplicating after every append.
# NOTE(review): for identical (name_es, speciality_id) pairs the row from the
# first file read wins, so its 'version' is not necessarily the latest one.
df = (pd.concat(frames, sort=False)
        .drop_duplicates(subset=['name_es', 'speciality_id'])
        .copy())

# File name -> year of the data set. File names missing from this mapping are
# left untouched in the 'version' column.
version_replace = {
    'Anuario_Educacion_Superior_2019-2020_temp.xlsx': 2020,
    'anuies_licenciatura_2016-2017.xlsx': 2017,
    'anuies_licenciatura_2017-2018.xlsx': 2018,
    'anuies_licenciatura_2018-2019.xlsx': 2019,
    'anuies_posgrado_2016-2017.xlsx': 2017,
    'anuies_posgrado_2017-2018.xlsx': 2018,
    'anuies_posgrado_2018-2019.xlsx': 2019
}

# Assign the result back: calling replace(..., inplace=True) on a column
# selection does not update the frame under pandas Copy-on-Write.
df['version'] = df['version'].replace(version_replace)

df['speciality_id'] = df['speciality_id'].astype(int)

# For every upper-cased name keep the row with the highest version.
df = (df.sort_values(by=['backup', 'version'], ascending=False)
        .drop_duplicates(subset=['backup'], keep='first')
        .copy())

df['name_es'] = df['name_es'].str.replace('  ', ' ').str.strip()

df['speciality_name_es'] = df['speciality_name_es'].replace({
    '(blank)': 'No aplica Campo Unitario'
})

# Drop rows whose name is the spreadsheet error marker.
df = df.loc[~df['name_es'].isin(['#NAME?'])].copy()

In [None]:
# Keep an independent copy of the merged frame in `t`; un-commenting the line
# below restores `df` from that copy.
t = df.copy()
# df = t.copy()

# Raw -> Processed

In [None]:
# Missing names become the 0 sentinel. The result is assigned back because
# fillna(..., inplace=True) on a column selection does not update the frame
# under pandas Copy-on-Write.
df['name_es'] = df['name_es'].fillna(0)
df['backup'] = df['backup'].fillna(0)

# Drop rows whose raw name contains 'TOTAL'.
df = df.loc[~df['backup'].astype(str).str.contains('TOTAL')].copy()

# Remove accents / non-ASCII characters and surrounding whitespace. The .str
# accessor yields NaN for non-string values such as the 0 sentinel.
for col in ['name_es', 'speciality_name_es']:
    df[col] = df[col].apply(norm).str.strip()

# 'name_es' was already stripped above; only 'backup' still needs it.
df['backup'] = df['backup'].str.strip()

# Restore the 0 sentinel for values the string operations turned into NaN.
df['name_es'] = df['name_es'].fillna(0)
df['backup'] = df['backup'].fillna(0)

df = df.loc[df['backup'] != 0].copy()

# Shapes of: all rows, rows unique by raw name, rows unique by processed name.
print(df.shape,
      df.drop_duplicates(subset=['backup']).shape,
      df.drop_duplicates(subset=['name_es']).shape)

# Lookup table from the raw upper-cased name to the processed name.
transform = df[['backup', 'name_es']].copy()

transform.to_csv('raw_to_careers.csv', index=False)

# From here on there is one row per processed name, and 'backup' mirrors it.
df = df.drop_duplicates(subset=['name_es']).copy()
df['backup'] = df['name_es']

# Processed -> id

In [None]:
# career id
# Number the rows of each speciality 1..n in row order. cumcount() produces
# the same numbering as repeatedly bumping duplicated (speciality_id, count)
# pairs, but in a single pass.
df['count'] = df.groupby('speciality_id').cumcount() + 1
# Largest group size minus one, i.e. how many times the highest counter was
# incremented past its starting value of 1.
count = int(df['count'].max()) - 1 if len(df) else 0
print('Campus Steps Count:', count)

# Zero-pad the counter to the width of the largest one so that concatenating
# it to the speciality id gives a fixed-width suffix.
max_group = str(df['count'].max())
df['count'] = df['count'].astype(str).str.zfill(len(max_group))

df['career_id'] = (df['speciality_id'].astype(str) + df['count']).astype(int)

fields = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vTRqe4aa9Maq0WOZTq6DzpflyyGUhTHMoy5l_nfrrmL0fG0f5ccnRoEDg8klrl1JbynwPuwIuTDhy-z/pub?output=csv')

# The leading digits of the speciality id give the area / field / subfield
# codes. Ids that are exactly five characters long use prefixes of length
# 1 / 2 / 3; every other id uses prefixes one character longer.
speciality_code = df['speciality_id'].astype(str)
is_five_chars = speciality_code.str.len() == 5

for key, val in {'area': 1,
                 'field': 2,
                 'subfield': 3}.items():
    prefix = speciality_code.str[:val].where(
        is_five_chars, speciality_code.str[:val + 1]).astype(int)
    df['{}_id'.format(key)] = prefix

    # Replace the numeric code by its name; codes missing from `fields` stay
    # numeric and are relabelled further down.
    for extension in ['es', 'en']:
        names_by_id = dict(zip(fields['{}_id'.format(key)],
                               fields['{}_{}'.format(key, extension)]))
        df['{}_{}'.format(key, extension)] = prefix.replace(names_by_id)

df['career_name'] = df['name_es'].str.capitalize()

df = df.drop(columns=['count', 'name_es'])

df['speciality_name_es'] = (df['speciality_name_es']
                            .fillna('No aplica Campo Unitario')
                            .str.capitalize())

# Any value that is still purely numeric in a name column was never matched
# to a label; give it the language-specific placeholder.
key_replace = {
    'No especificado': [x for x in df.columns if '_es' in x],
    'Not specified': [x for x in df.columns if '_en' in x]
}

for k, v in key_replace.items():
    for col in v:
        df.loc[df[col].astype(str).str.isnumeric(), col] = k

df.to_csv('dim_careers.csv', index=False)