# About

# Libraries

In [1]:
# Execute the shared project notebook in this kernel. The cells below use names
# that are never imported here (DataFrame, qdata, aux_qdata, iglob,
# data_restructuring), so they are presumably defined by it -- TODO confirm.
%run "../main_global.ipynb"

Connection with MySQL database is ready!


In [2]:
from numpy import nan
from fancyimpute import IterativeImputer

# UDF

In [3]:
def raw_table_list():
    """
    List the raw measurement tables to process.

    * Ask the database for every table whose name matches 'sima%'.
    * Discard the tables whose name starts with 'sima_station'.
    """

    rows = aux_qdata("show tables like 'sima%'")

    # The table name is the first field of each returned row
    names = [row[0] for row in rows]

    return [name for name in names if not name.startswith('sima_station')]

In [4]:
def reindex_aq_table(data):
    """
    Reformat a raw table for numerical processing.

    * Use the "datetime" column as the index.
    * Cast every remaining column to a float32 type.

    Parameters
    ----------
    data : DataFrame holding a "datetime" column; every other column must be
        castable to float32.

    Returns
    -------
    A new DataFrame indexed by the "datetime" values. `data` is not modified.

    Raises
    ------
    KeyError if `data` has no "datetime" column.
    """

    dt_col = data["datetime"]

    # Drop the datetime column by name rather than assuming it is the first
    # column; otherwise the wrong column is dropped whenever the column order
    # differs and the datetime values end up in the float cast.
    tmp_df = data.drop(columns = "datetime").astype('float32')

    return tmp_df.set_index(dt_col)

In [5]:
def empty_df_measurement_station():
    """
    Build an empty DataFrame with one column per measurement station.

    It is used as the starting point of the results table, so that the
    station columns always appear in this fixed order.
    """
    cols = ["SE", "NE", "CE", "NO", "SO", "NO2", "NTE", "NE2", "SE2", "SO2", "SE3", "SUR", "NTE2", "NE3"]

    # No side effects here: generating the external SQL/shell objects is a
    # separate, explicit step and must not depend on notebook-level globals.
    return DataFrame(columns = cols)

In [6]:
def count_nulls(data, table_name):
    """
    Count the missing values of every column in a table.

    * Zeros are counted as missing values, together with the nulls.
    * The result is a single-row DataFrame labelled with `table_name`.
    """

    missing_per_column = data.replace(0, nan).isnull().sum()

    return DataFrame(
        [missing_per_column.tolist()],
        columns = list(missing_per_column.index),
        index = [table_name],
    )

In [7]:
def create_external_objects(mvi_method_name, directory):
    !touch {mvi_method_name + "_sql_tables_structure.sql"}; touch {mvi_method_name + "_upload_csv.sh"};
    main_sql_stuff = data_restructuring(directory)
    
    files_list = list(iglob('./{}/*.csv'.format(directory)))
    files_list_trim = [x[len(directory)+3:-4] for x in files_list]
    
    # SQL structures
    main_sql_stuff.sql_tables_structures(files_list_trim, directory, mvi_method_name)
    
    # Shell commands
    main_sql_stuff.upload_csv_script(files_list_trim, directory, mvi_method_name)
    
    # Views
    main_sql_stuff.creating_views(files_list_trim, mvi_method_name)

In [8]:
def mvi_process_tables(path, mvi_method, mvi_method_name, write_out = True, raw_list = None):
    """
    Count the missing values of every raw table and, optionally, write out
    an imputed copy of each one.

    Parameters
    ----------
    path : directory where the imputed CSV files are written.
    mvi_method : callable taking the reindexed table and returning the
        imputed DataFrame.
    mvi_method_name : prefix used in the name of every CSV file.
    write_out : when False, the imputation is skipped and nothing is written;
        only the missing-value counts are computed.
    raw_list : names of the tables to process. When None (default), the list
        is fetched with `raw_table_list()` at call time.

    Returns
    -------
    DataFrame with one row per table and the missing-value count per column.
    """
    # Local import: only `DataFrame` is known to be in the notebook namespace
    import pandas as pd

    # Resolve the default at call time. A `raw_table_list()` default argument
    # would query the database once, when the function is defined, and keep
    # reusing that (possibly stale) list.
    if raw_list is None:
        raw_list = raw_table_list()

    # The empty station frame goes first so its column order is preserved
    counts = [empty_df_measurement_station()]

    for table_name in raw_list:

        # Read raw dataset
        sqlq = "SELECT * FROM {}".format(table_name)

        # Reformat table
        raw_df = reindex_aq_table(qdata(sqlq))

        # Count NaN values
        counts.append(count_nulls(raw_df, table_name))

        # Write out table
        if write_out:
            mvi_df = mvi_method(raw_df)

            mvi_df = mvi_df.reset_index(drop = False)
            full_path = path + "/" + mvi_method_name + "_" + table_name + ".csv"

            mvi_df.to_csv(full_path, encoding='utf-8', index=False)

    # One concat at the end instead of DataFrame.append inside the loop:
    # append copied the whole frame on every iteration and no longer exists
    # in pandas 2.0.
    return pd.concat(counts)

In [9]:
def mvi_mean(data):
    """
    Missing value imputation by mean substitution.

    * Zeros are treated as missing values.
    * Every missing value is replaced with the mean of its own column.
    """

    with_gaps = data.replace(0, nan)

    # One mean per column; fillna matches them to the columns by label
    column_means = with_gaps.mean(axis = 0)

    return with_gaps.fillna(column_means)

In [10]:
def mvi_spatialAvg(data):
    """
    Missing value imputation by spatial average.

    * Zeros are treated as missing values.
    * Every missing value is replaced with the mean of the values available
      in the same row, i.e. the other columns for that index label.
    * A row without any available value is left as missing.

    Parameters
    ----------
    data : numeric DataFrame. It is not modified.

    Returns
    -------
    A new DataFrame with the same index and columns as `data`.
    """
    tmp_data = data.replace(0, nan)

    # One mean per row, computed over the non-missing values only
    mvi_substitution = tmp_data.mean(axis = 1)

    # Vectorised replacement of the row-by-row `.loc` loop: it avoids one
    # Python-level assignment per row and also works when index labels repeat.
    return tmp_data.where(tmp_data.notna(), mvi_substitution, axis = 0)

In [11]:
def mvi_MICE(data, max_iter = 100):
    """
    UDF to process a monitoring station's data through a
    missing value imputation algorithm, specifically a
    Multivariate Imputation by Chained Equations (MICE).
    MICE is implemented using the fancyimpute library.

    Parameters
    ----------
    data : DataFrame to impute. Zeros are treated as missing values.
        It is not modified.
    max_iter : value passed to the imputer as `max_iter` (default 100).

    Returns
    -------
    DataFrame with the imputed values, labelled with the columns and the
    index of `data`.
    """

    df_cols = data.columns
    df_indx = data.index

    # Impute the table that was passed in. Re-reading it from the database
    # here relied on an `sqlq` variable that does not exist in this scope.
    data_nan = data.replace(0, nan)

    # Initializing the MICE class
    mice_imputer = IterativeImputer(max_iter = max_iter)

    # Imputing the missing values with the MICE imputer; the result is
    # wrapped in a new DataFrame, so the original labels are restored after.
    # NOTE(review): this assumes the imputer returns as many columns as it
    # received -- confirm its behaviour for columns without any observed value.
    data_mice = DataFrame(mice_imputer.fit_transform(data_nan))
    data_mice.columns = df_cols
    data_mice.index = df_indx

    return data_mice

# Main

## > MVI - Mean Substitution

In [12]:
mvi_method_name = "MVI_mean"
directory = mvi_method_name + "_SIMA_Data"

!mkdir {directory}

mkdir: cannot create directory ‘MVI_mean_SIMA_Data’: File exists


In [13]:
# The last argument is write_out = False: only the missing-value counts per
# table are computed; no imputation is run and no CSV file is written.
results = mvi_process_tables(directory, mvi_mean, mvi_method_name, False)
results

Unnamed: 0,SE,NE,CE,NO,SO,NO2,NTE,NE2,SE2,SO2,SE3,SUR,NTE2,NE3
sima_co,8654,21290,11573,33745,26867,34368,26729,23645,38254,13742,28652,49130,33909,50516
sima_no,55552,35881,13875,35461,26568,38353,44991,48662,29206,11414,25743,50726,42282,50506
sima_no2,54533,44883,12360,37962,30648,40488,44647,48529,30880,11818,25584,50462,43094,50372
sima_nox,54576,44902,12045,37625,29200,38212,44683,48092,28691,10508,25565,48055,42732,50410
sima_o3,3203,19410,15358,23516,15616,28023,44175,52113,36495,7612,29216,51681,43896,50350
sima_pm10,2961,2114,2670,3499,2697,8106,6549,4051,3690,11748,24930,26032,26650,53228
sima_pm25,16200,27663,23830,25019,17977,38194,38803,18838,38919,28115,30808,44169,32794,63935
sima_prs,20835,3803,2971,2662,2167,7610,10833,4060,5990,1425,24017,24670,24912,50688
sima_rainf,61570,62121,63600,63826,63893,63904,63925,61617,63837,62575,62683,63935,63869,63574
sima_rh,3506,4435,1637,1283,1513,8689,12110,9607,15233,1410,25649,24715,24916,50379


### >> Creating External Objects

In [14]:
# Generate the SQL table structures, the CSV upload script and the views
# for the CSV files currently stored in `directory`.
create_external_objects(mvi_method_name, directory)

## > MVI - Spatial Average

In [15]:
mvi_method_name = "MVI_spatialAvg"
directory = mvi_method_name + "_SIMA_Data"

!mkdir {directory}

mkdir: cannot create directory ‘MVI_spatialAvg_SIMA_Data’: File exists


In [16]:
# The last argument is write_out = False: only the missing-value counts per
# table are computed; no imputation is run and no CSV file is written.
results = mvi_process_tables(directory, mvi_spatialAvg, mvi_method_name, False)

### >> Creating External Objects

In [17]:
# Generate the SQL table structures, the CSV upload script and the views
# for the CSV files currently stored in `directory`.
create_external_objects(mvi_method_name, directory)

## > MVI - Multiple imputation by chained equations (MICE)

In [18]:
mvi_method_name = "MVI_MICE"
directory = mvi_method_name + "_SIMA_Data"

!mkdir {directory}

mkdir: cannot create directory ‘MVI_MICE_SIMA_Data’: File exists


In [19]:
# The last argument is write_out = False: only the missing-value counts per
# table are computed; no imputation is run and no CSV file is written.
results = mvi_process_tables(directory, mvi_MICE, mvi_method_name, False)

### >> Creating External Objects

In [20]:
# Generate the SQL table structures, the CSV upload script and the views
# for the CSV files currently stored in `directory`.
create_external_objects(mvi_method_name, directory)