# About
Helper functions required for the Missing Value Imputation (MVI) processes.

# Libraries

In [1]:
from glob import iglob
from pathlib import Path

from fancyimpute import IterativeImputer
from numpy import nan
from pandas import DataFrame

# User-Defined Functions (UDF)

In [2]:
def raw_table_list():
    """
    Return the names of the tables matching 'sima%', excluding every
    name that starts with 'sima_station'.

    The query is executed through `aux_qdata`; the table name is read
    from the first field of each returned row.
    """
    query_rows = aux_qdata("show tables like 'sima%'")

    # Single pass: pick the name out of each row and filter in one go
    return [
        row[0]
        for row in query_rows
        if not row[0].startswith('sima_station')
    ]

In [3]:
def reindex_aq_table(data):
    """
    * Cast data table to a float32 type
    * Reindex with datetime.

    The "datetime" column is removed by name before the cast, so it may
    sit at any position in `data` (it used to be assumed to be the first
    column). Every remaining column must be castable to float32.

    Parameters
    ----------
    data : DataFrame
        Table holding a "datetime" column plus the value columns.

    Returns
    -------
    DataFrame
        The value columns as float32, indexed by the "datetime" values.
        `data` itself is left untouched.
    """

    dt_col = data["datetime"]

    # Drop by label rather than by position: slicing with iloc[:, 1:] only
    # worked when "datetime" happened to be column 0.
    tmp_df = data.drop(columns = "datetime").astype('float32')

    return tmp_df.set_index(dt_col)

In [1]:
def create_external_objects(mvi_method_name, directory):
    """
    Create the external SQL / shell files for one MVI method.

    Three files named after `mvi_method_name` are created in the current
    working directory (an existing file only gets its modification time
    refreshed). The object returned by `data_restructuring(directory)` is
    then called once for the SQL structures, once for the shell commands
    and once for the views, using the CSV files found in './<directory>/'.

    Parameters
    ----------
    mvi_method_name : str
        Prefix of the three file names; also forwarded to the helper calls.
    directory : str
        Folder, relative to the working directory and without a trailing
        slash, that contains the CSV files.
    """
    # Pure-Python equivalent of the former `!touch ...` line: it also runs
    # outside IPython and the name never goes through a shell, so spaces or
    # shell metacharacters in `mvi_method_name` cannot split or alter it.
    for suffix in ("_sql_tables_structure.sql", "_upload_csv.sh", "_views_creation.sql"):
        Path(mvi_method_name + suffix).touch()

    main_sql_stuff = data_restructuring(directory)

    files_list = list(iglob('./{}/*.csv'.format(directory)))
    # Strip the leading './<directory>/' (len(directory) + 3 characters)
    # and the trailing '.csv' (4 characters) from every path.
    files_list_trim = [x[len(directory)+3:-4] for x in files_list]

    # SQL structures
    main_sql_stuff.sql_tables_structures(files_list_trim, directory, mvi_method_name)

    # Shell commands
    main_sql_stuff.upload_csv_script(files_list_trim, directory, mvi_method_name)

    # Views
    # NOTE(review): mvi_method_name is passed twice here while the two calls
    # above pass `directory` as well -- confirm against creating_views().
    main_sql_stuff.creating_views(files_list_trim, mvi_method_name, mvi_method_name)

In [5]:
def empty_df_measurement_station():
    """
    Build an empty DataFrame (no rows) whose columns are the fixed list of
    labels below -- presumably the measurement-station codes.
    """
    return DataFrame(
        columns = [
            "SE", "NE", "CE", "NO", "SO", "NO2", "NTE",
            "NE2", "SE2", "SO2", "SE3", "SUR", "NTE2", "NE3",
        ]
    )

In [6]:
def count_nulls(data, table_name):
    """
    Count the missing entries of every column of `data`, treating zeros
    as missing as well.

    Returns a one-row DataFrame: the row label is `table_name`, the columns
    are those of `data` and each cell holds that column's missing count.
    """
    # Zeros are converted to NaN first so they are counted as nulls too
    missing_per_column = data.replace(0, nan).isnull().sum().to_dict()

    return DataFrame(
        [list(missing_per_column.values())],
        columns = list(missing_per_column),
        index = [table_name],
    )

In [7]:
def mvi_mean(data):
    """
    Mean imputation: zeros are treated as missing, and every missing entry
    is replaced with the mean of the remaining values of its own column.

    Returns a new DataFrame; `data` is not modified.
    """
    observed = data.replace(0, nan)

    # One mean per column, computed with the missing entries skipped
    column_means = observed.mean(axis = 0)

    # fillna with a Series fills each column with the value stored under
    # that column's label
    return observed.fillna(column_means)

In [8]:
def mvi_spatialAvg(data):
    """
    Row-wise mean imputation: zeros are treated as missing, and every
    missing entry is replaced with the mean of the remaining values found
    in the same row.

    The fill is done by position, one column at a time, so it also works
    when index labels are repeated (the former label-based row loop left
    those rows unfilled) and avoids one `.loc` assignment per row.

    Parameters
    ----------
    data : DataFrame
        Numeric table; each row is imputed from its own values.

    Returns
    -------
    DataFrame
        New table with the same index and columns. Rows without any
        observed value keep their NaN entries. `data` is not modified.
    """
    tmp_data = data.replace(0, nan)

    mvi_substitution = tmp_data.mean(axis = 1)
    # Plain array: aligned with the rows by position, not by label
    row_means = mvi_substitution.values

    def fill_with_row_mean(column):
        # Keep the observed values, take the row mean everywhere else
        return column.where(column.notnull().values, row_means)

    return tmp_data.apply(fill_with_row_mean)

In [9]:
def mvi_MICE(data, max_iter_vals = 100):
    """
    UDF specific to process a monitoring station's data through a
    missing value imputation algorithm, specifically Multivariate
    Imputation by Chained Equations (MICE).
    MICE is implemented using the IterativeImputer class imported from
    the fancyimpute library.

    Zeros are treated as missing values. Any imputed value that comes out
    negative is replaced with the smallest strictly positive value observed
    in that column of the input (0 when the column has none).

    Parameters
    ----------
    data : DataFrame
        Numeric table to impute.
    max_iter_vals : int, optional
        Value passed to IterativeImputer as `max_iter` (default 100).

    Returns
    -------
    DataFrame
        Imputed table with the same columns and index as `data`.
    """
    df_cols = data.columns
    df_indx = data.index

    # Creating dictionary with real min values greater than zero
    global_min = dict()

    for c in df_cols:
        try:
            mval = min(data[c][data[c] > 0])
        except (TypeError, ValueError):
            # ValueError: the column has no strictly positive value (min of
            # an empty selection); TypeError: values not comparable with 0.
            # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
            mval = 0
        global_min[c] = mval

    # Reformat table
    data = data.replace(0, nan)

    # Initializing the MICE class
    mice_imputer = IterativeImputer(max_iter = max_iter_vals)

    # imputing the missing value with mice imputer
    # NOTE(review): scikit-learn's IterativeImputer drops columns that are
    # entirely missing at fit time unless keep_empty_features=True; if that
    # applies to the class used here, the constructor below raises on the
    # column-count mismatch -- confirm.
    data_mice = DataFrame(
        mice_imputer.fit_transform(data),
        columns = df_cols,
        index = df_indx,
    )

    # Replace negative values with minimum values per column
    for c in global_min.keys():
        data_mice.loc[data_mice[c] < 0, c] = global_min[c]

    return data_mice

In [10]:
# Print a confirmation message that this cell, placed after the function definitions, has run.
print("Functions for MVI Processes are ready!")

Functions for MVI Processes are ready!
