# About
This notebook imputes missing values using the MICE algorithm (Multiple Imputation by Chained Equations) from the fancyimpute library.

It creates: 
- A CSV file with the subset of data from a monitoring station
- SQL commands with the table structure for the MySQL environment
- A terminal command to upload the CSV file into the previous table structure.

# Libraries

In [1]:
%run "/home/cesar/Python_NBs/HDL_Project/Mini HDL/Baseline_ML_Pollution_Concentration_MMA/global_fv.ipynb"

User information is ready!


In [2]:
# Multiple imputation by chained equations (MICE)
# IterativeImputer is the class that mice_imputer() instantiates
from fancyimpute import IterativeImputer
from fancyimpute import cn

#from sklearn.impute import IterativeImputer
#from sklearn.experimental import enable_iterative_imputer as IterativeImputer

# Data

In [3]:
# -------------------------------------------------
# Query parameters
# -------------------------------------------------

# Source table in the database (alternative: "sima_pm25")
sql_table = 'sima_station_CE'

# Row filter: keep only the records after this date
s_where = "where datetime > '2020-04-20'"

# Full query text: "Select * from <table> <filter>"
sqlq = " ".join(["Select * from", sql_table, s_where])

In [4]:
def mice_imputer(sqlq, max_iter_vals = 100, table = None):
    """
    Impute the missing values of a monitoring station's data with
    Multivariate Imputation by Chained Equations (MICE), using
    IterativeImputer.

    The 'datetime' and 'rainf' columns are set aside before imputing and
    are put back, untouched, in their original positions afterwards.
    In every other column a value of 0 is treated as missing.

    Input:
    * sqlq: SQL query for data querying
      (e.g. Select * from sima_station_CE where datetime > '2020-04-20')
    * max_iter_vals: Value passed to the imputer as max_iter. Default = 100
    * table: First argument passed to qdata2. Default = None, which falls
      back to the notebook-level variable sql_table.

    Output:
    * data_mice: DataFrame with the same columns, in the same order, as the
      queried data, where the imputed columns hold the imputer's output
    * missing_df: DataFrame with the percentage of values per imputed column
      that were treated as missing (NaN or 0) before imputation

    Raises:
    * KeyError: if the queried data has no 'datetime' or 'rainf' column
    * ValueError: if the imputer returns a different number of columns
      than it was given
    """

    if table is None:
        table = sql_table

    raw = qdata2(table, sqlq)

    # Columns that must not go through the imputer
    passthrough_cols = ['datetime', 'rainf']

    # Subset to impute (drop raises KeyError if a passthrough column is absent)
    features = raw.drop(columns=passthrough_cols)

    # Replacing zeros with NA values (for MICE algorithm)
    # NOTE(review): this also discards genuine zero readings, if any of
    # these columns can legitimately be 0 - confirm this is intended.
    features = features.replace(0, np.nan)
    missing_df = pd.DataFrame({"Missing Values (%)":features.isna().sum()/features.shape[0]*100 })

    # Named `imputer` so it does not shadow this function's own name
    imputer = IterativeImputer(max_iter = max_iter_vals)
    imputed_values = imputer.fit_transform(features)

    # Fail loudly instead of mislabelling columns if the imputer changed the
    # number of columns (presumably when a column has no observed value at
    # all - verify against the imputer's documentation).
    if imputed_values.shape[1] != features.shape[1]:
        raise ValueError(
            "Imputer returned {} columns for {} input columns; "
            "cannot map the result back to the original column names".format(
                imputed_values.shape[1], features.shape[1]))

    # Keep the original row index and column labels so that the columns set
    # aside are re-attached to the right rows and under the right names.
    data_mice = pd.DataFrame(imputed_values, index=features.index, columns=features.columns)

    # Reinserting columns in standby
    for col in passthrough_cols:
        data_mice[col] = raw[col]

    # Restore the original column order by name, not by position
    data_mice = data_mice[raw.columns]

    return data_mice, missing_df

In [5]:
# Run the imputation for the configured query. Returns the imputed data set
# and the per-column percentage of values that were treated as missing.
data_mice, missing_df = mice_imputer(sqlq)

Select * from sima_station_CE where datetime > '2020-04-20'


In [6]:
# Preview of the imputed data set (the notebook display elides the middle rows)
data_mice

Unnamed: 0,datetime,no,no2,nox,o3,pm10,pm25,prs,rainf,rh,so2,sr,tout,wdr,wsr
0,2020-04-20 01:00:00,2.672293,10.019655,12.321534,48.0,14.0,16.011868,707.9,0.0,33.0,4.781823,0.167,27.0,123.0,4.4
1,2020-04-22 10:00:00,1.600000,8.888484,10.699438,22.0,58.0,28.591051,706.9,0.0,69.0,3.600000,0.166,27.0,63.0,3.8
2,2020-04-24 01:00:00,2.250012,4.400000,6.200000,54.0,22.0,16.000000,708.1,0.0,31.0,2.600000,0.171,26.0,137.0,5.0
3,2020-04-24 15:00:00,2.000000,6.100000,8.000000,96.0,55.0,15.000000,706.8,0.0,9.0,3.000000,0.185,36.0,84.0,7.5
4,2020-04-24 21:00:00,1.600000,8.100000,9.800000,54.0,26.0,15.000000,706.7,0.0,15.0,2.400000,0.183,33.0,93.0,5.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17466,2022-04-14 18:00:00,2.400000,7.400000,10.000000,34.0,53.0,25.020000,708.3,0.0,54.0,6.200000,0.036,26.0,138.0,8.9
17467,2022-04-15 03:00:00,2.366088,3.400000,5.900000,14.0,35.0,15.610000,708.6,0.0,83.0,2.398755,0.001,21.0,52.0,7.6
17468,2022-04-15 11:00:00,13.400000,16.600000,30.300000,12.0,65.0,39.580000,709.9,0.0,73.0,4.100000,0.339,23.0,58.0,7.3
17469,2022-04-16 18:00:00,2.400000,6.600000,9.200000,53.0,67.0,48.140000,707.0,0.0,36.0,9.400000,0.136,33.0,85.0,13.0


In [7]:
# Percentage of values per column treated as missing (NaN or 0) before imputation
missing_df

Unnamed: 0,Missing Values (%)
no,17.188484
no2,11.939786
nox,11.762349
o3,14.698643
pm10,4.206972
pm25,11.355961
prs,2.255166
rh,2.215099
so2,11.230038
sr,16.541698


In [8]:
# The destination table is the source table name prefixed with "MVI_";
# the CSV export is named after that table.
new_table_name = "MVI_{}".format(sql_table)
file_name = "{}.csv".format(new_table_name)

# Write the imputed data without the DataFrame index column
data_mice.to_csv(file_name, index=False, encoding='utf-8')

# Table structure

In [9]:
# Data column names
df_cols = data_mice.columns

# MySQL type of each column: the 'datetime' column is DATETIME, every other
# column is FLOAT. The type is picked by column name rather than by position,
# so the statement stays correct wherever 'datetime' sits in the table.
dtypes = [' DATETIME NOT NULL' if col == 'datetime' else ' FLOAT NOT NULL' for col in df_cols]

print("DROP TABLE IF EXISTS HDL_Project.`MVI_{}`;".format(sql_table))
print()
print("CREATE TABLE HDL_Project.`MVI_{}` (".format(sql_table))

# One column definition per line; every line after the first starts with ", "
for position, (col, sql_type) in enumerate(zip(df_cols, dtypes)):
    prefix = "" if position == 0 else ", "
    print("{}`{}` {}".format(prefix, col, sql_type))

print(") COMMENT = \"Subset with imputed missing values from table \'{}\'\";".format(sql_table))

DROP TABLE IF EXISTS HDL_Project.`MVI_sima_station_CE`;

CREATE TABLE HDL_Project.`MVI_sima_station_CE` (
`datetime`  DATETIME NOT NULL
, `no`  FLOAT NOT NULL
, `no2`  FLOAT NOT NULL
, `nox`  FLOAT NOT NULL
, `o3`  FLOAT NOT NULL
, `pm10`  FLOAT NOT NULL
, `pm25`  FLOAT NOT NULL
, `prs`  FLOAT NOT NULL
, `rainf`  FLOAT NOT NULL
, `rh`  FLOAT NOT NULL
, `so2`  FLOAT NOT NULL
, `sr`  FLOAT NOT NULL
, `tout`  FLOAT NOT NULL
, `wdr`  FLOAT NOT NULL
, `wsr`  FLOAT NOT NULL
) COMMENT = "Subset with imputed missing values from table 'sima_station_CE'";


# Terminal command to load into MySQL environment

In [10]:
# Shell command that bulk-loads the exported CSV into the MySQL table.
# The password is deliberately left out: a bare `-p` makes the mysql client
# prompt for it, so the secret is never stored in the notebook or its output.
print("mysql -ucesar -p --local-infile HDL_Project -e \"LOAD DATA LOCAL INFILE \'{}.csv\'  INTO TABLE {} FIELDS TERMINATED BY \',\' LINES TERMINATED BY \'\\n\' IGNORE 1 ROWS\"; ".format(new_table_name, new_table_name))

mysql -ucesar -p --local-infile HDL_Project -e "LOAD DATA LOCAL INFILE 'MVI_sima_station_CE.csv'  INTO TABLE MVI_sima_station_CE FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 ROWS"; 


# Sources
* [sklearn.impute.IterativeImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html)