# Imports

In [None]:
from datetime import datetime

import pandas
import plotly.express as px
import plotly.graph_objects as go
from qolmat.imputations import imputers

import constants.constants as cst
from src.preprocessing import preprocess_data
from src.utils.load_data import load_data

# Data loading 

In [None]:
# Unpack the three objects returned by the project's load_data helper
# (presumably the training set, the test set and a submission template --
# confirm in src/utils/load_data).
train, test, submission = load_data()

# Data exploration

In [None]:
# Display the dimensions of the training data as (rows, columns).
train.shape

The training set contains $40991$ rows, with $5$ columns to predict.

In [None]:
# Display the summary statistics produced by describe() for the training data.
train.describe()

## Graphs

Let's convert the date column to actual datetime objects.

In [None]:
# Overwrite the raw date column with parsed datetimes. The format string
# expects "YYYY-MM-DD HH": a calendar date followed by the hour only.
train[cst.RAW_DATE] = pandas.to_datetime(train[cst.RAW_DATE], format="%Y-%m-%d %H")

In [None]:
# Split the training data chronologically around 1 January 2022: rows dated
# strictly before the cutoff go to one frame, rows on or after it to the other.
cutoff_date = datetime(year=2022, month=1, day=1)
train_before_cutoff = train.loc[train[cst.RAW_DATE].lt(cutoff_date)]
train_after_cutoff = train.loc[train[cst.RAW_DATE].ge(cutoff_date)]

In [None]:
# Build one scatter trace per target pollutant from the rows dated before the
# cutoff, then render them together on a single figure.
fig = go.Figure(
    data=[
        go.Scatter(
            x=train_before_cutoff[cst.RAW_DATE],
            y=train_before_cutoff[pollutant],
            name=pollutant,
        )
        for pollutant in cst.RAW_TARGETS
    ]
)

fig.show()

## Missing values

In [None]:
# Count the missing (null) entries in each column of the training data.
train.isnull().sum()

There are missing values in the training data (over $\frac{1}{4}$ of the values for CO, for instance), so we need to fill the gaps. For that we can use the `qolmat` library.

In [None]:
# Configure a qolmat simple imputer with the "mean" strategy.
imputer = imputers.ImputerSimple(strategy="mean")
# Fit the imputer on the target columns and overwrite them with the imputed
# values (presumably each column's own mean is used -- confirm against the
# qolmat documentation).
train[cst.RAW_TARGETS] = imputer.fit_transform(train[cst.RAW_TARGETS])

We can test different imputers:

# Final preprocessing

Based on this exploratory data analysis, we apply the following preprocessing:

In [None]:
# Run the project's preprocessing helper on the training data, handing it the
# `imputer` object created earlier in this notebook.
preprocessed_train = preprocess_data(train, imputer=imputer)