# Hello world

This is an example of a notebook that connects to the data warehouse.

In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Read the warehouse connection string from the CONNECTION_STRING environment
# variable; the literal below is only the fallback used when the variable is unset.
# A plain lookup with a default replaces the former bare `except:`, which would
# also have hidden unrelated errors (and KeyboardInterrupt).
# NOTE(review): the fallback embeds a user name and password - confirm these are
# non-secret development defaults before sharing the notebook.
connection_string = os.environ.get(
    "CONNECTION_STRING",
    "postgresql://user:pass@warehouse:5432/warehouse_db",
)

# The engine is shared by every query cell below.
engine = create_engine(connection_string)

## Example of a pure SQL query

In the example below we query the data warehouse with SQL and put the result in a pandas DataFrame.

In [6]:
# Per-measurement summary (mean, maximum, minimum) computed inside the warehouse.
# NULLIF(value, 'NaN') maps NaN readings to NULL, which the aggregates ignore;
# without it a single NaN would propagate into the result.
# The aggregate columns are aliased explicitly and the ordering has a tie-breaker,
# so column names and row order do not depend on database defaults.
query = """
    SELECT short_measure_name,
           unit,
           AVG(NULLIF(value, 'NaN')) AS avg,
           MAX(NULLIF(value, 'NaN')) AS max,
           MIN(NULLIF(value, 'NaN')) AS min
    FROM fact_measure fm
    JOIN dim_date dd
      ON fm.date_id = dd.id
    JOIN dim_time dt
      ON fm.time_id = dt.id
    JOIN dim_duration ddu
      ON fm.duration_id = ddu.id
    JOIN dim_station ds
      ON fm.source_id = ds.id
    JOIN dim_measurement_type dmt
      ON fm.measurement_type_id = dmt.id
    GROUP BY short_measure_name, unit
    ORDER BY unit, short_measure_name
"""
# Last expression of the cell: the resulting DataFrame is rendered as the output.
pd.read_sql(query, engine)

Unnamed: 0,short_measure_name,unit,avg,max,min
0,WD,°,169.621385,360.0,0.0
1,RH,%,64.661782,100.0,0.0
2,T,°C,17.564028,31.1,0.0
3,NO2,µg/m3,13.081291,188.0,0.0
4,O3,µg/m3,69.760467,126.0,0.0
5,PM10,µg/m3,25.572888,572.0,0.0
6,PM2.5,µg/m3,10.00723,791.0,-2.0
7,SO2,µg/m3,5.007019,172.0,0.0
8,PP,l/m2,0.005959,7.1,0.0
9,P,mb,998.068638,1024.0,0.0


## Example of filtering using Pandas

In the example below we query the data warehouse for all the data (at the moment there are only a few months of data, so it isn't much) and then use the functionality of the pandas package to process it.

In [5]:
# Fetch every measurement joined with all of its dimension tables, ordered by
# date and hour; all filtering happens afterwards in pandas.
query = """
    SELECT *
    FROM fact_measure fm
    JOIN dim_date dd 
      ON fm.date_id = dd.id
    JOIN dim_time dt
      ON fm.time_id = dt.id
    JOIN dim_duration ddu
      ON fm.duration_id = ddu.id
    JOIN dim_station ds
      ON fm.source_id = ds.id
    JOIN dim_measurement_type dmt
      ON fm.measurement_type_id = dmt.id
    ORDER BY date, hour
"""
# Keep the unfiltered result under its own name so other filters can be tried
# without querying the warehouse again.
measures_raw = pd.read_sql(query, engine)

# Boolean mask selecting the PM10 rows, and the columns worth showing.
row_filter = measures_raw["short_measure_name"] == "PM10"
col_filter = ["date", "hour", "value", "short_measure_name"]

# `data_frame` holds the filtered view; `measures_raw` is left untouched.
data_frame = measures_raw.loc[row_filter, col_filter]

# First 24 filtered rows, then summary statistics of the numeric columns.
print(data_frame.head(24))
print(data_frame.describe())

           date  hour  value short_measure_name
2    2019-01-01     0   29.0               PM10
6    2019-01-01     0   37.0               PM10
13   2019-01-01     0   24.0               PM10
18   2019-01-01     0   31.0               PM10
27   2019-01-01     0    NaN               PM10
32   2019-01-01     0   51.0               PM10
36   2019-01-01     0   16.0               PM10
41   2019-01-01     0    8.0               PM10
62   2019-01-01     0    7.0               PM10
77   2019-01-01     0   67.0               PM10
98   2019-01-01     1  106.0               PM10
107  2019-01-01     1   13.0               PM10
111  2019-01-01     1    8.0               PM10
139  2019-01-01     1    8.0               PM10
145  2019-01-01     1   71.0               PM10
146  2019-01-01     1   14.0               PM10
154  2019-01-01     1   18.0               PM10
158  2019-01-01     1   69.0               PM10
162  2019-01-01     1    NaN               PM10
172  2019-01-01     1   25.0            