# Hello world

This is an example of a notebook that connects to the data warehouse.

In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Take the warehouse connection string from the CONNECTION_STRING environment
# variable; fall back to the default below only when that variable is not set.
# (A mapping lookup with a default replaces the former bare `except:`, which
# would also have hidden unrelated errors.)
# NOTE(review): the fallback URL embeds a username and password -- presumably
# placeholder values for a local setup; confirm, and never commit real
# credentials here.
connection_string = os.environ.get(
    "CONNECTION_STRING",
    'postgresql://user:pass@warehouse:5432/warehouse_db',
)

# SQLAlchemy engine used by the query cells further down.
engine = create_engine(connection_string)

## Example of a pure SQL query

In the example below we query the data warehouse with SQL and load the result into a pandas DataFrame.

In [3]:
# Per-measurement summary statistics, computed entirely inside the database.
# The query joins fact_measure to its date, time, duration, station and
# measurement-type dimension tables, then groups by short_measure_name and unit
# and takes AVG / MAX / MIN of the measured value.
# NULLIF(value, 'NaN') turns NaN readings into NULL before aggregating
# (SQL aggregates skip NULL) -- presumably so NaN values do not distort the
# statistics; confirm against the warehouse's data conventions.
# NOTE(review): the three aggregate columns have no explicit aliases, so their
# names in the resulting frame are whatever the database assigns by default.
query = """
    SELECT short_measure_name, unit, AVG(NULLIF(value, 'NaN')), MAX(NULLIF(value, 'NaN')), MIN(NULLIF(value, 'NaN'))
    FROM fact_measure fm
    JOIN dim_date dd 
      ON fm.date_id = dd.id
    JOIN dim_time dt
      ON fm.time_id = dt.id
    JOIN dim_duration ddu
      ON fm.duration_id = ddu.id
    JOIN dim_station ds
      ON fm.source_id = ds.id
    JOIN dim_measurement_type dmt
      ON fm.measurement_type_id = dmt.id
    GROUP BY short_measure_name, unit
"""
# Last expression of the cell: the query result is shown as the cell output.
pd.read_sql(query, engine)

Unnamed: 0,short_measure_name,unit,avg,max,min
0,SO2,µg/m3,4.784336,172.0,2.0
1,NO2,µg/m3,16.42409,103.0,1.0
2,O3,µg/m3,73.577871,119.0,6.0
3,PM10,µg/m3,24.317623,201.0,1.0
4,PM2.5,µg/m3,8.44104,67.0,1.0


## Example of filtering using Pandas

In the example below we query the data warehouse for all of the data (at the moment there are only a few months of data, from a single source, so it isn't much). We then use the functionality of the pandas package to process the data.

In [49]:
# Fetch every fact row together with all of its dimension attributes, ordered
# by date and hour. Nothing is filtered in SQL here: the narrowing down is done
# with pandas below, which is the point of this example.
query = """
    SELECT *
    FROM fact_measure fm
    JOIN dim_date dd 
      ON fm.date_id = dd.id
    JOIN dim_time dt
      ON fm.time_id = dt.id
    JOIN dim_duration ddu
      ON fm.duration_id = ddu.id
    JOIN dim_station ds
      ON fm.source_id = ds.id
    JOIN dim_measurement_type dmt
      ON fm.measurement_type_id = dmt.id
    ORDER BY date, hour
"""
data_frame = pd.read_sql(sql=query, con=engine)

# Boolean mask picking the PM10 rows, plus the columns worth showing.
row_filter = data_frame["short_measure_name"].eq("PM10")
col_filter = ["date", "hour", "value", "short_measure_name"]

# Apply the row and column selection in a single .loc lookup.
data_frame = data_frame.loc[row_filter, col_filter]

# Plain-text output: the first 24 rows, followed by summary statistics.
print(data_frame.head(24), data_frame.describe(), sep="\n")

           date  hour  value short_measure_name
4    2019-01-01     0   51.0               PM10
9    2019-01-01     1   71.0               PM10
11   2019-01-01     2   49.0               PM10
15   2019-01-01     3   22.0               PM10
21   2019-01-01     4   12.0               PM10
28   2019-01-01     5   10.0               PM10
31   2019-01-01     6   14.0               PM10
35   2019-01-01     7   14.0               PM10
42   2019-01-01     8   23.0               PM10
45   2019-01-01     9   28.0               PM10
52   2019-01-01    10   22.0               PM10
56   2019-01-01    11   33.0               PM10
63   2019-01-01    12   41.0               PM10
67   2019-01-01    13   40.0               PM10
71   2019-01-01    14   38.0               PM10
75   2019-01-01    15   38.0               PM10
83   2019-01-01    16   40.0               PM10
87   2019-01-01    17   44.0               PM10
90   2019-01-01    18   51.0               PM10
99   2019-01-01    19   55.0            