Reference: pandas getting-started tutorial "How to reshape the layout of tables" — https://pandas.pydata.org/docs/getting_started/intro_tutorials/07_reshape_table_layout.html

In [1]:
import pandas as pd

In [4]:

# Load the long-format measurements; the "date.utc" column becomes the
# index and is parsed into datetimes (parse_dates=True applies to the index).
air_quality = pd.read_csv("air_quality_long.csv", parse_dates=True, index_col="date.utc")

# Preview the first rows of the loaded table.
air_quality.head()

Unnamed: 0_level_0,city,country,location,parameter,value,unit
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-18 06:00:00+00:00,Antwerpen,BE,BETR801,pm25,18.0,µg/m³
2019-06-17 08:00:00+00:00,Antwerpen,BE,BETR801,pm25,6.5,µg/m³
2019-06-17 07:00:00+00:00,Antwerpen,BE,BETR801,pm25,18.5,µg/m³
2019-06-17 06:00:00+00:00,Antwerpen,BE,BETR801,pm25,16.0,µg/m³
2019-06-17 05:00:00+00:00,Antwerpen,BE,BETR801,pm25,7.5,µg/m³


In [3]:
# Summarize the frame: index type and range, column names, non-null counts, dtypes, memory use.
air_quality.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5272 entries, 2019-06-18 06:00:00+00:00 to 2019-04-09 02:00:00+00:00
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   city       5272 non-null   object 
 1   country    5272 non-null   object 
 2   location   5272 non-null   object 
 3   parameter  5272 non-null   object 
 4   value      5272 non-null   float64
 5   unit       5272 non-null   object 
dtypes: float64(1), object(5)
memory usage: 288.3+ KB


In [8]:

# Show the distinct values found in the "parameter" column.
air_quality.loc[:, "parameter"].unique()


array(['pm25', 'no2'], dtype=object)

In [22]:

# Keep only the rows whose "parameter" is "no2".
no2 = air_quality.loc[air_quality["parameter"].eq("no2")]

# Print a structural summary of the filtered rows ...
no2.info()

# ... and render the filtered table as the cell output.
no2

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3447 entries, 2019-06-21 00:00:00+00:00 to 2019-04-09 02:00:00+00:00
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   city       3447 non-null   object 
 1   country    3447 non-null   object 
 2   location   3447 non-null   object 
 3   parameter  3447 non-null   object 
 4   value      3447 non-null   float64
 5   unit       3447 non-null   object 
dtypes: float64(1), object(5)
memory usage: 188.5+ KB


Unnamed: 0_level_0,city,country,location,parameter,value,unit
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-21 00:00:00+00:00,Paris,FR,FR04014,no2,20.0,µg/m³
2019-06-20 23:00:00+00:00,Paris,FR,FR04014,no2,21.8,µg/m³
2019-06-20 22:00:00+00:00,Paris,FR,FR04014,no2,26.5,µg/m³
2019-06-20 21:00:00+00:00,Paris,FR,FR04014,no2,24.9,µg/m³
2019-06-20 20:00:00+00:00,Paris,FR,FR04014,no2,21.4,µg/m³
...,...,...,...,...,...,...
2019-04-09 06:00:00+00:00,London,GB,London Westminster,no2,41.0,µg/m³
2019-04-09 05:00:00+00:00,London,GB,London Westminster,no2,41.0,µg/m³
2019-04-09 04:00:00+00:00,London,GB,London Westminster,no2,41.0,µg/m³
2019-04-09 03:00:00+00:00,London,GB,London Westminster,no2,67.0,µg/m³


In [26]:

# Restrict the NO2 data to the first two measurements per location:
# sort by the datetime index, group by "location", and take the head of each group.
# The result is stored as no2_subset.
no2_subset = (
    no2
    .sort_index()
    .groupby("location")
    .head(2)
)


In [27]:

# Show the distinct locations that remain in the subset.
no2_subset.loc[:, "location"].unique()

array(['BETR801', 'FR04014', 'London Westminster'], dtype=object)

In [28]:


# Print a structural summary of the subset.
no2_subset.info()

# Display up to the first 100 rows of the subset.
no2_subset.iloc[:100]

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2019-04-09 01:00:00+00:00 to 2019-04-09 03:00:00+00:00
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   city       6 non-null      object 
 1   country    6 non-null      object 
 2   location   6 non-null      object 
 3   parameter  6 non-null      object 
 4   value      6 non-null      float64
 5   unit       6 non-null      object 
dtypes: float64(1), object(5)
memory usage: 336.0+ bytes


Unnamed: 0_level_0,city,country,location,parameter,value,unit
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-04-09 01:00:00+00:00,Antwerpen,BE,BETR801,no2,22.5,µg/m³
2019-04-09 01:00:00+00:00,Paris,FR,FR04014,no2,24.4,µg/m³
2019-04-09 02:00:00+00:00,London,GB,London Westminster,no2,67.0,µg/m³
2019-04-09 02:00:00+00:00,Antwerpen,BE,BETR801,no2,53.5,µg/m³
2019-04-09 02:00:00+00:00,Paris,FR,FR04014,no2,27.4,µg/m³
2019-04-09 03:00:00+00:00,London,GB,London Westminster,no2,67.0,µg/m³
