# About
Processing environmental data

# Libraries

In [1]:
import json
import pandas as pd
import glob

# Data

The raw data contains the following columns (units in parentheses):

|Columns|Description|
|---	|---	|
|created|Data point creation date|
|date   |Measurement time|
|station|Meteorological station number|
|pressure|Surface Pressure (hPa)|
|temp|Temperature (K)|
|precip|Precipitation (mm)|
|wind|Wind (ms^-1)|
|gas|Including O3, NO2, PM2.5 (ug/m3)|

In [2]:
station_number = 's32'

# glob yields matches in arbitrary order; sorting keeps the row order of `df`
# reproducible between runs and machines.
files_list = sorted(glob.glob('{}*.json'.format(station_number)))
if not files_list:
    raise FileNotFoundError(
        "No JSON files found matching '{}*.json'".format(station_number)
    )

# Each file is expected to hold a JSON array of records; concatenate them all.
raw_data = []
for file_path in files_list:
    # The context manager closes every file handle, even if parsing fails.
    with open(file_path) as f:
        raw_data.extend(json.load(f))

if not raw_data:
    raise ValueError(
        "JSON files for station '{}' contain no records".format(station_number)
    )

# Column order follows the keys of the first record.
col_names = list(raw_data[0].keys())

df = pd.DataFrame(raw_data, columns=col_names)

# User-Defined Functions

In [39]:
class data_processing(object):
    """
    Clean a raw station-measurement dataset in chained stages.

    Every stage starts by running the stage before it, so calling
    `data_formatting()` executes the whole pipeline:
    - Drop column `created`
    - Split column `date` into `day` and `time`
    - Split `wind` into two columns
    - Split `gas` into multiple columns (one for each gas)
    - Convert every column to a datetime / numeric dtype

    The dataset passed to the constructor is never modified; each stage
    returns a new DataFrame that keeps the index of the input.
    """

    def __init__(self, df):
        """
        df: Raw dataset
        """
        self.df = df

    # Processing `created` column
    def process_created_col(self):
        """Return a copy of the raw dataset without the `created` column."""
        return self.df.drop("created", axis = 1)


    # Processing `date` column
    def process_date_col(self):
        """Replace `date` with separate `day` and `time` string columns."""
        df = self.process_created_col()
        # Split rows into two columns by string "T"
        tmp_date = df["date"].str.split("T", n = 2, expand = True)
        # Split rows of second column into two columns by string "Z"
        tmp_time = tmp_date[1].str.split("Z", n = 0, expand = True)
        df = df.assign(day=tmp_date[0], time=tmp_time[0])
        # Selecting the output columns also discards the original `date` column
        return df[['day', 'time', 'station', 'pressure', 'temp', 'precip', 'wind', 'gas']]

    # Processing `wind` column
    def process_wind_col(self):
        """Replace `wind` (two-element arrays) with one column per component."""
        df = self.process_date_col()
        # Split arrays in dataframe column "wind" into two columns.
        # Reusing df.index keeps the rows aligned when the input index is not 0..n-1;
        # a default RangeIndex would turn the assigned values into NaN.
        tmp_wind = pd.DataFrame(df["wind"].to_list(), columns=['c0','c1'], index=df.index)
        # NOTE(review): first element is treated as northward, second as eastward;
        # confirm against the data provider's documentation.
        df = df.assign(wind_northward=tmp_wind["c0"], wind_eastward=tmp_wind["c1"])
        return df[['day', 'time', 'station', 'pressure', 'temp', 'precip', 'wind_northward', 'wind_eastward', 'gas']]


    # Processing `gas` column
    def process_gas_col(self):
        """Replace `gas` (dictionaries) with the columns `O3`, `NO2` and `PM25`."""
        df = self.process_wind_col()

        gas_cols = list(df["gas"])
        # Split dictionary values in column "gas" into three columns
        # (index reused for the same alignment reason as in process_wind_col)
        tmp_gas = pd.DataFrame(gas_cols, columns=["o3", "no2", "pm25_gcc"], index=df.index)
        df = df.assign(O3=tmp_gas["o3"], NO2=tmp_gas["no2"], PM25=tmp_gas["pm25_gcc"])
        return df[['day', 'time', 'station', 'pressure', 'temp', 'precip', 'wind_northward', 'wind_eastward', 'O3', 'NO2', 'PM25']]

    def data_formatting(self):
        """
        Run the full pipeline and convert every column to its final dtype.

        `day` and `time` become datetimes (`time` carries no date, so pandas
        fills in 1900-01-01), `station` is downcast to the smallest integer
        type and all measurement columns to the smallest float type.
        """
        df = self.process_gas_col()

        float_cols = ['pressure', 'temp', 'precip', 'wind_northward', 'wind_eastward', 'O3', 'NO2', 'PM25']
        # Convert from the local frame only: reading a notebook-level variable
        # here would fail on a fresh kernel or silently reuse stale data.
        return df.assign(
            day=pd.to_datetime(df['day'], format='%Y-%m-%d'),
            time=pd.to_datetime(df['time'], format='%H:%M:%S.%f'),
            station=pd.to_numeric(df['station'], downcast='integer'),
            **{col: pd.to_numeric(df[col], downcast='float') for col in float_cols}
        )

In [40]:
# Initialization of processing class: wraps the raw frame `df`;
# nothing is processed until one of its methods is called
main_processed_df = data_processing(df)

# Execution of processing functions: `data_formatting()` runs every
# processing step in turn and returns the cleaned, typed frame
processed_df = main_processed_df.data_formatting()

# Overview: show the first rows of the result
processed_df.head()

Unnamed: 0,day,time,station,pressure,temp,precip,wind_northward,wind_eastward,O3,NO2,PM25
0,2022-01-15,1900-01-01 12:00:00,32,929.49469,286.240906,0.0,2.558,-3.1019,56.580799,16.778799,16.9466
1,2022-01-15,1900-01-01 13:00:00,32,930.693726,285.115112,0.0,2.6061,-3.1377,46.1796,11.7535,15.4532
2,2022-01-15,1900-01-01 14:00:00,32,932.097778,286.022491,0.0,3.2427,-4.0089,36.3652,21.883301,9.5109
3,2022-01-15,1900-01-01 15:00:00,32,933.090393,286.560394,0.0,4.1802,-4.9703,36.229,18.8256,9.1762
4,2022-01-15,1900-01-01 16:00:00,32,933.704895,287.03891,0.0,4.1854,-5.2742,40.741001,22.757099,9.2909


In [45]:
# Check the element type of the first O3 value after the numeric downcast
type(processed_df["O3"][0])

numpy.float32

# Relevant sources

* https://www.geeksforgeeks.org/read-json-file-using-python/
* https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior