# Package import

In [48]:
import json
import requests
import ndjson
import pandas as pd
import os
from pandas_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


# Functions

In [32]:
import ndjson
from typing import List, Dict

def load_ndjon(file_path: str) -> List[Dict]:
    """Load a newline-delimited JSON (NDJSON) file.

    Args:
        file_path: Path of the NDJSON file to read.

    Returns:
        Whatever ``ndjson.load`` parses from the file; annotated as a list of
        dicts.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8: the records contain non-ASCII text (for
    # example the unit "µg/m³"), and the platform default encoding is not
    # guaranteed to be UTF-8.
    with open(file_path, encoding="utf-8") as f:
        return ndjson.load(f)

# Testing the OpenAQ API

In [3]:


# Query the OpenAQ v2 "sources" endpoint and inspect the first result.
url = "https://api.openaq.org/v2/sources?limit=100&page=1&offset=0&sort=asc&order_by=sourceName"

headers = {"accept": "application/json"}

# requests applies no timeout by default; set one so the cell cannot hang
# forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)
# Surface HTTP errors here instead of failing later on an unexpected payload.
response.raise_for_status()

json_data = response.json()
json_data['results'][0]


{'data': {'url': 'https://www.adairquality.ae/',
  'data_avg_dur': None,
  'organization': None,
  'lifecycle_stage': None},
 'readme': None,
 'sourceId': 9800851,
 'locations': 81,
 'sourceName': 'Abu Dhabi Air Quality',
 'sourceSlug': None}

# Exploring the data

In [6]:
# Load the country code/name lookup table and display it.
countries_csv = '../raw_data/countries.csv'
countries_df = pd.read_csv(countries_csv)
countries_df

Unnamed: 0,country_code,country_name
0,GB,Great Britain
1,FR,France
2,NL,Netherlands


['1633539068.ndjson',
 '1633531868.ndjson',
 '1633542067.ndjson',
 '1633534868.ndjson',
 '1633546872.ndjson',
 '1633527666.ndjson']

In [40]:
data_path = os.path.join('..', 'raw_data', 'air_quality')

# os.listdir returns entries in arbitrary order and includes everything in the
# directory. Sort the names and keep only NDJSON files so the load order (and
# therefore data[0]) is reproducible and unrelated entries are skipped.
ndjson_filenames = sorted(
    name for name in os.listdir(data_path) if name.endswith('.ndjson')
)

data = []
for filename in ndjson_filenames:
    data.extend(load_ndjon(os.path.join(data_path, filename)))

# Fail with a clear message rather than an IndexError on the lookup below.
assert data, f"no NDJSON records found in {data_path}"

# Peek at the first raw record.
first_element = data[0]
first_element

{'date': {'utc': '2021-08-15T14:30:00.000Z',
  'local': '2021-08-15T19:00:00+04:30'},
 'parameter': 'pm25',
 'value': -999,
 'unit': 'µg/m³',
 'averagingPeriod': {'value': 1, 'unit': 'hours'},
 'location': 'US Diplomatic Post: Kabul',
 'city': 'Kabul',
 'country': 'AF',
 'coordinates': {'latitude': 34.535812, 'longitude': 69.190514},
 'attribution': [{'name': 'EPA AirNow DOS',
   'url': 'http://airnow.gov/index.cfm?action=airnow.global_summary'}],
 'sourceName': 'StateAir_Kabul',
 'sourceType': 'government',
 'mobile': False}

In [41]:
# Flatten the nested records into one table; nested keys are joined with "_"
# (e.g. date -> utc becomes the column date_utc).
df = pd.json_normalize(data=data, sep="_")

df.head(n=5)

Unnamed: 0,parameter,value,unit,location,city,country,attribution,sourceName,sourceType,mobile,date_utc,date_local,averagingPeriod_value,averagingPeriod_unit,coordinates_latitude,coordinates_longitude
0,pm25,-999.0,µg/m³,US Diplomatic Post: Kabul,Kabul,AF,"[{'name': 'EPA AirNow DOS', 'url': 'http://air...",StateAir_Kabul,government,False,2021-08-15T14:30:00.000Z,2021-08-15T19:00:00+04:30,1.0,hours,34.535812,69.190514
1,pm25,-999.0,µg/m³,US Diplomatic Post: Kabul,Kabul,AF,"[{'name': 'EPA AirNow DOS', 'url': 'http://air...",StateAir_Kabul,government,False,2021-08-15T15:30:00.000Z,2021-08-15T20:00:00+04:30,1.0,hours,34.535812,69.190514
2,pm25,-999.0,µg/m³,US Diplomatic Post: Kabul,Kabul,AF,"[{'name': 'EPA AirNow DOS', 'url': 'http://air...",StateAir_Kabul,government,False,2021-08-15T16:30:00.000Z,2021-08-15T21:00:00+04:30,1.0,hours,34.535812,69.190514
3,pm25,-999.0,µg/m³,US Diplomatic Post: Kabul,Kabul,AF,"[{'name': 'EPA AirNow DOS', 'url': 'http://air...",StateAir_Kabul,government,False,2021-08-15T17:30:00.000Z,2021-08-15T22:00:00+04:30,1.0,hours,34.535812,69.190514
4,pm25,-999.0,µg/m³,US Diplomatic Post: Kabul,Kabul,AF,"[{'name': 'EPA AirNow DOS', 'url': 'http://air...",StateAir_Kabul,government,False,2021-08-15T18:30:00.000Z,2021-08-15T23:00:00+04:30,1.0,hours,34.535812,69.190514


In [42]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

parameter                 object
value                    float64
unit                      object
location                  object
city                      object
country                   object
attribution               object
sourceName                object
sourceType                object
mobile                      bool
date_utc                  object
date_local                object
averagingPeriod_value    float64
averagingPeriod_unit      object
coordinates_latitude     float64
coordinates_longitude    float64
dtype: object

In [47]:
# Print the frame summary: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77273 entries, 0 to 77272
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   parameter              77273 non-null  object 
 1   value                  77273 non-null  float64
 2   unit                   77273 non-null  object 
 3   location               77273 non-null  object 
 4   city                   77273 non-null  object 
 5   country                77273 non-null  object 
 6   attribution            77273 non-null  object 
 7   sourceName             77273 non-null  object 
 8   sourceType             77273 non-null  object 
 9   mobile                 77273 non-null  bool   
 10  date_utc               77273 non-null  object 
 11  date_local             77273 non-null  object 
 12  averagingPeriod_value  77273 non-null  float64
 13  averagingPeriod_unit   77273 non-null  object 
 14  coordinates_latitude   77063 non-null  float64
 15  co

In [50]:
# Build a profiling report for df and write it out as an HTML file.
report_title = "Air Quality data Report"
report_file = "air_quality_report.html"

profile = ProfileReport(df, title=report_title)
profile.to_file(report_file)

  return func(*args, **kwargs)
  return func(*args, **kwargs)
  return func(*args, **kwargs)
Summarize dataset: 100%|██████████| 41/41 [00:12<00:00,  3.33it/s, Completed]                                           
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.61s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 379.68it/s]


# Data models

In [1]:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column
from sqlalchemy.sql.sqltypes import Integer, String

# Declarative base class that the ORM model below inherits from.
Base = declarative_base()


class DBSources(Base):
    """ORM model mapped to the ``source`` table."""

    __tablename__ = "source"

    # Auto-incrementing integer primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Required (NOT NULL) name, stored as a string of up to 250 characters.
    name = Column(String(250), nullable=False)
    # NOTE(review): presumably the `sourceId` field of the OpenAQ sources
    # payload; confirm against the loader.
    sourceId = Column(Integer)
    # NOTE(review): presumably the `locations` count from the OpenAQ sources
    # payload; confirm (the attribute name here is singular).
    location = Column(Integer)
    