In [111]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
created: 2020-03-22
updated:
author: Rafal
description: import data from lines of json to pandas
"""
import os
import pathlib
import json
import pandas as pd

ROOT_PATH = pathlib.Path().absolute()
FILES_PATH = os.path.join(ROOT_PATH, '../../rafalr/files')

airly_file = os.path.join(FILES_PATH, 'klobuck_szkolna_airly.json')
print(airly_file)

class Airly:
    debug = 0  # useful for development
    columns = {'from':str, 'to':str, 'pm1':float, 'pm25':float, 'pm10':float, 'press':float, 'humid':float, 'temp':float}  # dict of measurements

    def __init__(self, airly_file: str):
        self._airly_file = airly_file
        self._measurements_list = []
        if not Airly.debug:
            return None

        print(self._airly_file)

    def _24hours_history(self, json_string: str) -> list:
        one_day_measure = json.loads(json_string)
        history_list_group = one_day_measure['history']
        history_list = []
        for one_measure in history_list_group:
            values = one_measure['values']
            di_name_value = {}
            for pair in values:
                di_name_value[pair['name']] = pair['value']
                
            li = [
#                 one_measure['fromDateTime'].replace(':00Z',':00.000Z') , #.astype(str),
#                 one_measure['tillDateTime'].replace(':00Z',':00.000Z'), #.astype(str),
                
                pd.to_datetime(one_measure['fromDateTime'].replace(':00Z',':00.000Z'), utc=True),
                pd.to_datetime(one_measure['tillDateTime'].replace(':00Z',':00.000Z'), utc=True), 
                di_name_value['PM1'],
                di_name_value['PM10'],
                di_name_value['PM25'],
                di_name_value['PRESSURE'],
                di_name_value['HUMIDITY'],
                di_name_value['TEMPERATURE'],
            ]
            history_list.append(li)
        return history_list

    def _json_rows2array(self):
        max_process = 10000000
        if Airly.debug:
            max_process = Airly.debug
        counter = 0
        error_counter = 0
        self._measurements_list = []
        with open(self._airly_file) as f:
            for line in f:
                counter += 1
                if counter > max_process:
                    break
                try:
                    self._measurements_list.extend(self._24hours_history(line))
                except Exception:
                    error_counter += 1
                    # print(f'error in line: {counter}')
                    # print(line)
        print(f'Erorr counter = {error_counter}')
        return self._measurements_list

    def _create_df(self) -> pd.DataFrame:
        df = pd.DataFrame()
        df = df.append(pd.DataFrame(self._measurements_list, columns=Airly.columns), ignore_index=True)
        return df

    def import2df(self):
        self._measurements_list = self._json_rows2array()
        result = self._create_df()
        return result


klobuck_file = Airly(airly_file)
df = klobuck_file.import2df()
df


/Users/rafalroman/PycharmProjects/python_data_products_workshop/rafalr/jupyter/../../rafalr/files/klobuck_szkolna_airly.json
Erorr counter = 57


Unnamed: 0,from,to,pm1,pm25,pm10,press,humid,temp
0,2018-11-15 06:00:00+00:00,2018-11-15 07:00:00+00:00,17.76,47.25,28.26,1031.99,97.58,5.52
1,2018-11-15 07:00:00+00:00,2018-11-15 08:00:00+00:00,17.15,45.84,27.45,1032.25,96.25,6.03
2,2018-11-15 08:00:00+00:00,2018-11-15 09:00:00+00:00,14.66,39.69,23.74,1032.27,93.25,7.02
3,2018-11-15 09:00:00+00:00,2018-11-15 10:00:00+00:00,10.44,25.81,16.91,1032.26,96.25,7.52
4,2018-11-15 10:00:00+00:00,2018-11-15 11:00:00+00:00,9.19,17.71,13.01,1032.11,82.92,8.92
...,...,...,...,...,...,...,...,...
10459,2020-03-22 01:00:00+00:00,2020-03-22 02:00:00+00:00,6.01,12.75,8.92,1028.24,62.08,-1.95
10460,2020-03-22 02:00:00+00:00,2020-03-22 03:00:00+00:00,4.90,10.43,7.33,1028.24,71.59,-2.59
10461,2020-03-22 03:00:00+00:00,2020-03-22 04:00:00+00:00,5.55,11.63,8.22,1028.46,81.48,-2.97
10462,2020-03-22 04:00:00+00:00,2020-03-22 05:00:00+00:00,7.49,15.70,11.19,1028.75,85.25,-2.97


In [100]:
df.where(df['from'].between(pd.to_datetime('20190102',format='%Y%m%d'),\
                 pd.to_datetime('20190104',format='%Y%m%d'))\
        )

SyntaxError: invalid syntax (<ipython-input-100-48ca83810bc2>, line 2)

In [86]:
pd.to_datetime('20190102',\
                                format='%Y%m%d')

Timestamp('2019-01-02 00:00:00')

In [99]:
# start = pd.to_datetime('2019-01-02', utc=True)
# end = pd.to_datetime('2019-01-04', utc=True)
start= '2019-01-02'
end = '2019-01-04'
df[(df["from"] > start) & (df["from"] < end)]


Unnamed: 0,from,to,pm1,pm25,pm10,press,humid,temp
1146,2019-01-02T00:00:00.000Z,2019-01-02T01:00:00.000Z,3.6,7.62,5.38,1006.92,88.75,-0.13
1147,2019-01-02T01:00:00.000Z,2019-01-02T02:00:00.000Z,3.84,9.45,6.24,1006.57,85.0,-0.15
1148,2019-01-02T02:00:00.000Z,2019-01-02T03:00:00.000Z,3.46,8.73,5.97,1006.41,91.67,-0.69
1149,2019-01-02T03:00:00.000Z,2019-01-02T04:00:00.000Z,2.49,6.07,4.09,1006.55,92.5,-0.71
1150,2019-01-02T04:00:00.000Z,2019-01-02T05:00:00.000Z,5.6,14.06,9.09,1007.21,92.5,-0.48
1151,2019-01-02T05:00:00.000Z,2019-01-02T06:00:00.000Z,6.5,16.46,10.85,1007.98,92.5,-0.31
1152,2019-01-02T06:00:00.000Z,2019-01-02T07:00:00.000Z,8.81,20.73,14.25,1008.83,94.75,-0.19
1153,2019-01-02T07:00:00.000Z,2019-01-02T08:00:00.000Z,7.65,17.34,11.89,1009.58,97.67,-0.26
1154,2019-01-02T08:00:00.000Z,2019-01-02T09:00:00.000Z,11.49,24.81,17.16,1010.15,98.78,-0.62
1155,2019-01-02T09:00:00.000Z,2019-01-02T10:00:00.000Z,13.87,31.8,21.21,1010.94,97.45,-0.63


In [101]:
pandas.datetime64[ns]

NameError: name 'pandas' is not defined

In [104]:
%pwd

'/Users/rafalroman/PycharmProjects/python_data_products_workshop/rafalr/jupyter'