## Kalender Liturgi Scraper Data Consistency Check

Check the consistency of the scraped data by inspecting the values encountered in each field, including looking at their unique values.

In [1]:
import glob
import json
import re
from functools import reduce

import pandas as pd

# import files, directly from repo
!git clone https://github.com/akolites/kalender-liturgi
!ls ./kalender-liturgi/api/v1/202*.json

fatal: destination path 'kalender-liturgi' already exists and is not an empty directory.
./kalender-liturgi/api/v1/2021-08.json	./kalender-liturgi/api/v1/2022-02.json
./kalender-liturgi/api/v1/2021-09.json	./kalender-liturgi/api/v1/2022-03.json
./kalender-liturgi/api/v1/2021-10.json	./kalender-liturgi/api/v1/2022-04.json
./kalender-liturgi/api/v1/2021-11.json	./kalender-liturgi/api/v1/2022-05.json
./kalender-liturgi/api/v1/2021-12.json	./kalender-liturgi/api/v1/2022-06.json
./kalender-liturgi/api/v1/2022-01.json


In [2]:
# Build the list of monthly JSON files.
# glob is used instead of parsing `ls -l` with awk: the column-based parsing breaks
# on paths containing spaces and requires a POSIX shell. The result is sorted so the
# YYYY-MM file names come out in chronological order.
json_files = sorted(glob.glob('./kalender-liturgi/api/v1/202*.json'))
json_files

['./kalender-liturgi/api/v1/2021-08.json',
 './kalender-liturgi/api/v1/2021-09.json',
 './kalender-liturgi/api/v1/2021-10.json',
 './kalender-liturgi/api/v1/2021-11.json',
 './kalender-liturgi/api/v1/2021-12.json',
 './kalender-liturgi/api/v1/2022-01.json',
 './kalender-liturgi/api/v1/2022-02.json',
 './kalender-liturgi/api/v1/2022-03.json',
 './kalender-liturgi/api/v1/2022-04.json',
 './kalender-liturgi/api/v1/2022-05.json',
 './kalender-liturgi/api/v1/2022-06.json']

In [3]:
# not to be confused with pd.read_json
def read_json(path):
  """Load and return the parsed JSON document stored at ``path``.

  The file is decoded explicitly as UTF-8 so the result does not depend on the
  platform's default locale encoding.

  Args:
    path: Path of the JSON file to read.

  Returns:
    The deserialized JSON content, as returned by ``json.load``.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the content is not valid JSON.
  """
  with open(path, 'r', encoding='utf-8') as fh:
    return json.load(fh)

# Read every JSON file, flatten it into a DataFrame, and stack the frames row-wise.
# NOTE: each file's own row labels are kept, so index values repeat across months.
json_dfs = []
for json_path in json_files:
  records = read_json(json_path)
  json_dfs.append(pd.json_normalize(records))
df = pd.concat(json_dfs)  # default axis=0: append rows

df.head(5)

Unnamed: 0,date,localDate,url,name,dayColor,dayType,color,readings.innerHTML,readings.links
0,2021-08-01T00:00:00.000Z,1 Agustus 2021,https://www.imankatolik.or.id/kalender/1Agu.html,Hari Minggu Biasa XVIII,#ff0000,minggu,hijau,"<a href=""/alkitabq.php?q=Kel16:2-4;Kel16:12-15...",[{'url': 'https://www.imankatolik.or.id/alkita...
1,2021-08-02T00:00:00.000Z,2 Agustus 2021,https://www.imankatolik.or.id/kalender/2Agu.html,"Eusebius Vercelli, Petrus Yulianus Eymard",#000000,biasa,hijau,"<a href=""/alkitabq.php?q=Bil11:4-15;"" target=""...",[{'url': 'https://www.imankatolik.or.id/alkita...
2,2021-08-03T00:00:00.000Z,3 Agustus 2021,https://www.imankatolik.or.id/kalender/3Agu.html,Hari Biasa,#000000,biasa,hijau,"<a href=""/alkitabq.php?q=Bil12:1-13;"" target=""...",[{'url': 'https://www.imankatolik.or.id/alkita...
3,2021-08-04T00:00:00.000Z,4 Agustus 2021,https://www.imankatolik.or.id/kalender/4Agu.html,Peringatan Wajib Yohanes Maria Vianney,#000000,biasa,putih,"<a href=""/alkitabq.php?q=Bil13:1-2;Bil13:25-99...",[{'url': 'https://www.imankatolik.or.id/alkita...
4,2021-08-05T00:00:00.000Z,5 Agustus 2021,https://www.imankatolik.or.id/kalender/5Agu.html,Pemberkatan Gereja Basilik SP Maria,#000000,biasa,hijau,"<a href=""/alkitabq.php?q=Bil20:1-13;"" target=""...",[{'url': 'https://www.imankatolik.or.id/alkita...


In [4]:
# Summarize the merged frame: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 29
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   date                334 non-null    object
 1   localDate           334 non-null    object
 2   url                 334 non-null    object
 3   name                334 non-null    object
 4   dayColor            334 non-null    object
 5   dayType             334 non-null    object
 6   color               334 non-null    object
 7   readings.innerHTML  334 non-null    object
 8   readings.links      334 non-null    object
dtypes: object(9)
memory usage: 26.1+ KB


In [5]:
# Distinct values found in the `dayColor` column.
df['dayColor'].unique()

array(['#ff0000', '#000000', '#0000ff'], dtype=object)

In [6]:
# Distinct values found in the `dayType` column.
df['dayType'].unique()

array(['minggu', 'biasa', 'pesta', 'hariRaya'], dtype=object)

In [7]:
# Distinct raw values found in the `color` column.  #v1.5
df['color'].unique()

array(['hijau', 'putih', 'merah', 'putih\n', 'hijau\n', 'ungu/hitam',
       'ungu', 'ungu dan putih', '\n\nhijau', 'ungu/putih'], dtype=object)

In [8]:
# ----------------------------------

In [9]:
# Look at the reading links of the first row to see the structure of one entry.
df.iloc[0]['readings.links']

[{'text': 'Kel. 16:2-4,12-15',
  'url': 'https://www.imankatolik.or.id/alkitabq.php?q=Kel16:2-4;Kel16:12-15;'},
 {'text': 'Mzm. 78:3,4bc,23-24,25,54',
  'url': 'https://www.imankatolik.or.id/alkitabq.php?q=Mzm78:3;Mzm78:4;Mzm78:23-24;Mzm78:25;Mzm78:54;'},
 {'text': 'Ef. 4:17.20-24',
  'url': 'https://www.imankatolik.or.id/alkitabq.php?q=Ef4:17;Ef4:20-24;'},
 {'text': 'Yoh. 6:24-35',
  'url': 'https://www.imankatolik.or.id/alkitabq.php?q=Yoh6:24-35;'},
 {'text': 'BcO 1Raj. 19:1-21',
  'url': 'https://www.imankatolik.or.id/alkitabq.php?q=1Raj19:1-21;'}]

In [10]:
# Flatten the per-row lists of reading links into one table, one row per link.
# A single comprehension builds the flat list in linear time; the previous
# reduce(lambda x, y: x + y, ...) copied the accumulated list at every step and
# raised TypeError when given an empty sequence.
readings_links_df = pd.DataFrame(
    [link for row_links in df['readings.links'] for link in row_links]
)
readings_links_df

Unnamed: 0,url,text
0,https://www.imankatolik.or.id/alkitabq.php?q=K...,"Kel. 16:2-4,12-15"
1,https://www.imankatolik.or.id/alkitabq.php?q=M...,"Mzm. 78:3,4bc,23-24,25,54"
2,https://www.imankatolik.or.id/alkitabq.php?q=E...,Ef. 4:17.20-24
3,https://www.imankatolik.or.id/alkitabq.php?q=Y...,Yoh. 6:24-35
4,https://www.imankatolik.or.id/alkitabq.php?q=1...,BcO 1Raj. 19:1-21
...,...,...
1435,https://www.imankatolik.or.id/alkitabq.php?q=G...,BcO Gal. 1:15-2:10
1436,https://www.imankatolik.or.id/alkitabq.php?q=A...,Am. 7:10-17
1437,https://www.imankatolik.or.id/alkitabq.php?q=M...,"Mzm. 19:8,9,10,11"
1438,https://www.imankatolik.or.id/alkitabq.php?q=M...,Mat. 9:1-8


In [11]:
# Reduce each reading reference to its leading label: remove the "BcO" marker,
# then cut everything from the first " <digit>" onward and trim whitespace.
# The pattern is a raw string so that `\d` reaches the regex engine as written
# instead of being treated as an (invalid) string escape sequence.
links = readings_links_df.text.apply(lambda x: re.sub(r' \d.*', "", x.replace("BcO", "")).strip())
links.unique(), links

(array(['Kel.', 'Mzm.', 'Ef.', 'Yoh.', '', 'Bil.', 'Mat.', 'Dan.', '2 Ptr',
        'Mrk.', 'Ul.', '1Raj.', '2Kor.', 'Kis.', 'Yos.', 'Sore', '1Kor.',
        'Luk.', 'Why.', 'Hak.', 'Sir.', '1Ptr.', 'Gal.', 'Rut.', '1Tes.',
        '. Ef.', 'Flm.', 'Yak.', 'Am.', 'Kol.', 'Yes.', 'Am', 'Mi.', 'Rm.',
        'Kej.', 'Hos.', '1Tim.', 'Hos', 'Flp.', 'Ibr.', 'Mzm', 'Keb.',
        'Ezr.', 'Yes', 'Hag.', 'Za.', 'MT Yer.', 'Why', 'Neh.', 'Bar.',
        'Yun.', 'MT Yun.', 'Mal', 'Yl.', 'Zef.', 'Yer.', '2Tim.', 'Hab.',
        'Yer', '1Yoh.', '2Mak.', 'Yeh.', '1Mak.', 'MT', 'Luk', 'MT Dan.',
        'Mat', 'MT Yes.', 'Kid.', '1Sam.', 'Mal.', 'Pagi', 'Sore Yes.',
        'Misa Yes.', 'Tit.', 'Siang Yes.', '2Sam.', 'BcE', 'Kej', 'Kis',
        'Mrk', 'Gal', '1Kor', 'Kel', '1Ptr', 'Mrk.9:41-50', 'Im.', 'Est.',
        '2Sam', 'Rm', 'Ibr', '2Raj.', 'Bil', 'BcP Luk', 'BcE Yes.', 'Rat.',
        'Pagi Yes.', 'Sore Kel.', 'VIGILI Kej.', 'Bc2 Kej.', 'Bc3 Kel.',
        'MT Kel.', 'Bc4 Yes.', 'Bc5 Yes.

The parser handles non-ordinary days incorrectly in several ways.

From the output above, the affected entries can be classified by time of day:
pagi (morning), siang (midday), sore (evening)

Others are:
epistola, vigili, bc2-bcN

These also need to be handled: -bcd, HARI,