# SI Scraper

This notebook scrapes data from [Siamensis SI](http://www.siamensis.org/species_index).

## Scraping

- Get a data object from Siamensis SI.
- Loop through all children nodes recursively.
- Get `id`, `num_children`, `children_id` from each node and store as a list.
- Download `html` file of each `id` into `../node` folder. (this step may take a while)

## Parsing

- Loop through each item in a scraped list.
- Parse each `html` page and store parsed data in a dict.
- Save this parsed data as `json`.


### Get Data Object

In [474]:
import requests
import json
import re
import pandas as pd
import ast
import html
from os import path, pardir, mkdir
from bs4 import BeautifulSoup
from tqdm import tqdm
from glob import glob
from copy import deepcopy
from dateutil import parser
from datetime import datetime as dt

### Scraping helper functions

In [101]:
# function to get ids of all children in a list
def idGetter(children_ls):
    ids = []
    for child in children_ls:
        ids.append(child['attr']['link'].split('/')[-1])
    return ids

# function to scrapte data in the object recursively
def scraper(obj, keeper=[]):
    # each item is stored in dict
    item_dict = dict()
    # loop through keys in the object
    for key in obj.keys():
        # take attr link as an id and put in dict
        if key == 'attr':
            link_id = obj[key]['link'].split('/')[-1]
            # print(f"getting data of node id: {link_id}..")
            item_dict['id'] = link_id
        # get ids of children and count and put in dict
        elif key == 'children':
            all_ids = idGetter(obj[key])
            item_dict['num_children'] = len(all_ids)
            item_dict['children_ids'] = all_ids
            for item in obj[key]:
                # then scrape each children object with scraper
                # this will do recursively until no more obj
                scraper(item, keeper)
    # store each item dict in a keeper
    keeper.append(item_dict)
    # and return when all is done
    return keeper

### Any step need to run this cell first

In [570]:
# get whole tree from endpoint
r = requests.get('http://www.siamensis.org/json?type=tree')
# get json from request
si_json = r.json()[0][0]
# extract node from si_json big object
extracted_node = scraper(si_json)
# show keys of json obj
si_json.keys()

dict_keys(['data', 'attr', 'mlid', 'num_children', 'children'])

### Start scraping

In [109]:
!mkdir -p ../node

In [110]:
# save all extracted_node as html per node
# this cell can take as long as 1.5 hrs, you can skip and use files in ../node
for node in tqdm(extracted_node):
    url = f'http://www.siamensis.org/species_index/node/{node["id"]}'
    r = requests.get(url)
    save_path = f'../node/{node["id"]}.html'
    
    with open(save_path, 'w') as f:
        f.write(r.text)

100%|██████████| 6510/6510 [1:29:47<00:00,  1.12s/it]


### Parsing helper functions

In [563]:
def siDatetimeParser(date_str):
    # if date str is in English just send to normal parser
    if re.match(r'.*[a-zA-Z].*', date_str):
        return parser.parse(date_str)
    
    # if not split and clean then send to parse as Thai year
    date_tuple = tuple(filter(lambda x: x != '', date_str.split(' ')))
    
    # month mapper
    months = {
        'Jan': ['มค', 'มกราคม', 'มกรา'],
        'Feb': ['กพ', 'กุมภาพันธ์', 'กุมภา'],
        'Mar': ['มีค', 'มีนาคม', 'มีนา'],
        'Apr': ['เมย', 'เมษายน', 'เมษา'],
        'May': ['พค', 'พฤษภาคม', 'พฤษภา'],
        'Jun': ['มิย', 'มิถุนายน', 'มิถุนา'],
        'Jul': ['กค', 'กรกฎาคม', 'กรกฎา'],
        'Aug': ['สค', 'สิงหาคม', 'สิงหา'],
        'Sep': ['กย', 'กันยายน', 'กันยา'],
        'Oct': ['ตค', 'ตุลาคม', 'ตุลา'],
        'Nov': ['พย', 'พฤศจิกายน', 'พฤศจิกา'],
        'Dec': ['ธค', 'ธันวาคม', 'ธันวา']
    }
    
    # strip possible artifacts like dot and space
    month_strip = date_tuple[1].replace('.', '').replace(' ', '')
    year_strip = date_tuple[2]
    
    # roughly check if this is really buddhist calendar
    # if not return the input year
    if int(year_strip) > dt.now().year:
        # this is most likely a buddhist calendar in the present time
        yc = str(int(year_strip) - 543)
    else:
        yc = year_strip
        
    # this is real risky because no way to detect error
    # but just skip it for now :d
    for m in months.keys():
        if month_strip in months[m]:
            mc = m
            break
            
    return parser.parse(' '.join([date_tuple[0], mc, yc]), dayfirst=True)
    
def get_id(raw_node):
    return raw_node['id']

def get_rank_and_title(node_soup):
    # select rank and title tag
    tmp = node_soup.select('.node-title')[0].text
    
    # get rank first
    main_rank_pt = r'^[\s ฺฺ]*([a-zA-Z]+)\s*'
    rank_pt = re.compile(fr'{main_rank_pt}\:')
    no_colon_pt = re.compile(fr'{main_rank_pt}')
    try:
        rank = rank_pt.search(tmp).group(1)
    except:
        try:
            rank = no_colon_pt.search(tmp).group(1)
            # print(rank)
        except:
            print('---- exception is throw at [rank]:')
            print(each)
            print('---- prettified')
            print(node_soup.prettify())
            # return None so the outer loop breaks
            return None
    
    # if rank passes get title
    main_title_pt = r'([\(\)a-zA-ZÀ-ÖØ-öø-ÿĀ-ž \-,&0-9\.=ก-๙\[\]\']+)\s*$'
    title_pt = re.compile(fr':\s*{main_title_pt}')
    no_colon_pt = re.compile(fr'^\s*[a-zA-ZÀ-ÖØ-öø-ÿĀ-ž]+ {main_title_pt}')
    try:
        title = title_pt.search(tmp).group(1)
    except:
        try:
            title = no_colon_pt.search(tmp).group(1)
            # print(title)
        except:
            print('---- exception is throw at [title]:')
            print(each)
            print('---- prettified')
            print(tmp)
            print(node_soup.prettify())
            # return None so the outer loop breaks
            return None
    # if all pass, return properly
    return (rank, title)

def get_author_and_timestamp(node_soup):
    # select tag where author and timestamp live
    tmp = node_soup.select('.node-submitted')[0].text
    # some of the datetime is appended with this abnormal text
    tmp = tmp.replace('(IP:  )', '')
    
    author_pt = re.compile(r'.*เขียนโดย (.*) เมื่อ.*')
    author = author_pt.search(tmp).group(1)
    
    timestamp_pt = re.compile(r'.*เมื่อ (.*)$')
    
    try:
        timestamp = siDatetimeParser(timestamp_pt.search(tmp).group(1))
    except:
        print('get:', timestamp_pt.search(tmp).group(1))
        print('actual:', tmp)
        # return None so the outer loop breaks
        return None
    
    return (author, timestamp)

def get_description_and_images(node_soup):
    # get all tags after node-header class
    # this is where description and images live.. in HTML
    tmp = node_soup.select('.node-header')[0].next_siblings
    # declare empty content and images to append later
    content = ''
    images = []
    
    # loop through each tag
    for tag in tmp:
        try:
            # try getting attributes keys
            attrs_keys = tag.attrs.keys()
            
            if tag.name == 'div':
                # if it is a class check if it is the last tag of content
                if 'class' in attrs_keys:
                    # print(tag.attrs.keys(), tag.attrs['class'], tag.attrs['class'][0] == 'node-submitted')
                    if tag.attrs['class'][0] == 'node-submitted':
                        # print(tag, 'break')
                        break
                # if it is an id check if it is images div
                if 'id' in attrs_keys:
                    # additional images
                    if tag.attrs['id'] == 'jstree_thumb':
                        # extract image url from within href of a
                        img_links = [f'http://www.siamensis.org{x["href"]}' for x in tag.find_all('a', attrs={'class': 'si-image'})]
                        # caption is from alt of img 
                        captions = [f'{x["alt"]}' for x in tag.find_all('img', attrs={'class': 'image-item'})]
                        # construct image array with obj of images
                        images = [{
                            'url': k,
                            'caption': v
                        } for k,v in zip(img_links, captions)]
                        
                        # print(images)
                        break
            try:
                # if none of above, this is part of a description
                # add it to content
                content += tag.prettify()
            except:
                pass
        except:
            pass
    # finally return content and images which can be empty
    return (content, images)
                                    
def get_parent_list(node_soup):
     # list of parents
    tmp = node_soup.find(attrs={'type': 'text/javascript'}).text
    parent_pt = re.compile(r"jQ.parseJSON\('(.*)'\).*")  
    parent_list = parent_pt.search(tmp).group(1)
    # turn list that is parsed as string into a list
    parent_list = ast.literal_eval(parent_list)
    # remove all white space
    parent_list = [x.strip() for x in parent_list]
    # remove itself from the list
    parent_list.remove(each['id'])        
    
    return parent_list

### Start parsing

In [123]:
nodes = glob('../node/*.html')

In [139]:
copied_extracted_node = deepcopy(extracted_node)

In [565]:
# pre-allocate counter and list for collecting all nodes
counter = 0
si_collection = []

for each in tqdm(copied_extracted_node):
    # pre-allocate object
    si_node = {
        'id': '',
        'rank': '',
        'title': '',
        'author': '',
        'timestamp': '',
        'description': '',
        'images': [],
        'parents': [],
    }
    
    ## ---- ID
    si_node['id'] = get_id(each)
    
    ## ---- opening the file
    # file name of each html
    file_name = f"../node/{each['id']}.html"
    # soup of each html
    soup = BeautifulSoup(open(file_name, "r"), "html.parser")

    ## ---- RANK AND TITLE
    # get rank and rank name
    node_rank, node_title = get_rank_and_title(soup)
        
    if not node_rank or not node_title:
        break
        
    # assign to a node
    si_node['rank'] = node_rank
    si_node['title'] = node_title

    ## ---- AUTHOR AND TIMESTAMP
    node_author, node_timestamp = get_author_and_timestamp(soup)
    if not node_author or not node_timestamp:
        break
    
    # assign to a node
    si_node['author'] = node_author
    si_node['timestamp'] = node_timestamp.strftime("%d %b %Y, %H:%M")
           
    ## ---- DESCRIPTION AND IMAGES
    # get description and images
    node_content, node_images = get_description_and_images(soup)
    
    # assign to a node
    si_node['description'] = node_content
    si_node['images'] = node_images
    
    ## ---- PARENT LIST
    # assign to a node
    node_parent_list = get_parent_list(soup)
    si_node['parents'] = node_parent_list
                              
    ## ---- add to collection
    # add to list and count success 
    si_collection.append(si_node)
    counter += 1
                                    
print(f'DONE at: {counter}')

100%|██████████| 6510/6510 [00:24<00:00, 265.23it/s]

DONE at: 6510





### Add number of children to each node

In [566]:
cloned_collection = deepcopy(si_collection)

In [567]:
for idx, node in enumerate(tqdm(cloned_collection)):
    tmp_id = node['id']
    tmp = list(map(lambda x: tmp_id in x['parents'], si_collection))
    
    cloned_collection[idx]['num_children'] = tmp.count(True)

100%|██████████| 6510/6510 [00:40<00:00, 162.02it/s]


### Save to json file

In [568]:
save_path = path.join(path.pardir, 'si-data.json')

In [569]:
with open(save_path, 'w') as f:
    json.dump(cloned_collection, f, sort_keys=True, indent=2)