# SI Scraper

This notebook scrapes data from [Siamensis SI](http://www.siamensis.org/species_index).

## Scraping

- Get a data object from Siamensis SI.
- Loop through all child nodes recursively.
- Get `id`, `num_children` and `children_ids` from each node and store them as a list of dicts.
- Download the `html` page of each `id` into the `../node` folder.

## Parsing

- Loop through each item in the scraped list.
- Parse each `html` page and store the parsed data in a dict.
- Save the parsed data as `json`.


### Get Data Object

In [354]:
import requests
import json
import re
import pandas as pd
from os import path, pardir, mkdir
from bs4 import BeautifulSoup
from tqdm import tqdm
from glob import glob
from copy import deepcopy
from dateutil import parser
from datetime import datetime as dt
import html2text

In [2]:
# get whole tree from endpoint
# (a timeout keeps the cell from hanging forever if the server never answers)
r = requests.get('http://www.siamensis.org/json?type=tree', timeout=30)
# stop here on an HTTP error instead of failing later inside .json()
r.raise_for_status()
# get json from request: the root node is the first element of the
# first list in the response
si_json = r.json()[0][0]
# show keys of json obj
si_json.keys()

dict_keys(['data', 'attr', 'mlid', 'num_children', 'children'])

### Scraping

In [101]:
# collect the ids of all children in a list
def idGetter(children_ls):
    """Return the id of every child node in ``children_ls``.

    A child's id is the last ``/``-separated segment of its
    ``child['attr']['link']`` value. The order of the input is preserved.
    """
    return [entry['attr']['link'].split('/')[-1] for entry in children_ls]

# function to scrape data in the object recursively
def scraper(obj, keeper=None):
    """Flatten a tree node and all of its descendants into a list of dicts.

    Args:
        obj: a node dict. Its ``attr`` entry supplies the link whose last
            path segment becomes the node id; its ``children`` entry, when
            present, is a list of nodes of the same shape.
        keeper: list that collects the results. Leave it as ``None`` to start
            a fresh list; the recursion passes the same list down so that
            every node ends up in it.

    Returns:
        ``keeper``, holding one dict per node with ``id`` and, for nodes that
        have a ``children`` key, ``num_children`` and ``children_ids``.
        Descendants are appended before their parent.
    """
    # A default of ``[]`` would be created once and shared by every call, so
    # running the scrape twice would keep appending to the earlier result.
    if keeper is None:
        keeper = []

    # each item is stored in dict
    item_dict = dict()
    for key, value in obj.items():
        # take the last segment of the attr link as the node id
        if key == 'attr':
            item_dict['id'] = value['link'].split('/')[-1]
        # record the ids and the count of the direct children
        elif key == 'children':
            all_ids = idGetter(value)
            item_dict['num_children'] = len(all_ids)
            item_dict['children_ids'] = all_ids
            for item in value:
                # recurse into each child, collecting into the same keeper
                scraper(item, keeper)

    # store this node after its descendants, then hand the list back
    keeper.append(item_dict)
    return keeper

In [131]:
# flatten the whole tree into a list with one dict per node
extracted_node = scraper(si_json)

In [109]:
# create the ../node output folder (-p: no error if it already exists)
!mkdir -p ../node

In [110]:
# save all extracted_node as html per node
# this cell can take as long as 1.5 hrs, you can skip and use files in ../node
# ids of nodes whose page could not be downloaded
failed_nodes = []

# one Session reuses the HTTP connection instead of reconnecting per node
with requests.Session() as session:
    for node in tqdm(extracted_node):
        url = f'http://www.siamensis.org/species_index/node/{node["id"]}'
        save_path = f'../node/{node["id"]}.html'

        try:
            # the timeout stops one stalled request from blocking the whole run
            r = session.get(url, timeout=30)
            # do not store an error page as if it were the node's html
            r.raise_for_status()
        except requests.RequestException as err:
            failed_nodes.append(node["id"])
            # tqdm.write prints without breaking the progress bar
            tqdm.write(f'skipping node {node["id"]}: {err}')
            continue

        # NOTE: written with the platform default encoding, the same way the
        # parsing cell reads the files back
        with open(save_path, 'w') as f:
            f.write(r.text)

if failed_nodes:
    print(f'{len(failed_nodes)} node(s) failed to download: {failed_nodes}')

100%|██████████| 6510/6510 [1:29:47<00:00,  1.12s/it]


### Parsing

In [123]:
# paths of every .html file currently in ../node
nodes = glob('../node/*.html')

In [139]:
# deep-copy the scraped list; the parsing loop below iterates over this copy
copied_extracted_node = deepcopy(extracted_node)

In [399]:
# Exploratory parsing pass: parse each saved node page and print the
# HTML -> Markdown conversion of its content for the first 20 nodes that
# have any content.
counter = 0

# The patterns do not depend on the node, so compile them once up front.
# rank: leading Latin word of the title, followed by a colon
rank_pt = re.compile(r'^\s*([a-zA-Z]+)\s*\:')
# rank fallback for titles written without a colon
no_colon_rank_pt = re.compile(r'^\s*([a-zA-Z]+)\s*')
# rank name: everything after the colon
title_pt = re.compile(r':\s*([\(\)a-zA-Z \-,0-9\.]+)\s*$')
# rank name fallback: everything after the leading word and one space
no_colon_pt = re.compile(r'^\s*[a-zA-Z]+ ([\(\)a-zA-Z \-,0-9\.]+)\s*$')
# author sits between "เขียนโดย" and "เมื่อ"; the timestamp follows "เมื่อ"
author_pt = re.compile(r'.*เขียนโดย (.*) เมื่อ.*')
timestamp_pt = re.compile(r'.*เมื่อ (.*)$')

for each in copied_extracted_node:
    file_name = f"../node/{each['id']}.html"

    # close the file as soon as it is parsed instead of leaking one handle
    # per node
    with open(file_name, "r") as f:
        soup = BeautifulSoup(f, "html.parser")

    # rank and rank name
    tmp = soup.select('.node-title')[0].text

    rank_match = rank_pt.search(tmp) or no_colon_rank_pt.search(tmp)
    title_match = title_pt.search(tmp) or no_colon_pt.search(tmp)
    if rank_match is None or title_match is None:
        # unexpected title format: dump the node and stop to inspect it
        print(each)
        print(soup.prettify())
        break
    rank = rank_match.group(1)
    title = title_match.group(1)

    # author, timestamp and modified
    tmp = soup.select('.node-submitted')[0].text
    author = author_pt.search(tmp).group(1)
    timestamp = timestamp_pt.search(tmp).group(1)

    # content, example images
    # content is anything between </div> of <div class="node-header">
    # to before <div class="node-submitted">, this could be tricky
    # images are also included here but the full link is a thumbnail
    # full size image is available in the href of a wrapping img
    # and the link is relative

    # TODOs.
    # extract all content and filter images out
    # get only image uri in href and prepend with full link
    # content should be converted to MD and store in 'description'
    # images should be stored in 'images'

    tmp = soup.select('.node-header')[0].next_siblings
    content_parts = []

    for tag in tmp:
        # bare text between tags has no attrs; only real tags are kept
        attrs = getattr(tag, 'attrs', None)
        if attrs is None:
            continue

        if tag.name == 'div':
            classes = attrs.get('class') or []
            # the submitted-by footer marks the end of the content
            if classes and classes[0] == 'node-submitted':
                break
            # additional images
            if attrs.get('id') == 'jstree_thumb':
                break

        content_parts.append(tag.prettify())

    content = ''.join(content_parts)

    if content:
        print(content)
        cv = html2text.html2text(content)
        print(repr(cv))
        print(cv)
        print('\n---------\n')
        counter += 1

    # only look at the first 20 nodes that have content
    if counter >= 20:
        break

<p>
 โดนเมนอาร์เคีย (archaebacteria)
</p>

'โดนเมนอาร์เคีย (archaebacteria)\n\n'
โดนเมนอาร์เคีย (archaebacteria)



---------

<span style="color:#000080;">
 <strong>
  ชื่อวิทยาศาสตร์
 </strong>
</span>
<em>
 Oscillatoria limosa
</em>
<br/>
<span style="color:#000080;">
 <strong>
  บรรณานุกรม:
 </strong>
</span>
<ol>
 <li>
  ศูนย์วิจัยความหลากหลายทางชีวภาพ เฉลิมพระเกียรติ 72 พรรษา บรมราชินีนาถ. (2555).
  <span style="color:#000080;">
   <strong>
    สาหร่ายน้ำจืดในหุบเขาลำพญา
   </strong>
  </span>
  . สงขลา: มหาวิทยาลัยราชภัฏยะลา. 117 หน้า
 </li>
 <li>
  <span style="color:#000080;">
   <strong>
    Catalogue of Life: 2017 Annual Checklist
   </strong>
  </span>
 </li>
</ol>

'**ชื่อวิทยาศาสตร์** _Oscillatoria limosa_  \n**บรรณานุกรม:**\n\n  1. ศูนย์วิจัยความหลากหลายทางชีวภาพ เฉลิมพระเกียรติ 72 พรรษา บรมราชินีนาถ. (2555).  **สาหร่ายน้ำจืดในหุบเขาลำพญา** . สงขลา: มหาวิทยาลัยราชภัฏยะลา. 117 หน้า \n  2. **Catalogue of Life: 2017 Annual Checklist**\n\n'
**ชื่อวิทยาศาสตร์** _Oscillatoria 

In [281]:
# Thai month spellings (dotless abbreviation, full name, short name) mapped
# to the English abbreviation handed to the date parser.
_THAI_MONTHS = {
    thai: eng
    for eng, thai_names in {
        'Jan': ['มค', 'มกราคม', 'มกรา'],
        'Feb': ['กพ', 'กุมภาพันธ์', 'กุมภา'],
        'Mar': ['มีค', 'มีนาคม', 'มีนา'],
        'Apr': ['เมย', 'เมษายน', 'เมษา'],
        'May': ['พค', 'พฤษภาคม', 'พฤษภา'],
        'Jun': ['มิย', 'มิถุนายน', 'มิถุนา'],
        'Jul': ['กค', 'กรกฎาคม', 'กรกฎา'],
        'Aug': ['สค', 'สิงหาคม', 'สิงหา'],
        'Sep': ['กย', 'กันยายน', 'กันยา'],
        'Oct': ['ตค', 'ตุลาคม', 'ตุลา'],
        'Nov': ['พย', 'พฤศจิกายน', 'พฤศจิกา'],
        'Dec': ['ธค', 'ธันวาคม', 'ธันวา'],
    }.items()
    for thai in thai_names
}


def siDatetimeParser(date_str: str) -> dt:
    """Parse a date string that is either English or Thai.

    A string containing any Latin letter is passed to the date parser as is.
    Otherwise it is read as whitespace-separated ``day month year`` with a
    Thai month name; anything after the third part is ignored.

    Args:
        date_str: the date text to parse.

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: if a Thai date has fewer than three parts, its month is
            not a known Thai month spelling, or its year is not an integer.
    """
    # if date str is in English just send to normal parser
    # (checks the argument itself, not a global left over from another cell)
    if re.search(r'[a-zA-Z]', date_str):
        return parser.parse(date_str)

    # if not, split on whitespace and parse as a Thai date
    date_tuple = tuple(date_str.split())
    if len(date_tuple) < 3:
        raise ValueError(f'cannot parse Thai date: {date_str!r}')

    # strip possible artifacts like dots from abbreviated months
    month_strip = date_tuple[1].replace('.', '')
    year_strip = date_tuple[2]

    # fail loudly on an unknown month instead of hitting an unbound variable
    try:
        mc = _THAI_MONTHS[month_strip]
    except KeyError:
        raise ValueError(
            f'unknown Thai month {month_strip!r} in {date_str!r}'
        ) from None

    # roughly check if this is really buddhist calendar: a year later than
    # the current one is taken to be a Buddhist-era year (543 years ahead),
    # otherwise the input year is used unchanged
    if int(year_strip) > dt.now().year:
        yc = str(int(year_strip) - 543)
    else:
        yc = year_strip

    return parser.parse(' '.join([date_tuple[0], mc, yc]), dayfirst=True)
    