# SI Scraper

This notebook scrapes data from [Siamensis SI](http://www.siamensis.org/species_index). It then saves the data to `full_si_data.json` for easier access with Python Pandas.

## Steps

- Get the tree data object from Siamensis SI.
- Recursively walk through all child nodes.
- Scrape the data of each node and append it to a list.
- Create a pandas DataFrame from the list.
- Save the DataFrame to a `.json` file.

In [2]:
import requests
import json
import re
import pandas as pd
from os import path, pardir
from bs4 import BeautifulSoup

In [6]:
# get whole tree from endpoint; requests has no default timeout, so set one
# to keep this cell from hanging forever if the server never answers
r = requests.get('http://www.siamensis.org/json?type=tree', timeout=30)
# stop here with a clear HTTP error instead of a confusing JSON/index error
# further down when the server answers with an error page
r.raise_for_status()
# the decoded payload is a nested list; the tree object used by the rest of
# the notebook is the first element of its first element
si_json = r.json()[0][0]
# show keys of json obj
si_json.keys()

dict_keys(['data', 'attr', 'mlid', 'num_children', 'children'])

In [7]:
def rmHTMLTag(string):
    """Return ``string`` cleaned up for use as a plain-text node label.

    Three substitutions are applied, in this order:

    1. HTML tags are removed, together with at most one whitespace character
       directly after the tag. Only tags made of letters, whitespace, ``/``,
       ``"``, ``=`` and ``-`` are recognised; anything else (for example a
       tag name containing a digit) is left untouched.
    2. A number in parentheses (the children count), including the whitespace
       in front of it, is removed.
    3. Whitespace in front of a colon is removed.
    """
    # (pattern, replacement) pairs; the order matters because removing a tag
    # can bring a counter or a colon next to the preceding text
    cleanup_steps = (
        (r'<[/\sa-zA-Z\"=-]+>\s?', ''),
        (r'\s*?\([0-9]+\)', ''),
        (r'\s+:', ':'),
    )
    cleaned = string
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def getNodeData(node_id, new_id, timeout=30):
    """Scrape a single species-index node page and return its data.

    Args:
        node_id: Id inserted into the page URL
            ``http://www.siamensis.org/species_index/node/<node_id>``.
        new_id: Value stored under the ``'id'`` key of the returned dict.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys:

        * ``'id'``: ``new_id``, unchanged.
        * ``'title'``: text of the first ``<h2>``, or ``None`` if there is none.
        * ``'header_img'``: ``src`` of the first ``<img>`` with the class
          ``imagecache-jstree_header``, or ``None`` if there is none.
        * ``'description'``: every ``<p>`` element, serialised as an HTML string.
        * ``'all_imgs'``: ``src`` of every tag that has one, except those whose
          ``src`` contains ``jstree_header``.
        * ``'author'``: text of the first ``<div>`` with the class
          ``node-submitted``, with ``'Authenticated user '`` and trailing
          whitespace removed, or ``None`` if there is no such div.
        * ``'editors'``: text of each ``<li>`` inside the first ``<ul>``, with
          whitespace runs collapsed to one space; empty if there is no ``<ul>``.

    Raises:
        requests.exceptions.RequestException: if the request fails, times out
            or the server answers with an HTTP error status.
    """
    # make request
    cont = requests.get(
        'http://www.siamensis.org/species_index/node/{0}'.format(node_id),
        timeout=timeout,
    )
    # an error page must not be parsed and stored as if it were node data
    cont.raise_for_status()
    soup = BeautifulSoup(cont.text, 'html.parser')

    # each lookup below tolerates a missing element, so one unusual page does
    # not abort a crawl over thousands of nodes
    title = str(soup.h2.text) if soup.h2 is not None else None

    # a single find() instead of running the same find_all() twice
    header_tag = soup.find('img', 'imagecache-jstree_header')
    header_img = header_tag.get('src') if header_tag is not None else None

    description = [str(paragraph) for paragraph in soup.find_all('p')]

    # src=True selects only the tags that carry a src attribute
    all_imgs = [tag['src'] for tag in soup.find_all(src=True)
                if 'jstree_header' not in tag['src']]

    submitted = soup.find('div', 'node-submitted')
    if submitted is None:
        author = None
    else:
        author = re.sub(r'Authenticated user |\s+$', '', submitted.get_text())

    first_list = soup.ul
    if first_list is None:
        editors = []
    else:
        editors = [re.sub(r'\s+', ' ', item.get_text())
                   for item in first_list.find_all('li')]

    # pack in dict
    return {
        'id': new_id,
        'title': title,
        'header_img': header_img,
        'description': description,
        'all_imgs': all_imgs,
        'author': author,
        'editors': editors,
    }

def walkThroughSITree(jsonObj, higher_order, parent_id, my_id):
    """Scrape a tree node and, recursively, all of its descendants.

    The data returned by ``getNodeData`` for each visited node is appended to
    the module-level list ``scraped_tree``, which must exist before the first
    call. Nodes are visited depth-first, parents before their children, and a
    progress line is printed after each node.

    Args:
        jsonObj: A node of the tree. ``jsonObj['attr']['link']`` must end with
            the numeric id of the node page and ``jsonObj['data']`` holds the
            (HTML) label of the node.
        higher_order: Indentation prefix handed down from the parent. It is
            only passed on, one space longer, to the children and does not
            influence the scraped data.
        parent_id: New id of the parent node; ``''`` for the root.
        my_id: Position of this node among its siblings.
    """
    itemId = jsonObj['attr']['link'].split('/')[-1]
    label = rmHTMLTag(jsonObj['data'])

    # new id = "<parent id>-<first character of the label><position>";
    # label[:1] instead of label[0] so that an empty label does not raise
    this_new_id = "{0}-{1}{2}".format(parent_id, label[:1], my_id)
    # the root has an empty parent id, which leaves a leading dash: remove it
    this_new_id = re.sub(r'^-', '', this_new_id)

    scraped_tree.append(getNodeData(int(itemId), this_new_id))
    print("{0} is saved to list..".format(this_new_id))

    # descend into the children, numbering them by their position; a node that
    # announces 'num_children' but carries no 'children' list is treated as a
    # leaf instead of raising KeyError
    if 'num_children' in jsonObj:
        for position, child in enumerate(jsonObj.get('children') or []):
            walkThroughSITree(child, ' ' + higher_order, this_new_id, position)
            

In [None]:
# module-level list that walkThroughSITree() fills with one dict per node
scraped_tree = []

# start the recursive walk at the top of the tree
#   jsonObj      : the si tree json obj
#   higher_order : prefix space inherited from the parent level (printing only)
#   parent_id    : chained ids of the parent, empty for the top node
#   my_id        : id of this node as given by its parent
walkThroughSITree(jsonObj=si_json, higher_order='', parent_id='', my_id='0')

In [24]:
# pack scraped tree into pandas dataframe: one row per scraped node, one
# column per key of the node dicts (a list of dicts is accepted directly, so
# there is no need to wrap every record in its own Series first)
full_si = pd.DataFrame(scraped_tree)

In [25]:
# write the scraped records to full_si_data.json in the parent directory,
# using the 'records' orientation
full_si.to_json(
    path_or_buf=path.join(pardir, 'full_si_data.json'),
    orient='records',
)

In [26]:
# read the saved file back into a dataframe to check what was written
loadedtest = pd.read_json(
    path_or_buf=path.join(pardir, 'full_si_data.json'),
)

In [27]:
# count the non-null values in each column of the reloaded data
loadedtest.count()

all_imgs       6494
author         6494
description    6494
editors        6494
header_img     3024
id             6494
title          6494
dtype: int64

In [28]:
# number of rows whose all_imgs entry has a length other than 0;
# all_imgs was filled by getNodeData() with every src on the page that does
# not contain 'jstree_header', i.e. everything except the header image
loadedtest.all_imgs[loadedtest.all_imgs.str.len() != 0].count()

3692