In [17]:
#
# This tiny python script scrapes the talendforge.org web site for a list of Talend components and their corresponding
# categories (families) and creates a hierarchy and writes the hierarchy to a json file to be input into an Observable
# notebook for visualizing the data.
#
import requests
from bs4 import BeautifulSoup
import json 
# need this to get correct response from web site; and they change over time!
parameters = {
    'version':'254',
    'edition':'13',
    'ALL':'Show All'
}
#
# To find the above parameters:
# Open up Developer Tools in Chrome by selecting View > Developer > Developer Tools.
# Select the Network Tab.
# Visit the page you’re going to do your search from.
# Click the Clear button and then submit your form.
# The Network tab will fill with activity!
# Find the thing in the Network tab that looks like the same name as your webpage. Click it. In this case it was index.php
# On the right-hand side you get a new pane. Scroll down, keep scrolling, yep, all the way down until you see
# Form Data. Those are the parameters I found for version, edition and ALL.
#
# Refer to: http://jonathansoma.com/lede/foundations-2017/classes/adv-scraping/advanced-scraping-form-submission/

# we are doing a post here instead of the usual get because we are scraping a form. It gets trickier if we have to 
# submit headers.
req = requests.post("http://www.talendforge.org/components/index.php", data = parameters)

# If you are going to be testing a lot, then write out the file to be polite to the web server 
# and observe web scraping etiquette. Your mother will love you for it.

# and the magic begins (lxml is the faster parser and recommended by BeautifulSoup. See docs.)
soup = BeautifulSoup(req.content, 'lxml')

# start with all targeted divs, then descend down the hierarchy
# as appropriate. We found out via viewing the source of the web page that we are interested in 
# two classe: "family" (category) and "line_component"
divs = soup.find_all('div', class_=['family','line_component'])

# family and line_component counters for data viz parameters
i = j = 0
# create main dictionary and initialize top element
data_fabric = {}
data_fabric['name'] = "Talend Data Fabric"
data_fabric['children'] = []

#
# need to inspect web page to test the following logic (in case web page has changed).
# this logic was driven by the fact that the family div and the div where the component information
# resided were at the same level; no explict parent:child relationship defined but the relationship was 
# implicit based upon position or sequence in the web page so the family category was decoupled from
# the component info.
#
# initialize first family value; this is our trigger to add family and component children to dictionary
# and start a new list of components
oldfamily = ""
#
#create list to store components
#
listofcomponents = []
#
# loop over all the divs returned from BeautifulSoup's find_all
#
for div in divs:
    classname = div.get('class',[''])[0] #stolen from stackoverflow 8)
    if classname == 'family':
        family = div.text.strip()
        if (family != oldfamily): #indicates we are at another level
            data_fabric['children'].append({'name':oldfamily,'children':listofcomponents})
            oldfamily = family
            print("Components List", family,"::", listofcomponents)
        # reset list
            listofcomponents = []
        j = j + 1 #family counter
    elif classname =='line_component':
        # this may be overkill but its Sunday morning and I want to finish this
        name = div.find(class_='name').find('a').string.strip()
    
        #.string().strip()
        # construct list of components
        listofcomponents.append({'name':name,'value':i})

#        print("\t", i, family, name)
        i = i + 1 # component counter

    
#filter out empty strings


#filter_object = filter(lambda x: x != "", data_fabric)

#clean_data_fabric = list(filter_object)
#output file
with open('data_fabric2.json', 'w') as outfile:
   json.dump(data_fabric, outfile, indent=1)
# need this info for Observablehq work (data viz)    
print("Statistics:\n Number of families:\t", j,"\n Number of components:\t", i)


Components List Authentication :: []
Components List AWS :: [{'name': 'tSetKeystore', 'value': 0}]
Components List Big Data :: [{'name': 'cAWSConnection', 'value': 1}, {'name': 'cAWSS3', 'value': 2}, {'name': 'cAWSSES', 'value': 3}, {'name': 'cAWSSNS', 'value': 4}, {'name': 'cAWSSQS', 'value': 5}]
Components List Business :: [{'name': 'tBigQueryBulkExec', 'value': 6}, {'name': 'tBigQueryInput', 'value': 7}, {'name': 'tBigQueryOutput', 'value': 8}, {'name': 'tBigQueryOutputBulk', 'value': 9}, {'name': 'tBigQuerySQLRow', 'value': 10}, {'name': 'tCassandraBulkExec', 'value': 11}, {'name': 'tCassandraClose', 'value': 12}, {'name': 'tCassandraConnection', 'value': 13}, {'name': 'tCassandraInput', 'value': 14}, {'name': 'tCassandraOutput', 'value': 15}, {'name': 'tCassandraOutputBulk', 'value': 16}, {'name': 'tCassandraOutputBulkExec', 'value': 17}, {'name': 'tCassandraRow', 'value': 18}, {'name': 'tDBFSConnection', 'value': 19}, {'name': 'tDBFSGet', 'value': 20}, {'name': 'tDBFSPut', 'value