In [1]:
#
# This tiny python script scrapes the talendforge.org web site for a list of Talend components and their corresponding
# categories (families) and creates a hierarchy and writes the hierarchy to a json file to be input into an Observable
# notebook for visualizing the data.
#
import requests
from bs4 import BeautifulSoup
import json 
# need this to get correct response from web site
parameters = {
    'version':'254',
    'edition':'13',
    'ALL':'Show All'
}
#
# To find the above parameters:
# Open up Developer Tools in Chrome by selecting View > Developer > Developer Tools.
# Select the Network Tab.
# Visit the page you’re going to do your search from.
# Click the Clear button and then submit your form.
# The Network tab will fill with activity!
# Find the thing in the Network tab that looks like the same name as your webpage. Click it. In this case it was index.php
# On the right-hand side you get a new pane. Scroll down, keep scrolling, yep, all the way down until you see
# Form Data. Those are the parameters I found for version, edition and ALL.
#
# Refer to: http://jonathansoma.com/lede/foundations-2017/classes/adv-scraping/advanced-scraping-form-submission/

# we are doing a post here instead of the usual get because we are scraping a form. It gets trickier if we have to 
# submit headers.
req = requests.post("http://www.talendforge.org/components/index.php", data = parameters)

# If you are going to be testing a lot, then write out the file to be polite to the web server 
# and observe web scraping etiquette. Your mother will love you for it.

# and the magic begins (lxml is the faster parser and recommended by BeautifulSoup. See docs.)
soup = BeautifulSoup(req.content, 'lxml')

# start with all targeted divs, then descend down the hierarchy
# as appropriate. We found out via viewing the source of the web page that we are interested in 
# two classe: "family" (category) and "line_component"
divs = soup.find_all('div', class_=['family','line_component'])

# family and line_component counters for data viz parameters
i = j = 0
# create main dictionary and initialize top element
data_fabric = {}
data_fabric['name'] = "Talend Data Fabric"
data_fabric['children'] = []

#
# need to inspect web page to test the following logic (in case web page has changed).
# this logic was driven by the fact that the family div and the div where the component information
# resided were at the same level; no explict parent:child relationship defined but the relationship was 
# implicit based upon position or sequence in the web page so the family category was decoupled from
# the component info.
#
# initialize first family value; this is our trigger to add family and component children to dictionary
# and start a new list of components
oldfamily = ""
#create list to store components
listofcomponents = []
# loop over all the divs returned from BeautifulSoup's find_all
for div in divs:
    classname = div.get('class',[''])[0] #stolen from stackoverflow 8)
    if classname == 'family':
        family = div.text.strip()
        if (family != oldfamily): #indicates we are at another level
            data_fabric['children'].append({'name':oldfamily,'children':listofcomponents})
            oldfamily = family
            print("Components List", family,"::", listofcomponents)
        # reset list
            listofcomponents = []
        j = j + 1 #family counter
    elif classname =='line_component':
        # this may be overkill but its Sunday morning and I want to finish this
        name = div.find(class_='name').find('a').string.strip()
        # construct list of components
        listofcomponents.append({'name':name,'value':i})

        print("\t", i, family, name)
        i = i + 1 # component counter
#output file
with open('data_fabric.json', 'w') as outfile:
    json.dump(data_fabric, outfile, indent=1)
# need this info for Observablehq work (data viz)    
print("Statistics:\n Number of families:\t", j,"\n Number of components:\t", i)


Components List Authentication :: []
	 0 Authentication tSetKeystore
Components List AWS :: [{'name': 'tSetKeystore', 'value': 0}]
	 1 AWS cAWSConnection
	 2 AWS cAWSS3
	 3 AWS cAWSSES
	 4 AWS cAWSSNS
	 5 AWS cAWSSQS
Components List Big Data :: [{'name': 'cAWSConnection', 'value': 1}, {'name': 'cAWSS3', 'value': 2}, {'name': 'cAWSSES', 'value': 3}, {'name': 'cAWSSNS', 'value': 4}, {'name': 'cAWSSQS', 'value': 5}]
	 6 Big Data tBigQueryBulkExec
	 7 Big Data tBigQueryInput
	 8 Big Data tBigQueryOutput
	 9 Big Data tBigQueryOutputBulk
	 10 Big Data tBigQuerySQLRow
	 11 Big Data tCassandraBulkExec
	 12 Big Data tCassandraClose
	 13 Big Data tCassandraConnection
	 14 Big Data tCassandraInput
	 15 Big Data tCassandraOutput
	 16 Big Data tCassandraOutputBulk
	 17 Big Data tCassandraOutputBulkExec
	 18 Big Data tCassandraRow
	 19 Big Data tDBFSConnection
	 20 Big Data tDBFSGet
	 21 Big Data tDBFSPut
	 22 Big Data tDynamoDBInput
	 23 Big Data tDynamoDBOutput
	 24 Big Data tGSBucketCreate
	 25 B

	 484 Databases tAmazonMysqlClose
	 485 Databases tAmazonMysqlCommit
	 486 Databases tAmazonMysqlConnection
	 487 Databases tAmazonMysqlInput
	 488 Databases tAmazonMysqlOutput
	 489 Databases tAmazonMysqlRollback
	 490 Databases tAmazonMysqlRow
	 491 Databases tAmazonOracleClose
	 492 Databases tAmazonOracleCommit
	 493 Databases tAmazonOracleConnection
	 494 Databases tAmazonOracleInput
	 495 Databases tAmazonOracleOutput
	 496 Databases tAmazonOracleRollback
	 497 Databases tAmazonOracleRow
	 498 Databases tAmazonRedshiftManage
	 499 Databases tAS400CDC
	 500 Databases tAS400Close
	 501 Databases tAS400Commit
	 502 Databases tAS400Connection
	 503 Databases tAS400Input
	 504 Databases tAS400LastInsertId
	 505 Databases tAS400Output
	 506 Databases tAS400Rollback
	 507 Databases tAS400Row
	 508 Databases tBigQueryConfiguration
	 509 Databases tBigQueryInput
	 510 Databases tBigQueryOutput
	 511 Databases tCassandraConfiguration
	 512 Databases tCassandraInput
	 513 Databases tCassand

	 859 Databases tVectorWiseRow
	 860 Databases tVerticaBulkExec
	 861 Databases tVerticaClose
	 862 Databases tVerticaCommit
	 863 Databases tVerticaConnection
	 864 Databases tVerticaInput
	 865 Databases tVerticaOutput
	 866 Databases tVerticaOutputBulk
	 867 Databases tVerticaOutputBulkExec
	 868 Databases tVerticaRollback
	 869 Databases tVerticaRow
	 870 Databases tVerticaSCD
Components List Databases NoSQL :: [{'name': 'tAccessBulkExec', 'value': 468}, {'name': 'tAccessClose', 'value': 469}, {'name': 'tAccessCommit', 'value': 470}, {'name': 'tAccessConnection', 'value': 471}, {'name': 'tAccessInput', 'value': 472}, {'name': 'tAccessOutput', 'value': 473}, {'name': 'tAccessOutputBulk', 'value': 474}, {'name': 'tAccessOutputBulkExec', 'value': 475}, {'name': 'tAccessRollback', 'value': 476}, {'name': 'tAccessRow', 'value': 477}, {'name': 'tAmazonAuroraClose', 'value': 478}, {'name': 'tAmazonAuroraCommit', 'value': 479}, {'name': 'tAmazonAuroraConnection', 'value': 480}, {'name': 't

	 1289 Processing tMap
	 1290 Processing tNormalize
	 1291 Processing tPartition
	 1292 Processing tReplace
	 1293 Processing tReplicate
	 1294 Processing tRules
	 1295 Processing tSample
	 1296 Processing tSampleRow
	 1297 Processing tSortRow
	 1298 Processing tSplitRow
	 1299 Processing tSqlRow
	 1300 Processing tSurviveFields
	 1301 Processing tTop
	 1302 Processing tTopBy
	 1303 Processing tUniqRow
	 1304 Processing tWindow
	 1305 Processing tWriteAvroFields
	 1306 Processing tWriteDelimitedFields
	 1307 Processing tWriteDynamicFields
	 1308 Processing tWriteJSONField
	 1309 Processing tWritePositionalFields
	 1310 Processing tWriteXMLFields
	 1311 Processing tXMLMap
Components List Routing :: [{'name': 'tAggregateRow', 'value': 1265}, {'name': 'tAggregateSortedRow', 'value': 1266}, {'name': 'tBRMS', 'value': 1267}, {'name': 'tCacheIn', 'value': 1268}, {'name': 'tCacheOut', 'value': 1269}, {'name': 'tConvertType', 'value': 1270}, {'name': 'tDenormalize', 'value': 1271}, {'name': 't