In [9]:
#
# This tiny python script scrapes the talendforge.org web site for a list of Talend components and their corresponding
# categories (families) and creates a hierarchy and writes the hierarchy to a json file to be input into a D3 data
# visualization.
#
# Data format we want to output:
# Note that only leaf nodes or edge nodes have a value.

#id,value
#flare,
#flare.analytics,
#flare.analytics.cluster,
#flare.analytics.cluster.AgglomerativeCluster,3938
#flare.analytics.cluster.CommunityStructure,3812
#flare.analytics.cluster.HierarchicalCluster,6714
#flare.analytics.cluster.MergeEdge,743
#flare.analytics.graph,
#flare.analytics.graph.BetweennessCentrality,3534
#flare.analytics.graph.LinkDistance,5731
#flare.analytics.graph.MaxFlowMinCut,7840
#flare.analytics.graph.ShortestPaths,5914
#flare.analytics.graph.SpanningTree,3416
#flare.analytics.optimization,
#flare.analytics.optimization.AspectRatioBanker,7074

import os
import requests
from bs4 import BeautifulSoup
import json 

# need this to get correct response from web site
parameters = {
    'version':'254',
    'edition':'13',
    'ALL':'Show All'
}
#
# To find the above parameters:
# Open up Developer Tools in Chrome by selecting View > Developer > Developer Tools.
# Select the Network Tab.
# Visit the page you’re going to do your search from.
# Click the Clear button and then submit your form.
# The Network tab will fill with activity!
# Find the thing in the Network tab that looks like the same name as your webpage. Click it. In this case it was index.php
# On the right-hand side you get a new pane. Scroll down, keep scrolling, yep, all the way down until you see
# Form Data. Those are the parameters I found for version, edition and ALL.
#
# Refer to: http://jonathansoma.com/lede/foundations-2017/classes/adv-scraping/advanced-scraping-form-submission/

# we are doing a post here instead of the usual get because we are scraping a form. It gets trickier if we have to 
# submit headers.
req = requests.post("http://www.talendforge.org/components/index.php", data = parameters)

# If you are going to be testing a lot, then write out the file to be polite to the web server 
# and observe web scraping etiquette. Your mother will love you for it.

# and the magic begins (lxml is the faster parser and recommended by BeautifulSoup. See docs.)
soup = BeautifulSoup(req.content, 'lxml')

# start with all targeted divs, then descend down the hierarchy
# as appropriate. We found out via viewing the source of the web page that we are interested in 
# two classes: "family" (category) and "line_component"
divs = soup.find_all('div', class_=['family','line_component'])



In [21]:
# Matchlist creates another level or two if there are multiple matches. 
# Examining the data leads us to conclude that the max number of levels is 2.
def matchparent(input): 
    matchlist = ["Amazon","Cassandra","DynamoDB","HBase","HDFS","Hive","Impala","MapR","MarkLogic",
             "MongoDB","Neo4j","Riak","Sqoop","Marketo","Netsuite","Salesforce","SAP",
             "ServiceNow","MSSql","Netezza","Oracle","Palo","Postgres","Splunk","Sybase",
             "Teradata", "Aurora","Redshift","Azure","Box","Cosmos","Dropbox",
             "Google","S3","Snowflake","Java","Access","EXA","Firebird","Greenplum","Informix",
             "Ingres","Interbase","Kudu","ParAccel","Couchbase","JBoss","Kafka","Microsoft", "Mysql",
             "Vertica", "VectorWise", "SQLite", "SQLDWH", "LDAP", "JDBC", "AS400", "DB2", "HSQLD",
             "EXist", "SQS", "FileInput", "FileOutput", "FileStream", "Uniserve",
             "Match", "Address"]
    list = []
    for item in matchlist:
        if item in input:
            list.append(item)
    
    return list


# family and line_component counters for data viz parameters
i = j = 0
# create main dictionary and initialize top element
data_fabric = []
#data_fabric['id'] = "Talend Data Fabric"
#data_fabric['children'] = []

#
# need to inspect web page to test the following logic (in case web page has changed).
# this logic was driven by the fact that the family div and the div where the component information
# resided were at the same level; no explict parent:child relationship defined but the relationship was 
# implicit based upon position or sequence in the web page so the family category was decoupled from
# the component info.
#
# initialize first family value; this is our trigger to add family and component children to dictionary
# and start a new list of components
oldfamily = ""
#create dictionaries to store component levels
matches = {}
matchesL2 = {}
#open file (should check for errors but we are hacking here)
f = open("TalendComponentsV8.csv","w+")
    
out = "id, value\nTalend Data Fabric,\n"
f.write(out)
#print("id, value\nTalend_Data_Fabric,")
root = "Talend Data Fabric."
outputString = ""
matchitems = [] # a list of matched items
descriptionList = {}
# loop over all the divs returned from BeautifulSoup's find_all
for div in divs: #iterate over the divs
    classname = div.get('class',[''])[0] # stolen from stackoverflow 8)
    if classname == 'family':
        family = div.text.strip()
        # family = family.replace(" ", "_") # not sure if D3 can handle spaces. Yep it can.
        if (family != oldfamily): #indicates we are at another family
            oldfamily = family
        outputString = family
        out = root+outputString + ",\n"
        f.write(out)
        # print(root+outputString + ",")
        j+=1 #family counter
    elif classname =='line_component': # edge node
        # this may be overkill but its Sunday morning and I want to finish this
        name = div.find(class_='name').find('a').string.strip()
        # We are getting the component icon and description for tooltips in our data viz.
        img = div.find('img', src=True)
        desc = div.find(class_='description').string.strip()
        descriptionList[name] = desc
        print("Name, img, Desc = ", name, img['src'], desc)
        
        # the following code works but (and should be put in a function) but we only do this once
        #filename = os.path.basename(img['src'])
        #
        #image = "https://talendforge.org/components/" + img['src']
        #
        # get icon file and write it locally; this takes awhile so we only do it once
        #
        # with open('images/'+filename, 'wb') as f2:
        #    res = requests.get(image)
        #    f2.write(res.content)
        #    f2.close()

        # the following logic further categorizes the components to make the data viz graphic more readable
        matchlist = matchparent(name) # do we need to create another level?
        if matchlist:                 # found a match, create a new level.
            matcheditem = family + matchlist[0]
            if (not(matcheditem in matches)): # create another level below 'family'
                matches[matcheditem] = matcheditem
                out = root+family + "."+ matchlist[0] +",\n" # kludgy
                f.write(out)
                #print(root+family + "."+ matchlist[0] +",")
            if(len(matchlist) == 2): # create yet another level below 'level1'; no more, max reached.
                checkitem = family+matchlist[0]+matchlist[1]
                if (not(checkitem in matchesL2)): 
                    matchesL2[checkitem] = checkitem
                    out = root+family + "."+ matchlist[0] + "." + matchlist[1] + ",\n"
                    f.write(out)
                    #print(root+family + "."+ matchlist[0] + "." + matchlist[1] + ",")
                
                out = root+family + "."+ matchlist[0] + "."+ matchlist[1] + "." + name +"," + str(i+j) + "\n"
                f.write(out)
                #print(root+family + "."+ matchlist[0] + "."+ matchlist[1] + "." + name +",",i+j)
            else:
                out = root+family + "."+ matchlist[0] + "." + name +"," + str(i+j) + "\n"
                f.write(out)
                #print(root+family + "."+ matchlist[0] + "." + name +",",i+j) 
        else: # edge node
            out = root+family + "." + name + "," + str(i+j) + "\n"
            f.write(out)
            #print(root+family + "." + name + ",",i+j)
        outputString = ""

        i+=1 # component counter
f.close()
#print (descriptionList)
json = json.dumps(descriptionList)
f = open("ComponentDescriptionsV8.json","w")
f.write(json)
f.close()
# output description list in the following format:
# component Name: component Description
    
# current stats are 43 families and 1399 components!
print("Statistics:\n Number of families:\t", j,"\n Number of components:\t", i)


Name, img, Desc =  tSetKeystore images/7.3.1/tSetKeystore_icon32.png Sets the authentication data type between PKCS 12 and JKS.
Name, img, Desc =  cAWSConnection images/7.3.1/cAWSConnection_icon32.png Establishes a connection to Amazon Web Services for data storage and retrieval.
Name, img, Desc =  cAWSS3 images/7.3.1/cAWSS3_icon32.png Stores and retrieves objects from/to Amazon's Simple Storage Service (S3)
Name, img, Desc =  cAWSSES images/7.3.1/cAWSSES_icon32.png Sends emails with Amazon's Simple Email Service (SES).
Name, img, Desc =  cAWSSNS images/7.3.1/cAWSSNS_icon32.png Sends messages to an Amazon's Simple Notification topic.
Name, img, Desc =  cAWSSQS images/7.3.1/cAWSSQS_icon32.png Sends and receives messages to/from Amazon's Simple Queue Service (SQS). The AWS SQS FIFO Feature for Queues are supported.
Name, img, Desc =  tBigQueryBulkExec images/7.3.1/tBigQueryBulkExec_icon32.png Transfers given data to Google BigQuery.
Name, img, Desc =  tBigQueryInput images/7.3.1/tBigQuer

Name, img, Desc =  cMail images/7.3.1/cMail_icon32.png Sends or receives mails in a Route.
Name, img, Desc =  cMQConnectionFactory images/7.3.1/cMQConnectionFactory_icon32.png Encapsulates a set of configuration parameters to connect to a MQ server. The connection can be called by multiple cJMS, cWMQ, cAMQP or cMQTT components in a Route.
Name, img, Desc =  cMQTT images/7.3.1/cMQTT_icon32.png Sends messages to, or consumes messages from MQTT compliant message brokers.
Name, img, Desc =  cREST images/7.3.1/cREST_icon32.png Provides integration with Apache CXF for connecting to JAX-RS services.
Name, img, Desc =  cSOAP images/7.3.1/cSOAP_icon32.png Provides integration with Apache CXF for connecting to JAX-WS services.
Name, img, Desc =  cWMQ images/7.3.1/cWMQ_icon32.png Exchanges messages between a Route and a JMS provider using WMQ.
Name, img, Desc =  cDirect images/7.3.1/cDirect_icon32.png Produces and consumes messages synchronously in different threads within a single CamelContext.


Name, img, Desc =  tMysqlSCDELT images/7.3.1/tMysqlSCDELT_icon32.png Reflects and tracks changes in a dedicated MySQL SCD table through SQL queries.
Name, img, Desc =  tMysqlSP images/7.3.1/tMysqlSP_icon32.png Calls a MySQL database stored procedure.
Name, img, Desc =  tMysqlTableList images/7.3.1/tMysqlTableList_icon32.png Lists the names of a given set of Mysql tables using a select statement based on a Where clause.
Name, img, Desc =  tMySQLValidRows images/7.3.1/tMySQLValidRows_icon32.png Checks MySQL database rows against Data Quality patterns (regular expression).
Name, img, Desc =  tNetezzaBulkExec images/7.3.1/tNetezzaBulkExec_icon32.png Offers gains in performance while carrying out the Insert operations to a Netezza database.
Name, img, Desc =  tNetezzaClose images/7.3.1/tNetezzaClose_icon32.png Closes the transaction committed in the connected Netazza database.
Name, img, Desc =  tNetezzaCommit images/7.3.1/tNetezzaCommit_icon32.png Validates the data processed through the J

Name, img, Desc =  tHDFSProperties images/7.3.1/tHDFSProperties_icon32.png Creates a single row flow that displays the properties of a file processed in HDFS.
Name, img, Desc =  tHDFSPut images/7.3.1/tHDFSPut_icon32.png Connects to Hadoop distributed file system to load large-scale files into it with optimized performance.
Name, img, Desc =  tHDFSRename images/7.3.1/tHDFSRename_icon32.png Renames the selected files or specified directory on HDFS.
Name, img, Desc =  tHDFSRowCount images/7.3.1/tHDFSRowCount_icon32.png Reads a file in HDFS row by row in order to determine the number of rows this file contains.
Name, img, Desc =  tNamedPipeClose images/7.3.1/tNamedPipeClose_icon32.png Closes a named-pipe at the end of a process.
Name, img, Desc =  tNamedPipeOpen images/7.3.1/tNamedPipeOpen_icon32.png Opens a named-pipe for writing data into it.
Name, img, Desc =  tNamedPipeOutput images/7.3.1/tNamedPipeOutput_icon32.png Writes data into an existing open named-pipe.
Name, img, Desc =  tPivo

AttributeError: 'str' object has no attribute 'dumps'