In [28]:
#
# This tiny python script scrapes the talendforge.org web site for a list of Talend components and their corresponding
# categories (families) and creates a hierarchy and writes the hierarchy to various json and csv files to be input into a D3 data
# visualization.
# This notebook consists of three parts:
# Cell 1 scrapes the web site and loads the div data structure
# Cell 2 iterates over the divs and extracts component names and the families they belong too
# Pending; not implemented -> Cell 3 unfortunately there are duplicate component names so this cell reads in the component names
#        csv file and de-duplicates the component names file
#
# Data format we want to output:
# Note that only leaf nodes or edge nodes have a value.

#id,value
#flare,
#flare.analytics,
#flare.analytics.cluster,
#flare.analytics.cluster.AgglomerativeCluster,3938
#flare.analytics.cluster.CommunityStructure,3812
#flare.analytics.cluster.HierarchicalCluster,6714
#flare.analytics.cluster.MergeEdge,743
#flare.analytics.graph,
#flare.analytics.graph.BetweennessCentrality,3534
#flare.analytics.graph.LinkDistance,5731
#flare.analytics.graph.MaxFlowMinCut,7840
#flare.analytics.graph.ShortestPaths,5914
#flare.analytics.graph.SpanningTree,3416
#flare.analytics.optimization,
#flare.analytics.optimization.AspectRatioBanker,7074

import os
import requests
from bs4 import BeautifulSoup
import json 

# need these parameters to get correct response from web site
parameters = {
    'version':'255',
    'edition':'13',
    'ALL':'Show All'
}
#
# To find the above parameters:
# Open up Developer Tools in Chrome by selecting View > Developer > Developer Tools.
# Select the Network Tab.
# Visit the page you’re going to do your search from.
# Click the Clear button and then submit your form.
# The Network tab will fill with activity!
# Find the thing in the Network tab that looks like the same name as your webpage. Click it. In this case it was index.php
# On the right-hand side you get a new pane. Scroll down, keep scrolling, yep, all the way down until you see
# Form Data. Those are the parameters I found for version, edition and ALL.
#
# Refer to: http://jonathansoma.com/lede/foundations-2017/classes/adv-scraping/advanced-scraping-form-submission/

# we are doing a post here instead of the usual get because we are scraping a form. It gets trickier if we have to 
# submit headers.
#req = requests.post("http://www.talendforge.org/components/index.php", data = parameters)
req = requests.post("http://www.talendforge.org/components/index.php?version=255&edition=63&showAll=1")


# If you are going to be testing a lot, then write out the file to be polite to the web server 
# and observe web scraping etiquette. Your mother will love you for it.

# and the magic begins (lxml is the faster parser and recommended by BeautifulSoup. See docs.)
soup = BeautifulSoup(req.content, 'lxml')
#print(soup)
# start with all targeted divs, then descend down the hierarchy
# as appropriate. We found out via viewing the source of the web page that we are interested in 
# two classes: "family" (category) and "line_component" (name of the component)
# In the 2024 version of the web site this has changed so this find_all no longer functions appropriately
# divs = soup.find_all('div', class_=['family','line_component'])
tds = soup.find_all('td')
print(tds)


[<td colspan="9"><strong>AWS</strong></td>, <td>
<img alt="Image" class="img-thumbnail" src="/images/8.0.1/cAWSConnection_icon32.png"/>
</td>, <td>
<a href="https://help.talend.com/access/sources/content/topic?pageid=cawsconnection&amp;EnrichVersion=8.0&amp;afs:lang=en">
                                cAWSConnection
                            </a>
</td>, <td>
                            Establishes a connection to Amazon Web Services for data storage and retrieval.
                        </td>, <td>
<img alt="Delete" src="/images/delete.jpg"/>
</td>, <td>
<img alt="Delete" src="/images/delete.jpg"/>
</td>, <td>
<img alt="Delete" src="/images/delete.jpg"/>
</td>, <td>
<img alt="Delete" src="/images/delete.jpg"/>
</td>, <td>
<img alt="Delete" src="/images/delete.jpg"/>
</td>, <td>
<img alt="Apply" src="/images/apply.jpg"/>
</td>, <td>
<img alt="Image" class="img-thumbnail" src="/images/8.0.1/cAWSS3_icon32.png"/>
</td>, <td>
<a href="https://help.talend.com/access/sources/content/topic

In [30]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   Components
  </title>
  <!-- Include Bootstrap CSS (you need to have Bootstrap installed in your project) -->
  <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet"/>
 </head>
 <body>
  <div class="container mt-5">
   <h1>
    Components
   </h1>
   <form method="get">
    <div class="form-group">
     <label for="versionDropdown">
      Select Version:
     </label>
     <select class="form-control" id="versionDropdown" name="version">
      <option data-id="255" value="255">
       8.0.1
      </option>
      <option data-id="254" value="254">
       7.3.1
      </option>
      <option data-id="253" value="253">
       7.2.1
      </option>
      <option data-id="252" value="252">
       7.1.1
      </option>
      <option data-id="251" value="251">
       7.0.1
      </option>
      <option data-id="250" value="250">
       6.5.1
      </option>
      <option data-

In [29]:
cats = soup.find_all('strong')
print(cats)


[<strong>AWS</strong>, <strong>Big Data</strong>, <strong>Business</strong>, <strong>Business Intelligence</strong>, <strong>Business_Intelligence</strong>, <strong>Cloud</strong>, <strong>Connectivity</strong>, <strong>Core</strong>, <strong>Custom</strong>, <strong>Custom_Code</strong>, <strong>Data Quality</strong>, <strong>Databases</strong>, <strong>Databases NoSQL</strong>, <strong>Data_Privacy</strong>, <strong>Data_Quality</strong>, <strong>Deprecated</strong>, <strong>DotNET</strong>, <strong>ElasticSearch</strong>, <strong>ELT</strong>, <strong>ESB</strong>, <strong>Exception Handling</strong>, <strong>File</strong>, <strong>Internet</strong>, <strong>Logs_Errors</strong>, <strong>Machine Learning</strong>, <strong>Messaging</strong>, <strong>Misc</strong>, <strong>Miscellaneous</strong>, <strong>Orchestration</strong>, <strong>Processing</strong>, <strong>Routing</strong>, <strong>Storage</strong>, <strong>System</strong>, <strong>Talend</strong>, <strong>Talend Cloud</stron

In [13]:
# Matchlist creates another level or two if there are multiple matches. 
# Examining the data leads us to conclude that the max number of levels is 3 but the data viz shows up to 
# four levels and it gets cut off by the viewport. (fixed in 2023)
# Data Engineering note: These entries do not have corresponding icons in the gathered data
# so we delete them manually for now:
"""
Talend_Studio_8.Cloud.Azure.tAzureStorageInputTable,221
Talend_Studio_8.Cloud.Azure.tAzureStorageOutputTable,223
Talend_Studio_8.Databases.SAP.tSAPHanaUnload,702
Talend_Studio_8.Talend MDM,
Talend_Studio_8.Talend MDM.tMDMBulkLoad,1306
Talend_Studio_8.Talend MDM.tMDMClose,1307
Talend_Studio_8.Talend MDM.tMDMCommit,1308
Talend_Studio_8.Talend MDM.tMDMConnection,1309
Talend_Studio_8.Talend MDM.tMDMDelete,1310
Talend_Studio_8.Talend MDM.tMDMInput,1311
Talend_Studio_8.Talend MDM.tMDMOutput,1312
Talend_Studio_8.Talend MDM.tMDMReceive,1313
Talend_Studio_8.Talend MDM.tMDMRestInput,1314
Talend_Studio_8.Talend MDM.tMDMRollback,1315
Talend_Studio_8.Talend MDM.tMDMRouteRecord,1316
Talend_Studio_8.Talend MDM.tMDMSP,1317
Talend_Studio_8.Talend MDM.tMDMTriggerInput,1318
Talend_Studio_8.Talend MDM.tMDMTriggerOutput,1319
Talend_Studio_8.Talend MDM.tMDMViewSearch,1320
"""
def matchparent(input):
    # matchlist was crafted in a manual and iterative fashion based upon looking at the data. The categories
    # are logical albeit contrived. Beware some artifacts may have been introduced but the total number of component
    # names scraped from the web site was not altered or duplicated unless there was not a corresponding icon file.
    # Handled in post-processing.
    matchlist = ["Amazon","Cassandra","DynamoDB","HBase","HDFS","Hive","Impala","MapR","MarkLogic",
             "MongoDB","Neo4j","Riak","Sqoop","Marketo","Netsuite","Salesforce","SAP",
             "ServiceNow","MSSql","Netezza","Oracle","Palo","Postgres","Splunk","Sybase",
             "Teradata", "Aurora","Redshift","Azure","Box","Cosmos","Dropbox",
             "Google","S3","Snowflake","Java","Access","EXA","Firebird","Greenplum","Informix",
             "Ingres","Interbase","Kudu","ParAccel","Couchbase","JBoss","Kafka","Microsoft", "Mysql",
             "Vertica", "VectorWise", "SQLite", "SQLDWH", "LDAP", "JDBC", "AS400", "DB2", "HSQLD",
             "EXist", "SQS", "FileInput", "FileOutput", "FileStream", "Uniserve",
             "Match", "Address"]
    list = []
    for item in matchlist:
        if item in input:
            list.append(item)
    
    return list


# family and line_component counters for data viz parameters
i = j = 0
# create main dictionary and initialize top element
data_fabric = []
#data_fabric['id'] = "Talend Data Fabric"
#data_fabric['children'] = []

#
# need to inspect web page to test the following logic (in case web page has changed).
# this logic was driven by the fact that the family div and the div where the component information
# resided were at the same level; no explict parent:child relationship defined but the relationship was 
# implicit based upon position or sequence in the web page so the family category was decoupled from
# the component info.
#
# initialize first family value; this is our trigger to add family and component children to the dictionary
# and start a new list of components
oldfamily = ""
#create dictionaries to store component levels
matches = {}
matchesL2 = {}
#open output file (should check for errors but we are hacking here)
#this is the main data file that is read in for the component tree visualization
f = open("XTalendComponentsV8-2023.csv","w+")
    
out = "id, value\nTalend Studio 8,\n"
#out = "id, value\nTalend Data Fabric,\n"
f.write(out)
#print("id, value\nTalend_Data_Fabric,")
#using underscores because D3 / DOM doesn't like spaces (used to be Talend Studio 8)
#parsed and replaced with spaces in Javascript code so they don't show up in the component tree
root = "Talend_Studio_8."
#root = "Talend Data Fabric."
outputString = ""
matchitems = [] # a list of matched items
descriptionList = {}
iconList = {} # added this list to output the entire name of the icon file as some icons are tagged with generic_component
# icon files. Older logic created the filename on the fly and this failed so we output the file name from the scrapage.

# seen is for actual list of component names (de-duped)
seen = []
# loop over all the divs returned from BeautifulSoup's find_all
# remember the family and the component name are at the same level in the DOM so we have to create a hierarchy
for div in divs: #iterate over the divs
    classname = div.get('class',[''])[0] # stolen from stackoverflow 8)
    if classname == 'family':
        family = div.text.strip()
        # family = family.replace(" ", "_") # not sure if D3 can handle spaces. Yep it can.
        if (family != oldfamily): #indicates we are at another family
            oldfamily = family
        outputString = family
        out = root+outputString + ",\n"
        f.write(out)
        # print(root+outputString + ",")
        j+=1 #family counter
    elif classname =='line_component': # edge node
        # this may be overkill but its Sunday morning and I want to finish this
        name = div.find(class_='name').find('a').string.strip()
        if name in seen:
            print("family, duplicate name", family, name)
        else:
            # maintain a list of actual component names for the sake of veracity 8)
            seen.append(name)
        # We are getting the component icon and description for tooltips in our data viz.
        img = div.find('img', src=True)
        #print("img: ", img['src'].split("images/8.0.1/")[1]) #get the icon filename; needs to change if the path is differnent
        iconList[name] = img['src'].split("images/8.0.1/")[1]
        desc = div.find(class_='description')
        # the following are research notes for the logic for no description in a div (bad data)
        # check for value in description (or else it is NoneType)
        # NoneType = type(None)
        #if isinstance(type(desc), NoneType):
        #if desc is not None:
        #
        # some of divs don't have a description in them so we check for this (len == 0). If we didn't then crash.
        if len( desc.get_text ( strip = True )) == 0:
            #print("No description")
            desc = "No description available."
        else:
            #print(i)
            desc = desc.string.strip()

        descriptionList[name] = desc
        #print("Name, img, Desc = ", name, img['src'], desc)
       '''
        # the following code works but (and should be put in a function) but we only do this once
        filename = os.path.basename(img['src'])
        #
        image = "https://talendforge.org/components/" + img['src']
        #
        # get icon file and write it locally; this takes awhile so we only do it once
        # Note: manually create the images directory first
        with open('img/'+filename, 'wb') as f2:
            res = requests.get(image)
            f2.write(res.content)
            f2.close()
        '''
        # the following logic further categorizes the components to make the data viz graphic more readable
        matchlist = matchparent(name) # do we need to create another level?
        if matchlist:                 # found a match, create a new level.
            matcheditem = family + matchlist[0]
            if (not(matcheditem in matches)): # create another level below 'family'
                matches[matcheditem] = matcheditem
                out = root+family + "."+ matchlist[0] +",\n" # kludgy
                f.write(out)
                #print(root+family + "."+ matchlist[0] +",")
            if(len(matchlist) == 2): # create yet another level below 'level1'; no more, max reached.
                checkitem = family+matchlist[0]+matchlist[1]
                if (not(checkitem in matchesL2)): 
                    matchesL2[checkitem] = checkitem
                    out = root+family + "."+ matchlist[0] + "." + matchlist[1] + ",\n"
                    f.write(out)
                    #print(root+family + "."+ matchlist[0] + "." + matchlist[1] + ",")
                
                out = root+family + "."+ matchlist[0] + "."+ matchlist[1] + "." + name +"," + str(i+j) + "\n"
                f.write(out)
                #print(root+family + "."+ matchlist[0] + "."+ matchlist[1] + "." + name +",",i+j)
            else:
                out = root+family + "."+ matchlist[0] + "." + name +"," + str(i+j) + "\n"
                f.write(out)
                #print(root+family + "."+ matchlist[0] + "." + name +",",i+j) 
        else: # edge node
            out = root+family + "." + name + "," + str(i+j) + "\n"
            f.write(out)
            #print(root+family + "." + name + ",",i+j)
        outputString = ""

        i+=1 # component counter
f.close()
print('end of divs...')
#print (descriptionList)
json_string = json.dumps(descriptionList)
f = open("XComponentDescriptionsV8-2024.json","w")
f.write(json_string)
f.close()
json_string = json.dumps(iconList)
f = open("XComponentIconsV8-2024.json","w")
f.write(json_string)
f.close()
# output description list in the following format:
# component Name: component Description
    
# current stats are 43 families and 1399 components!
print("Statistics:\n Number of families/categories:\t", j,"\n Number of scraped components:\t",i,"\n Number of actual components:\t", len(seen))

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 132)