In [1]:
#
# This tiny Python script scrapes the talendforge.org web site for a list of Talend components and their corresponding
# categories (families). The end goal is a hierarchy written to a JSON file to be input into a D3 data
# visualization; the cells shown here write the flat component list to a CSV file and download each component's icon.
#
# Data format we want to output:
# Note that only leaf nodes or edge nodes have a value.

import csv
import json
import os

import requests
from bs4 import BeautifulSoup
'''
Reference data from the web page's source to submit parameters for form request:

Versions:
<option value="255">8.0.1</option>
<option VALUE=254 >7.3.1</option>
<option VALUE=253 >7.2.1</option>
<option VALUE=252 >7.1.1</option>
<option VALUE=251 >7.0.1</option>
<option VALUE=250 >6.5.1</option>
<option VALUE=249 >6.4.1</option>

Editions:

<td colspan="2" id="tdEdition">
<option value="1">Talend Open Studio for Data Integration</option>
<option value="3">Talend Open Studio for Big Data</option> What happened to '2'?
<option value="4">Talend Open Studio for ESB</option>
<option value="5">Talend Data Integration</option>
<option value="6">Talend Big Data</option>
<option value="7">Talend ESB</option>
<option value="8">Talend Data Management Platform</option>
<option value="9">Talend Big Data Platform</option>
<option value="10">Talend Data Services Platform</option>
<option value="11">Talend Real-time Big Data Platform</option>
<option value="12">Talend MDM Platform</option>
<option value="13">Talend Data Fabric</option>
<option value="57">Talend Cloud Big Data</option>
<option value="58">Talend Cloud Big Data Platform</option>
<option value="59">Talend Cloud Data Integration</option>
<option value="60">Talend Cloud Data Management Platform</option>
<option value="61">Talend Cloud Real-Time Big Data Platform</option>
<option value="62">Talend Cloud API Services Platform</option>
<option value="63">Talend Cloud Data Fabric</option>
</td>
'''
# *** Change these based on version
# Form fields the web site needs in order to return the full component list.
# The numeric codes come from the <option> values of the page's version/edition drop-downs.
parameters = {
    'version':'255',
    'edition':'63',
    'ALL':'Show All'
}
#
# To find the above parameters:
# Open up Developer Tools in Chrome by selecting View > Developer > Developer Tools.
# Select the Network Tab.
# Visit the page you're going to do your search from.
# Click the Clear button and then submit your form.
# The Network tab will fill with activity!
# Find the thing in the Network tab that looks like the same name as your webpage. Click it. In this case it was index.php
# On the right-hand side you get a new pane. Scroll all the way down until you see
# Form Data. Those are the parameters I found for version, edition and ALL.
#
# Refer to: http://jonathansoma.com/lede/foundations-2017/classes/adv-scraping/advanced-scraping-form-submission/

# We are doing a POST here instead of the usual GET because we are scraping a form. It gets trickier if we have to
# submit headers.
# The timeout keeps the cell from hanging forever if the server never answers.
req = requests.post(
    "http://www.talendforge.org/components/index.php",
    data=parameters,
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page (which would yield zero components).
req.raise_for_status()

# If you are going to be testing a lot, then write out the file to be polite to the web server
# and observe web scraping etiquette. Your mother will love you for it.

# and the magic begins (lxml is the faster parser and recommended by BeautifulSoup. See docs.)
soup = BeautifulSoup(req.content, 'lxml')

# Start with all targeted divs, then descend down the hierarchy as appropriate.
# Viewing the source of the web page showed that we are interested in
# two classes: "family" (category) and "line_component".
divs = soup.find_all('div', class_=['family','line_component'])



In [2]:



# Running count of components written to the CSV (reported at the end of the cell).
i = 0

#
# Need to inspect the web page to test the following logic (in case the web page has changed).
# This logic was driven by the fact that the family div and the div where the component information
# resides are at the same level; no explicit parent:child relationship is defined, the relationship is
# implicit in the position or sequence within the web page, so the family category is decoupled from
# the component info.
#
# These two parameters rewrite the path of the image file taken from the 'src' attribute:
# every occurrence of substringFind in the path is replaced by substringReplace to build the CSV's img field.
substringFind = 'images/8.0.1'
substringReplace = 'img'

# Icons are saved into this folder; create it up front so the first download cannot fail on a missing directory.
os.makedirs('images', exist_ok=True)

# The `with` block guarantees the CSV is flushed and closed even if a div fails to parse.
# newline='' plus an explicit '\n' line terminator is what the csv module expects for files it writes to.
with open("XTalendComponentsV8-2023.csv", "w", encoding="utf-8", newline="") as f:
    # Header is written verbatim so existing consumers of the file see the same column names.
    f.write("name, img, description, helpReference\n")
    # csv.writer quotes fields containing commas, quotes or newlines, so such a description
    # can no longer spill into extra columns.
    writer = csv.writer(f, lineterminator="\n")

    # loop over all the divs returned from BeautifulSoup's find_all
    for div in divs:
        classname = div.get('class', [''])[0]
        if classname != 'line_component':
            continue  # family (category) divs carry no component data

        # Get the 'raw' string name of the component
        name = div.find(class_='name').find('a').string.strip()
        # Get the http href link to the help.talend.com web page
        helpReference = div.find('a')['href']
        # Get the path to the image/icon
        img = div.find('img', src=True)
        # Create new outputImg string based upon parameters specified above.
        outputImg = img['src'].replace(substringFind, substringReplace)

        # Get the description for tooltips in our data viz.
        # Sometimes the description element is missing or has no single text node; fall back to a placeholder.
        desc_tag = div.find(class_='description')
        desc_text = desc_tag.string if desc_tag is not None else None
        desc = desc_text.strip() if desc_text is not None else 'Description not available.'

        row = [name, outputImg, desc, helpReference]
        print(','.join(row))
        writer.writerow(row)

        # Download the icon and store it locally. This is slow, so an icon that is already on disk
        # (and not an empty leftover from an interrupted run) is not fetched again.
        filename = os.path.basename(img['src'])
        icon_path = os.path.join('images', filename)
        already_saved = os.path.isfile(icon_path) and os.path.getsize(icon_path) > 0
        if not already_saved:
            image = "https://talendforge.org/components/" + img['src']
            try:
                res = requests.get(image, timeout=30)
                res.raise_for_status()
            except requests.RequestException as err:
                # Best effort: a missing icon should not abort the whole scrape.
                print("Could not download icon", image, "-", err)
            else:
                # Only create the file once we actually have the bytes, so failures leave nothing behind.
                with open(icon_path, 'wb') as f2:
                    f2.write(res.content)

        i += 1  # component counter


# current stats are 1286 components!
print("Statistics: Number of components:\t", i)


Statistics: Number of components:	 0
