In [1]:
#
# This tiny python script scrapes the talendforge.org web site for a list of Talend components and their corresponding
# categories (families) and creates a csv file in the following format:
# Family, Component Name, Data Integration, Map/Reduce, Spark Batch, Spark Streaming, Storm, Camel
# The last six are flags: Yes or No
#
import os
import requests
from bs4 import BeautifulSoup

'''
Reference data from the web page's source:

Versions:
<option value="255">8.0.1</option>
<option VALUE=254 >7.3.1</option>
<option VALUE=253 >7.2.1</option>
<option VALUE=252 >7.1.1</option>
<option VALUE=251 >7.0.1</option>
<option VALUE=250 >6.5.1</option>
<option VALUE=249 >6.4.1</option>

Editions:

<td colspan="2" id="tdEdition">
<option value="1">Talend Open Studio for Data Integration</option>
<option value="3">Talend Open Studio for Big Data</option> What happened to '2'?
<option value="4">Talend Open Studio for ESB</option>
<option value="5">Talend Data Integration</option>
<option value="6">Talend Big Data</option>
<option value="7">Talend ESB</option>
<option value="8">Talend Data Management Platform</option>
<option value="9">Talend Big Data Platform</option>
<option value="10">Talend Data Services Platform</option>
<option value="11">Talend Real-time Big Data Platform</option>
<option value="12">Talend MDM Platform</option>
<option value="13">Talend Data Fabric</option>
<option value="57">Talend Cloud Big Data</option>
<option value="58">Talend Cloud Big Data Platform</option>
<option value="59">Talend Cloud Data Integration</option>
<option value="60">Talend Cloud Data Management Platform</option>
<option value="61">Talend Cloud Real-Time Big Data Platform</option>
<option value="62">Talend Cloud API Services Platform</option>
<option value="63">Talend Cloud Data Fabric</option>
</td>
'''
# need this to get correct response from web site; and they change over time so check the page source!
# Form fields posted to the component index page. Per the reference data
# above, version '253' is 7.2.1 and edition '8' is Talend Data Management
# Platform; 'ALL' is the value of the "Show All" submit button.
parameters = dict(
    version='253',
    edition='8',
    ALL='Show All',
)
#
# To find the above parameters:
# Open up Developer Tools in Chrome by selecting View > Developer > Developer Tools.
# Select the Network Tab.
# Visit the page you're going to do your search from.
# Click the Clear button and then submit your form.
# The Network tab will fill with activity!
# Find the thing in the Network tab that looks like the same name as your webpage. Click it. In this case it was index.php
# On the right-hand side you get a new pane. Scroll down, keep scrolling, yep, all the way down until you see
# Form Data. Those are the parameters I found for version, edition and ALL.
#
# Refer to: http://jonathansoma.com/lede/foundations-2017/classes/adv-scraping/advanced-scraping-form-submission/

# we are doing a post here instead of the usual get because we are scraping a form. It gets trickier if we have to 
# submit headers.
# Submit the form. Without an explicit timeout, requests waits indefinitely
# on an unresponsive server, so bound the wait.
req = requests.post(
    "http://www.talendforge.org/components/index.php",
    data=parameters,
    timeout=30,
)
# Stop on an HTTP error status instead of parsing an error page, which would
# otherwise yield a CSV containing only the header row.
req.raise_for_status()

# If you are going to be testing a lot, then write out the file to be polite to the web server
# and observe web scraping etiquette. Your mother will love you for it.

# and the magic begins (lxml is the faster parser and recommended by BeautifulSoup. See docs.)
soup = BeautifulSoup(req.content, 'lxml')

# Start with all targeted divs, then descend down the hierarchy as
# appropriate. Viewing the page source showed that two classes matter:
# "family" (category) and "line_component". Passing a list to class_ matches
# a div carrying either class, and results come back in document order.
divs = soup.find_all('div', class_=['family', 'line_component'])

# family and line_component counters
# Family and line_component counters, both starting at zero.
i, j = 0, 0

#
# Re-inspect the web page before trusting the parsing logic (the page may
# have changed). The family div and the divs holding the component
# information sit at the same level: there is no explicit parent:child
# relationship, only an implicit one based on their sequence in the page,
# so the family category is decoupled from the component info.
#
# Most recently seen family name; empty until the first family div is read.
oldfamily = ""

#
# Platform flags are derived from the img tag shown for each platform:
# apply.jpg means the platform applies (flag "Yes"), delete.jpg means it
# does not (flag "No").
#
matchpattern2 = "components/apply.jpg"
matchpattern = "components/delete.jpg"

#
# Output file name. 721 is the version and DMP is the edition being scraped,
# so rename the file to match whatever version/edition you request.
# (could automate this...nah.)
#
filename = "TalendComponents_v721DMP.csv"

# Write the CSV inside a context manager so the output file is flushed and
# closed even if parsing raises part-way through the page.
with open(filename, "w", encoding="utf-8") as f:
    out = "Family, Component Name, Data Integration, Map/Reduce, Spark Batch, Spark Streaming, Storm, Camel\n"
    # write out the header
    f.write(out)

    # A component div that appears before any family div is written with an
    # empty family instead of failing on an unbound name.
    family = ""

    #
    # loop over all the divs returned from BeautifulSoup's find_all
    #
    for div in divs:
        # First CSS class of the div; '' when the div has no class attribute.
        classname = div.get('class', [''])[0]
        if classname == 'family':
            # Family divs precede their components in page order, so remember
            # the name for every line_component row that follows.
            family = div.text.strip()
            oldfamily = family
            j = j + 1  # family counter
        elif classname == 'line_component':
            name = div.find(class_='name').find('a').string.strip()

            # Within the 'line_component' context, grab the versions class
            # and all of its img tags (based on viewing the page source).
            imgs = div.find(class_='versions').find_all('img')
            listofimgs = [str(img) for img in imgs]

            # The header has six platform columns; report which component
            # is short rather than failing on a bare list index.
            if len(listofimgs) < 6:
                raise IndexError(
                    f"component {name!r}: expected 6 platform images, "
                    f"found {len(listofimgs)}"
                )

            # The images appear on the page in the reverse of the header
            # order, so walk the first six backwards. Every image yields
            # exactly one flag so the columns can never shift.
            listofflags = []
            for markup in reversed(listofimgs[:6]):
                if matchpattern in markup:
                    listofflags.append("No")
                elif matchpattern2 in markup:
                    listofflags.append("Yes")
                else:
                    # Neither delete.jpg nor apply.jpg: keep the column.
                    listofflags.append("Unknown")

            print(family, name, *listofflags, sep=", ", file=f)


Authentication, tSetKeystore, No, No, No, No, No, No
Big Data, tBigQueryBulkExec, Yes, No, No, No, No, No
Big Data, tBigQueryInput, Yes, No, No, No, No, No
Big Data, tBigQueryOutput, Yes, No, No, No, No, No
Big Data, tBigQueryOutputBulk, Yes, No, No, No, No, No
Big Data, tBigQuerySQLRow, Yes, No, No, No, No, No
Big Data, tGSBucketCreate, Yes, No, No, No, No, No
Big Data, tGSBucketDelete, Yes, No, No, No, No, No
Big Data, tGSBucketExist, Yes, No, No, No, No, No
Big Data, tGSBucketList, Yes, No, No, No, No, No
Big Data, tGSClose, Yes, No, No, No, No, No
Big Data, tGSConnection, Yes, No, No, No, No, No
Big Data, tGSCopy, Yes, No, No, No, No, No
Big Data, tGSDelete, Yes, No, No, No, No, No
Big Data, tGSGet, Yes, No, No, No, No, No
Big Data, tGSList, Yes, No, No, No, No, No
Big Data, tGSPut, Yes, No, No, No, No, No
Big Data, tHiveClose, Yes, No, No, No, No, No
Big Data, tHiveConnection, Yes, No, No, No, No, No
Big Data, tHiveRow, Yes, No, No, No, No, No
Business, tBonitaDeploy, Yes, No, No,

Cloud, tGSList, Yes, No, No, No, No, No
Cloud, tGSPut, Yes, No, No, No, No, No
Cloud, tMarketoBulkExec, Yes, No, No, No, No, No
Cloud, tMarketoCampaign, Yes, No, No, No, No, No
Cloud, tMarketoConnection, Yes, No, No, No, No, No
Cloud, tMarketoInput, Yes, No, No, No, No, No
Cloud, tMarketoListOperation, Yes, No, No, No, No, No
Cloud, tMarketoOutput, Yes, No, No, No, No, No
Cloud, tNetsuiteInput, Yes, No, No, No, No, No
Cloud, tNetsuiteOutput, Yes, No, No, No, No, No
Cloud, tRedshiftBulkExec, Yes, No, No, No, No, No
Cloud, tRedshiftClose, Yes, No, No, No, No, No
Cloud, tRedshiftCommit, Yes, No, No, No, No, No
Cloud, tRedshiftConnection, Yes, No, No, No, No, No
Cloud, tRedshiftInput, Yes, No, No, No, No, No
Cloud, tRedshiftOutput, Yes, No, No, No, No, No
Cloud, tRedshiftOutputBulk, Yes, No, No, No, No, No
Cloud, tRedshiftOutputBulkExec, Yes, No, No, No, No, No
Cloud, tRedshiftRollback, Yes, No, No, No, No, No
Cloud, tRedshiftRow, Yes, No, No, No, No, No
Cloud, tRedshiftUnload, Yes, No, No

Databases, tAmazonOracleClose, Yes, No, No, No, No, No
Databases, tAmazonOracleCommit, Yes, No, No, No, No, No
Databases, tAmazonOracleConnection, Yes, No, No, No, No, No
Databases, tAmazonOracleInput, Yes, No, No, No, No, No
Databases, tAmazonOracleOutput, Yes, No, No, No, No, No
Databases, tAmazonOracleRollback, Yes, No, No, No, No, No
Databases, tAmazonOracleRow, Yes, No, No, No, No, No
Databases, tAmazonRedshiftManage, Yes, No, No, No, No, No
Databases, tAS400CDC, Yes, No, No, No, No, No
Databases, tAS400Close, Yes, No, No, No, No, No
Databases, tAS400Commit, Yes, No, No, No, No, No
Databases, tAS400Connection, Yes, No, No, No, No, No
Databases, tAS400Input, Yes, No, No, No, No, No
Databases, tAS400LastInsertId, Yes, No, No, No, No, No
Databases, tAS400Output, Yes, No, No, No, No, No
Databases, tAS400Rollback, Yes, No, No, No, No, No
Databases, tAS400Row, Yes, No, No, No, No, No
Databases, tBigQueryInput, No, No, No, No, No, No
Databases, tBigQueryOutput, No, No, No, No, No, No
Dat

Databases, tNetezzaInput, Yes, No, No, No, No, No
Databases, tNetezzaNzLoad, Yes, No, No, No, No, No
Databases, tNetezzaOutput, Yes, No, No, No, No, No
Databases, tNetezzaRollback, Yes, No, No, No, No, No
Databases, tNetezzaRow, Yes, No, No, No, No, No
Databases, tNetezzaSCD, Yes, No, No, No, No, No
Databases, tOleDbInput, Yes, No, No, No, No, No
Databases, tOleDbOutput, Yes, No, No, No, No, No
Databases, tOleDbRow, Yes, No, No, No, No, No
Databases, tOracleBulkExec, Yes, No, No, No, No, No
Databases, tOracleCDC, Yes, No, No, No, No, No
Databases, tOracleCDCOutput, Yes, No, No, No, No, No
Databases, tOracleClose, Yes, No, No, No, No, No
Databases, tOracleCommit, Yes, No, No, No, No, No
Databases, tOracleConnection, Yes, No, No, No, No, No
Databases, tOracleInput, Yes, No, No, No, No, No
Databases, tOracleInvalidRows, Yes, No, No, No, No, No
Databases, tOracleOutput, Yes, No, No, No, No, No
Databases, tOracleOutputBulk, Yes, No, No, No, No, No
Databases, tOracleOutputBulkExec, Yes, No, 

ELT, tFirebirdConnection, Yes, No, No, No, No, No
ELT, tGreenplumConnection, Yes, No, No, No, No, No
ELT, tHiveConnection, Yes, No, No, No, No, No
ELT, tIngresConnection, Yes, No, No, No, No, No
ELT, tInterbaseConnection, Yes, No, No, No, No, No
ELT, tJDBCConnection, Yes, No, No, No, No, No
ELT, tMSSqlConnection, Yes, No, No, No, No, No
ELT, tMysqlConnection, Yes, No, No, No, No, No
ELT, tNetezzaConnection, Yes, No, No, No, No, No
ELT, tOracleConnection, Yes, No, No, No, No, No
ELT, tParAccelConnection, Yes, No, No, No, No, No
ELT, tPostgresPlusConnection, Yes, No, No, No, No, No
ELT, tPostgresqlConnection, Yes, No, No, No, No, No
ELT, tSQLiteConnection, Yes, No, No, No, No, No
ELT, tSQLTemplate, Yes, No, No, No, No, No
ELT, tSQLTemplateAggregate, Yes, No, No, No, No, No
ELT, tSQLTemplateCommit, Yes, No, No, No, No, No
ELT, tSQLTemplateFilterColumns, Yes, No, No, No, No, No
ELT, tSQLTemplateFilterRows, Yes, No, No, No, No, No
ELT, tSQLTemplateMerge, Yes, No, No, No, No, No
ELT, tSQLTem

Processing, tExtractEDIField, Yes, No, No, No, No, No
Processing, tExtractJSONFields, Yes, No, No, No, No, No
Processing, tExtractPositionalFields, Yes, No, No, No, No, No
Processing, tExtractRegexFields, Yes, No, No, No, No, No
Processing, tExtractXMLField, Yes, No, No, No, No, No
Processing, tFilterColumns, Yes, No, No, No, No, No
Processing, tFilterRow, Yes, No, No, No, No, No
Processing, tHMap, Yes, No, No, No, No, No
Processing, tJoin, Yes, No, No, No, No, No
Processing, tMap, Yes, No, No, No, No, No
Processing, tNormalize, Yes, No, No, No, No, No
Processing, tReplace, Yes, No, No, No, No, No
Processing, tReplicate, No, No, No, No, No, No
Processing, tRules, Yes, No, No, No, No, No
Processing, tSampleRow, Yes, No, No, No, No, No
Processing, tSortRow, Yes, No, No, No, No, No
Processing, tSplitRow, Yes, No, No, No, No, No
Processing, tSurviveFields, Yes, No, No, No, No, No
Processing, tUniqRow, No, No, No, No, No, No
Processing, tWriteDynamicFields, Yes, No, No, No, No, No
Processin