In [1]:
# Python frameworks imports
import pandas as pd
import json
import requests
from lxml import html

In [2]:
## All I do here is to load the “html_content“ variable with the html content of the first - it works

# Extracting the page content and parsing the json content

url = "https://developer.salesforce.com/docs/get_document/atlas.en-us.api.meta"
response = requests.get(url)
parsed = json.loads(response.content)

# Getting the list of the Standard Objects dictionnaries

std_obj_list = parsed['toc'][1]['children'][1]['children']

# Creating the list of all the Standard Objects url and a list of all the get_content pages

url_list = []
getcontent_url_list = []

for i in range(len(std_obj_list)):
    url_end = std_obj_list[i]['a_attr']['href']
    
    url = 'https://developer.salesforce.com/docs/atlas.en-us.api.meta/api/'+url_end
    url_list.append(url)
    
    getcontent_url = 'https://developer.salesforce.com/docs/get_document_content/api/'+url_end+'/en-us/228.0'
    getcontent_url_list.append(getcontent_url)

In [32]:
error_url = []
df_objects = pd.DataFrame(columns=['ObjectName', 'Summary', 'Supported Calls','Special Access Rules','Usage','Associated Objects','Additional Considerations and Related Objects'])

for url in getcontent_url_list:
    # Try operator to avoid IndexError that'd raise if an object page doesn't render
    try :
        gc_content = requests.get(url)
        json_content = json.loads(gc_content.content)
        html_content = json_content['content']

        ## Cleaning the html code
        # I removed the span[@class='keyword parmname'] because I wouldn't get “Comment“ in the Description of the Field Name “Response“
        # I removed the <samp class="codeph nolang"> because I wouldn't get the Supported Calls otherwise
        html_content_1 = html_content.replace('<span class="keyword parmname">', "")
        html_content_2 = html_content_1.replace('</span>', "")
        html_content_3 = html_content_2.replace('\n            ', " ")
        html_content_4 = html_content_3.replace('<samp class="codeph nolang">','')
        html_content_5 = html_content_4.replace('</samp>','')
        html_content_6 = html_content_5.replace('\n   ','')
        html_content_7 = html_content_6.replace('       ','')
        html_content_8 = html_content_7.replace('\t','')
        html_content_9 = html_content_8.replace('\n',' ')
        html_content_clean = html_content_9.replace('\n           ',' ')


        # Importing lxml and creating a tree
        from lxml import html
        tree = html.fromstring(html_content_clean)

        # Extracting the name of the standard object
        object_name_list = tree.xpath('//span[@id="topic-title"]/text()')
        # Trying different xpaths to match all patterns found in the html code
        if len(object_name_list) > 0:
            object_name = object_name_list[0]
        else: 
            object_name_list = tree.xpath('//h1[@class="helpHead1"]/text()')
            object_name = object_name_list[0]

        # Extracting the description of the standard object
        summary_list = tree.xpath('//span[@id="summary"]/text()')
        # Trying different xpaths to match all patterns found in the html code
        if len(summary_list) > 0:
            summary = summary_list[0]
        else:
            summary_list = tree.xpath('//span[@id="summary"]/span/text()')
            summary = summary_list[0]

        # Extracting the Supported Calls of the standard object
        supported_calls_list = tree.xpath('//div[@class="section" and ./h2/text()="Supported Calls"]/p/text()')
        # Trying different xpaths to match all patterns found in the html code
        if len(supported_calls_list)>0: 
            supported_calls = supported_calls_list[0]
        else: supported_calls = None


        # Extracting the Special Access Rules of the standard object
        special_access_rules_list = tree.xpath('//div[@class="section" and ./h2/text()="Special Access Rules"]/p/text()')
        # Trying different xpaths to match all patterns found in the html code
        if len(special_access_rules_list)>0: 
            special_access_rules = special_access_rules_list[0]
        else: 
            special_access_rules_list = tree.xpath('//div[@class="section" and ./h2/text()="Special Access\nRules"]/p/text()')
            if len(special_access_rules_list)>0:
                special_access_rules = special_access_rules_list[0]
            else: special_access_rules = None

        # Extracting the Usage of the standard object
        usage_list = tree.xpath('//div[@class="section" and ./h2/text()="Usage"]/p/text()')
        if len(usage_list)>0: 
            usage = usage_list[0]
        else: 
            usage_list = tree.xpath('//div[@class="section" and ./h2/text()=" Usage"]/p/text()')
            if len(usage_list)>0:
                usage = usage_list[0]
            else : 
                usage_list = tree.xpath('//div[@class="section" and ./h2/text()="Usage"]/div/text()')
                if len(usage_list)>0:
                    usage = usage_list[0]
                else : usage = None
            

        # Extracting the Associated Objects of the standard object
        associated_objects_list = tree.xpath('//div[@class="section" and ./h2/text()="Associated Objects"]/p/text()')
        if len(associated_objects_list)>0: 
            associated_objects = associated_objects_list[0]
        else: associated_objects = None

        # Extracting the Additional Considerations and Related Objects of the standard object
        additional_considerations_list = tree.xpath('//div[@class="section" and ./h2/text()="Additional Considerations and Related Objects"]/p/text()')
        if len(additional_considerations_list)>0: 
            additional_considerations = additional_considerations_list[0]
        else: additional_considerations = None

        # Creating a dictionnary with all the scraped elements
        info_dict = {'ObjectName':object_name,'Summary':summary,'Supported Calls':supported_calls,'Special Access Rules':special_access_rules,'Usage':usage,'Associated Objects':associated_objects,'Additional Considerations and Related Objects':additional_considerations}
        # Adding a new row to the “Objects dataframe“. 
        df_objects = df_objects.append(info_dict, ignore_index=True)
       
    # Keeping all the error url to come back to them later
    except IndexError :
        error_url.append(url)
        

In [33]:
print(df_objects)

                 ObjectName  \
0     AcceptedEventRelation   
1                   Account   
2              AccountBrand   
3    AccountContactRelation   
4         AccountCleanInfo    
..                      ...   
884      WorkRewardFundType   
885             WorkThanks    
886                WorkType   
887           WorkTypeGroup   
888     WorkTypeGroupMember   

                                               Summary  \
0    Represents event participants (invitees or att...   
1    Represents an individual account, which is an ...   
2    Represents the brand details of a Partner Acco...   
3    Represents a relationship between a contact an...   
4    Stores the metadata Data.com Clean uses to det...   
..                                                 ...   
884       Represents the type of WorkRewardFundobject.   
885  Represents the source and message of a thanks ...   
886  Represents a type of work to be performed in F...   
887  Represents a grouping of work types used t

In [40]:
df_objects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 7 columns):
 #   Column                                         Non-Null Count  Dtype 
---  ------                                         --------------  ----- 
 0   ObjectName                                     889 non-null    object
 1   Summary                                        889 non-null    object
 2   Supported Calls                                873 non-null    object
 3   Special Access Rules                           389 non-null    object
 4   Usage                                          381 non-null    object
 5   Associated Objects                             177 non-null    object
 6   Additional Considerations and Related Objects  6 non-null      object
dtypes: object(7)
memory usage: 48.7+ KB


In [51]:
df_objects.head(10)

Unnamed: 0,ObjectName,Summary,Supported Calls,Special Access Rules,Usage,Associated Objects,Additional Considerations and Related Objects
0,AcceptedEventRelation,Represents event participants (invitees or att...,"describeSObjects(), query(), retrieve()",,,,
1,Account,"Represents an individual account, which is an ...","create(), delete(), describeLayout(), describe...",Customer Portal users can access their own acc...,Use this object to query and manage accounts i...,This object has the following associated objec...,
2,AccountBrand,Represents the brand details of a Partner Acco...,"create(), delete(), describeLayout(), describe...",This object is available only if Salesforce Co...,,This object has the following associated objec...,
3,AccountContactRelation,Represents a relationship between a contact an...,"create(), delete(), describeLayout(), describe...",,Use this object to associate a single contact ...,,
4,AccountCleanInfo,Stores the metadata Data.com Clean uses to det...,"describeSObjects(), getDeleted(), getUpdated()...",,Administrators can modify a limited set of Acc...,,
5,AccountContactRole,Represents the role that a Contactplays on an ...,"create(), delete(), describeSObjects(), getDel...",,,,
6,AccountInsight,Represents an individual insight (a key busine...,"describeLayout(), describeSObjects(), getDelet...",To see an insight related to a specific accoun...,This object is read-only and isn’t supported w...,,
7,AccountOwnerSharingRule,Represents the rules for sharing an account wi...,"create(), delete(), describeSObjects(), getDel...",,Use this object to manage the sharing rules fo...,,
8,AccountPartner,This object represents a partner relationship ...,"create(), delete(), describeLayout()describeSO...",,,,
9,AccountRelationship,Represents a relationship of a given type betw...,"create(), delete(), describeLayout(), describe...","In Community Settings, turn on the Enable Acco...",,This object has the following associated objec...,


In [35]:
print(len(error_url))

14


In [37]:
df_objects.to_csv(r'/Users/macbook16decamille/Castor/Objects.csv', index = False)

In [44]:
error_url_2 = []
df_objects_2 = pd.DataFrame(columns=['ObjectName', 'Summary', 'Supported Calls','Special Access Rules','Usage','Associated Objects','Additional Considerations and Related Objects'])

for url in getcontent_url_list:
    try :
        gc_content = requests.get(url)
        json_content = json.loads(gc_content.content)
        html_content = json_content['content']

        ## Cleaning the html code
        # I removed the span[@class='keyword parmname'] because I wouldn't get “Comment“ in the Description of the Field Name “Response“
        # I removed the <samp class="codeph nolang"> because I wouldn't get the Supported Calls otherwise
        html_content_1 = html_content.replace('<span class="keyword parmname">', "")
        html_content_2 = html_content_1.replace('</span>', "")
        html_content_3 = html_content_2.rweplace('\n            ', " ")
        html_content_4 = html_content_3.replace('<samp class="codeph nolang">','')
        html_content_5 = html_content_4.replace('</samp>','')
        html_content_6 = html_content_5.replace('\n   ','')
        html_content_7 = html_content_6.replace('       ','')
        html_content_8 = html_content_7.replace('\t','')
        html_content_9 = html_content_8.replace('\n',' ')
        html_content_clean = html_content_9.replace('\n           ',' ')


        # Importing lxml and creating a tree

        from lxml import html
        tree = html.fromstring(html_content_clean)

        # Extracting the name of the standard object
        object_name_list = tree.xpath('//span[@id="topic-title"]/text()')
        if len(object_name_list) > 0:
            object_name = object_name_list[0]
        else: 
            object_name_list = tree.xpath('//h1[@class="helpHead1"]/text()')
            object_name = object_name_list[0]

        # Extracting the description of the standard object
        summary_list = tree.xpath('//span[contains(@id,"summary")]/text()')
        if len(summary_list) > 0:
            summary = summary_list[0]
        else:
            summary_list = tree.xpath('//span[contains(@id,"summary")]/span/text()')
            summary = summary_list[0]

        # Extracting the Supported Calls of the standard object
        supported_calls_list = tree.xpath('//div[@class="section" and ./h2[contains(text(),"Supported Calls")]]/p/text()')
        if len(supported_calls_list)>0: 
            supported_calls = supported_calls_list[0]
        else: supported_calls = None


        # Extracting the Special Access Rules of the standard object
        special_access_rules_list = tree.xpath('//div[@class="section" and ./h2[contains(text(),"Special Access")]]/p/text()')
        if len(special_access_rules_list)>0: 
            special_access_rules = special_access_rules_list[0]
        else: special_access_rules = None

        # Extracting the Usage of the standard object
        usage_list = tree.xpath('//div[@class="section" and ./h2[contains(text(),"Usage")]]/p/text()')
        if len(usage_list)>0: 
            usage = usage_list[0]
        else: usage = None

        # Extracting the Associated Objects of the standard object
        associated_objects_list = tree.xpath('//div[@class="section" and ./h2[contains(text(),"Associated Objects")]]/p/text()')
        if len(associated_objects_list)>0: 
            associated_objects = associated_objects_list[0]
        else: associated_objects = None

        # Extracting the Additional Considerations and Related Objects of the standard object
        additional_considerations_list = tree.xpath('//div[@class="section" and ./h2[contains(text(),"Additional Considerations")]]/p/text()')
        if len(additional_considerations_list)>0: 
            additional_considerations = additional_considerations_list[0]
        else: additional_considerations = None

        info_dict = {'ObjectName':object_name,'Summary':summary,'Supported Calls':supported_calls,'Special Access Rules':special_access_rules,'Usage':usage,'Associated Objects':associated_objects,'Additional Considerations and Related Objects':additional_considerations}
        df_objects_2 = df_objects.append(info_dict, ignore_index=True)
        
    except IndexError :
        error_url_2.append(url)

In [45]:
print(df_objects_2)

                 ObjectName  \
0     AcceptedEventRelation   
1                   Account   
2              AccountBrand   
3    AccountContactRelation   
4         AccountCleanInfo    
..                      ...   
885             WorkThanks    
886                WorkType   
887           WorkTypeGroup   
888     WorkTypeGroupMember   
889     WorkTypeGroupMember   

                                               Summary  \
0    Represents event participants (invitees or att...   
1    Represents an individual account, which is an ...   
2    Represents the brand details of a Partner Acco...   
3    Represents a relationship between a contact an...   
4    Stores the metadata Data.com Clean uses to det...   
..                                                 ...   
885  Represents the source and message of a thanks ...   
886  Represents a type of work to be performed in F...   
887  Represents a grouping of work types used to ca...   
888  Represents the relationship between a work

In [46]:
df_objects_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 7 columns):
 #   Column                                         Non-Null Count  Dtype 
---  ------                                         --------------  ----- 
 0   ObjectName                                     890 non-null    object
 1   Summary                                        890 non-null    object
 2   Supported Calls                                874 non-null    object
 3   Special Access Rules                           389 non-null    object
 4   Usage                                          381 non-null    object
 5   Associated Objects                             178 non-null    object
 6   Additional Considerations and Related Objects  6 non-null      object
dtypes: object(7)
memory usage: 48.8+ KB


In [48]:
print(len(error_url_2))

14
