This notebook presents a bot to calculate ESG Scoref from a list of companies divided in

- The data collection
- ESG calculation using the calculation of the ESG score based on the method introduced in the paper Sustainable Entrepreneurship: Identification of Startups' ESG Properties from Text Data with Machine Learning

Python code for the calculation of the ESG score based on available code at sustainableentrepreneurship.org
- https://github.com/sasi2400/sustainableentrepreneurship.org/blob/main/Notebooks/ESG%20score%20calculation.ipynb

# 1 - Imports and methods

In [9]:
# import required packages
import pandas as pd
import numpy as np
import re
from cleantext import clean # install via: pip install clean-text

#import the word-lists
ESGwordlist= pd.read_csv('https://www.dropbox.com/s/e28dihonntg8o82/expanded_dict.csv?dl=1')
e = [x.replace('_',' ') for x in list(ESGwordlist['E'].dropna())]
s = [x.replace('_',' ') for x in list(ESGwordlist['S'].dropna())]
g = [x.replace('_',' ') for x in list(ESGwordlist['G'].dropna())]

In [10]:
# packages required for scraping
import requests
import re
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup

from urllib.request import urlopen

In [11]:
# required methods
def get_ngrams(s, n):
    '''
    tokenize an input text 
    source: https://albertauyeung.github.io/2018/06/03/generating-ngrams.html
    '''
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

def cleaner(txt):    
    '''
    Clean the input text
    '''
    return clean(
        txt,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_phone_numbers=True,         # replace all phone numbers with a special token
        no_numbers=True,                # replace all numbers with a special token
        no_digits=True,                 # replace all digits with a special token
        no_currency_symbols=True,      # replace all currency symbols with a special token
        no_punct=True,                 # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="",
        replace_with_currency_symbol="",
        lang="en"                       # set to 'de' for German special handling
        )

# 2 - ESG Calculator

In [12]:
def ESG_Calculator(text):
    '''
    return the ESG scores for an input text
    '''
    try:      
        text=re.sub('[\\n]','',text)
        text=get_ngrams(text,1)+get_ngrams(text,2)+get_ngrams(text,3)+get_ngrams(text,4)
        counts=[]
        counts= [text.count(x) for x in e]
        e_Freq=sum(counts) 
        e_Diversity = len([x for x in counts if x!=0])    
        counts= [text.count(x) for x in s]
        s_Freq=sum(counts) 
        s_Diversity = len([x for x in counts if x!=0])            
        counts= [text.count(x) for x in g]
        g_Freq=sum(counts) 
        g_Diversity = len([x for x in counts if x!=0])    
        esg_Diversity= e_Diversity/len(e) +s_Diversity/len(s) +g_Diversity/len(g)
        return (esg_Diversity,(e_Diversity/len(e)),(s_Diversity/len(s)),(g_Diversity/len(g)))
    except Exception as ex:
        print(ex)
        return (0,0,0,0)


In [13]:
def mostreESG(ESG):
    print('''
            ESG Score: {0}  \n
            Environmental Score: {1}  \n
            Social Score: {2} \n
            Governance Score {3} 
            '''.format(np.round(ESG[0], 3),np.round(ESG[1], 3),np.round(ESG[2], 3),np.round(ESG[3], 3)))

In [14]:
import csv

AllData = []
count=0
with open('../../data/icobase/TOTAL_ICOMARKS.csv') as argsFile:
	spamreader = csv.reader(argsFile, delimiter=',')
	for row in spamreader:
		if row[2].find(".html") == -1:
			row[2] = row[2] + ".html"
			count=count+1
		AllData.append(row)
header = AllData.pop(0)

In [16]:
import csv

path = '../../data/'
AllData = []
count=0
with open(path+'icobase/TOTAL_ICOMARKS.csv') as argsFile:
	spamreader = csv.reader(argsFile, delimiter=',')
	for row in spamreader:
		AllData.append(row)
header = AllData.pop(0)

## WebScout ESG
  - Designed to automate the process of gathering and analyzing data from a list of web addresses.
  - It systematically visits each provided URL, extracts relevant textual content—most likely company
    descriptions—and or whitepapers
- Use the function *ESG_Calculator*, to compute Environmental, Social, and Governance (ESG) scores
   based on the extracted text of each company, and integrated the ESG scores back into the original
   data structure (`AllData`).
- **Note:** the script incorporates robust error handling mechanisms to manage potential issues 
  during network requests and maintains a record of any URLs that could not be successfully processed,
  allowing for subsequent review or re-attempt.

In [278]:
ammntICOs = len(AllData)
failed_urls = []
for i in range(ammntICOs-1):
    url = AllData[i][2]
    #response = None
    try:
        response = requests.get(url) #, timeout=3)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print ("OOps: Something Else",err)
    except requests.exceptions.HTTPError as errh:
        print ("Http Error:",errh)
    except requests.exceptions.ConnectionError as errc:
        print ("Error Connecting:",errc)
    except requests.exceptions.Timeout as errt:
        print ("Timeout Error:",errt)  
    if response:
        # parse the HTML content of the page
        soup = BeautifulSoup(response.content, "html.parser")
        product_search_element = soup.find(id="company-description")
        list_item = soup.find('div', attrs={'class': 'company-description'})
        text = list_item.text.strip()
        ESG = ESG_Calculator(text)
        AllData[i][3] = np.round(ESG[0], 3)
        AllData[i][4] = np.round(ESG[1], 3)
        AllData[i][5] = np.round(ESG[2], 3)
        AllData[i][6] = np.round(ESG[3], 3)
        #mostreESG(ESG)
    else:
        failed_urls.append(url)

OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/Flash
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/black-wave
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/livetree
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/Digipay-Network
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/Azuma-Games
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/blockchain/the-ux
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/TAGZ
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/EasyDex
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/CRYPTO-TRON-EXCHANGE-AND-SHOP
OOps: Something Else 404 Client Error: Not Found for url: https://icomarks.com/ico/CareerXon
OOps: Something Else 404 Client Error: Not Found 

- ico_notfound is alist of URLs that could not be successfully processed

In [285]:
import csv 

ammnt_failed = len(failed_urls)
tam = len(AllData)

copy = failed_urls
    
ico_notfound = []
ico_notfound.append(['id', 'name', 'url', 'ESG', 'E', 'S', 'G'])

for i in range(ammnt_failed):  
    notfound = True 
    j = 0
    while notfound and j < tam:
        if failed_urls[i] == AllData[j][2]:
            ico_notfound.append([AllData[j][0],AllData[j][1],copy[i],AllData[j][3],AllData[j][4],AllData[j][5],AllData[j][6]])
            notfound = False
        else:
            j= j + 1
len(ico_notfound)

76

In [288]:
# FALHAS_ICOMARKS.csv has all companies url that failed to load 
import csv 
path = '../../data/'
filename = path+'icobase/FALHAS_ICOMARKS.csv'
with open(filename, 'w', newline="") as newfile:
	csvwriter = csv.writer(newfile)
#	csvwriter.writerow(header[0])
	csvwriter.writerows(ico_notfound)
newfile.close()

In [289]:
DataFoundico = []

with open(path+'icobase/TOTAL_FOUNDICO.csv') as argsFile:
	spamreader = csv.reader(argsFile, delimiter=',')
	for row in spamreader:
		DataFoundico.append(row)
headerOf = DataFoundico.pop(0)
ammntFoundico = len(DataFoundico)

In [292]:
failed_foundico_urls = []
ammntFoundico = len(DataFoundico)
for i in range(ammntFoundico-1):
    url = DataFoundico[i][2]
#    response = requests.get(url)   
    try:
        response = requests.get(url) #, timeout=3)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print ("OOps: Something Else",err)
    except requests.exceptions.HTTPError as errh:
        print ("Http Error:",errh)
    except requests.exceptions.ConnectionError as errc:
        print ("Error Connecting:",errc)
    except requests.exceptions.Timeout as errt:
        print ("Timeout Error:",errt)  
    if response:
        # parse the HTML content of the page
        print(url)
        soup = BeautifulSoup(response.content, "html.parser")
        lista = soup.find_all('p')
        text = soup.find_all('p')[1].get_text()
        #text = list_item.text.strip()
        ESG = ESG_Calculator(text)
        DataFoundico[i][3] = np.round(ESG[0], 3)
        DataFoundico[i][4] = np.round(ESG[1], 3)
        DataFoundico[i][5] = np.round(ESG[2], 3)
        DataFoundico[i][6] = np.round(ESG[3], 3)
        #mostreESG(ESG)
    else:
        failed_foundico_urls.append(url)

OOps: Something Else 404 Client Error: Not Found for url: https://foundico.com/ico/greenenergy.html
https://foundico.com/ico/PowerCorp.HTML
https://foundico.com/ico/HERO-Token.HTML
https://foundico.com/ico/PowerLoan.HTML
https://foundico.com/ico/Gene-Blockchain-Token.HTML
https://foundico.com/ico/Vinnd.HTML
https://foundico.com/ico/SmartHealthcareToday.HTML
https://foundico.com/ico/NWP-solution.HTML
https://foundico.com/ico/THEBIKECOIN.HTML
https://foundico.com/ico/HUSSY.HTML
https://foundico.com/ico/Wind-Energy-Mining.HTML
https://foundico.com/ico/GMP-Project.HTML
https://foundico.com/ico/Industria.HTML
https://foundico.com/ico/Seratio-Platform.HTML
https://foundico.com/ico/TuteCoin.HTML
https://foundico.com/ico/Clear-Shop-Vision-ORC.HTML
https://foundico.com/ico/TrustedCars-Flex.HTML
https://foundico.com/ico/LEONARDO.HTML
https://foundico.com/ico/Cher-Ecocity.HTML
https://foundico.com/ico/flat-earth-coin-.html
https://foundico.com/ico/CryptoHunters.HTML
https://foundico.com/ico/Izzy-

https://foundico.com/ico/HAWKOIN.HTML
https://foundico.com/ico/Clear-Shop-Vision-ltd.HTML
https://foundico.com/ico/SunX.HTML
https://foundico.com/ico/RAD-Lending-Inc.HTML
https://foundico.com/ico/GastroAdvisor.HTML
https://foundico.com/ico/Upstake.HTML
https://foundico.com/ico/Briastorm.HTML
https://foundico.com/ico/Neuroseed.HTML
https://foundico.com/ico/Dealjoy.HTML
https://foundico.com/ico/Orvium.HTML
https://foundico.com/ico/DrupeCoin.HTML
https://foundico.com/ico/MODULE.HTML
https://foundico.com/ico/Museums-Chain.HTML
https://foundico.com/ico/CEREAL.HTML
https://foundico.com/ico/ECX-Token.HTML
https://foundico.com/ico/EJA-COIN.HTML
https://foundico.com/ico/ROAR.HTML
https://foundico.com/ico/LENDELTA.HTML
https://foundico.com/ico/PayPerBlock.HTML
https://foundico.com/ico/BIT-MONEY.HTML
https://foundico.com/ico/Land-LayBy-Listing.HTML
https://foundico.com/ico/Echarge.HTML
https://foundico.com/ico/CarHash.HTML
https://foundico.com/ico/First-Investment-Token.HTML
https://foundico.com/

In [328]:
FoundicoFound = []
tam_failed = len(failed_foundico_urls)
ammntFoundico = len(DataFoundico)

for i in range(ammntFoundico - 1):
    notFound = True
    j = 0
    while notFound and j < tam_failed:
        if (DataFoundico[i][2] != failed_foundico_urls[j]):
            FoundicoFound.append(DataFoundico[i])
            notFound = False
        else:
            print(failed_foundico_urls[j])
            j = j+1

https://foundico.com/ico/GreenEnergy.HTML


In [329]:
tam_found = len(FoundicoFound)
for i in range(tam_found-1):
    AllData.append(FoundicoFound[i])

In [330]:
len(AllData)

6357

In [331]:
import csv 
tam = len(AllData)

icos_esg = []
icos_esg.append(['id', 'name', 'url', 'ESG', 'E', 'S', 'G'])

for i in range(tam):  
    icos_esg.append(AllData[i])
path = '../../data/'
filename = path+'icobase/icos_esg_09_04.csv'
with open(filename, 'w', newline="") as newfile:
    csvwriter = csv.writer(newfile)
    csvwriter.writerows(icos_esg)
newfile.close()

In [223]:
failed_urls = []

for i in range(ammntResrICOs-1):
    url = RestOfAllData[i][2]
    response = requests.get(url)
    if response:
        # parse the HTML content of the page
        soup = BeautifulSoup(response.content, "html.parser")
        product_search_element = soup.find(id="company-description")
        list_item = soup.find('div', attrs={'class': 'company-description'})
        text = list_item.text.strip()
        ESG = ESG_Calculator(text)
        RestOfAllData[i][3] = np.round(ESG[0], 3)
        RestOfAllData[i][4] = np.round(ESG[1], 3)
        RestOfAllData[i][5] = np.round(ESG[2], 3)
        RestOfAllData[i][6] = np.round(ESG[3], 3)
        #mostreESG(ESG)
    else:
        failed_urls.append(url)

IndexError: list index out of range