In [1]:
import requests
from bs4 import BeautifulSoup # para analizar el HTML
import pandas as pd
import numpy as np
from boilerpy3 import extractors
import re

### OBTAIN THE METADATA OF THE PAGE
The metadata is a simple description of the page.

In [2]:
def html_request(url: str):
    try:
        # Realizar la solicitud HTTP
        response = requests.get(url)
        response.raise_for_status()  # Lanzar una excepción para errores HTTP 

        # Verificar si la solicitud fue exitosa (código de estado 200)
        print(response.status_code)
        if response.status_code == 200:
            # Parsear el contenido HTML de la página
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        else:
            print(f'Error al realizar la solicitud. Código de estado: {response.status_code}')
            return f'ERROR!: {response.status_code}'
    
    except requests.exceptions.RequestException as e:
        # Capturar excepción de solicitud y devolver un mensaje de error
        return f'Request_error!: {e}'

    except Exception as e:
        # Capturar cualquier otra excepción y devolver un mensaje de error
        return f'Unexpected_error!: {e}'

In [8]:
def obtain_metadata(html):
    if html is None or not isinstance(html, BeautifulSoup):
        return html
    
    meta_tags = html.find_all('meta')  

    # Inicializar un diccionario para almacenar la metadata
    metadata = {}

    # Iterar sobre las etiquetas meta y extraer información relevante
    for tag in meta_tags:
        nombre = tag.get('name', '').lower()
        propiedad = tag.get('property', '').lower()
        contenido = tag.get('content', '')

        if nombre or propiedad:
            # Almacenar la información en el diccionario de metadata
            metadata[nombre or propiedad] = contenido

    return metadata

In [2]:
import pandas as pd

In [3]:
csv_file = '/home/unai/datasets/POC Description of operations - Sheet3.csv'
# Leer el archivo CSV y crear un DataFrame
data = pd.read_csv(csv_file)

In [4]:
data.head()

Unnamed: 0,Company_NAME,SIC1,URL,WEB
0,"BRINK'S UKRAINE, INC.",7381,us.brinks.com,see my product options\nThe way in which you h...
1,"AES CALGARY, INC.",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...
2,AURORA AES HOLDINGS INC,4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...
3,"AES CENTRAL AMERICAN HOLDINGS, INC.",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...
4,"AES CARBON HOLDINGS, LLC",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...


In [25]:
url = 'https://'
url = url + 'www.creelighting.com'  #'www.carlyle.com'
html = html_request(url)

200


In [None]:
metadata = obtain_metadata(html)
# Imprimir la metadata obtenida
for clave, valor in metadata.items():
    print(f'{clave}: {valor}')

description: With $382 billion of assets under management, Carlyle’s purpose is to invest wisely and create value on behalf of our investors, portfolio companies, and communities.
abstract: With $382 billion of assets under management, Carlyle’s purpose is to invest wisely and create value on behalf of our investors, portfolio companies, and communities.
robots: index, follow
google-site-verification: 2Y2hej7s3nXX9HyOiRt8ZoZ5bImc4N2_iigmai9tSpY
generator: Drupal 9 (https://www.drupal.org)
mobileoptimized: width
handheldfriendly: true
viewport: width=device-width, initial-scale=1.0


Here in an example, we can see the info obtained inside the meta tag of the html. As we can see the description field, summarizes in a single line the objectives of the organization

In [6]:
urls = data['URL'].sample(n=10, random_state=1)

In [24]:
all_urls = data['URL']

In [28]:
url_descriptions_all = {}
url_descriptions_all = pd.DataFrame({'URL': all_urls})
url_base = 'http://'
for url in all_urls:
    print(url)
    url_completa = url_base + url
    html = html_request(url_completa)
    metadata = obtain_metadata(html)
    if type(metadata) != str:
        print(metadata.get('description'))
        url_descriptions_all.loc[url_descriptions_all['URL'] == url, 'metadata'] = metadata.get('description')
    else:
        url_descriptions_all.loc[url_descriptions_all['URL'] == url, 'metadata'] = metadata

us.brinks.com
200
When you choose Brink’s, you’ll help your business save time and money while optimizing your operations, protecting your funds, managing your cash.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global en

In [29]:
url_descriptions_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URL       300 non-null    object
 1   metadata  267 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB


As we can see 30 organizations hadn't meta tag inside their html

In [30]:
metadata_csv = 'metadata_empresas.csv'

# Guardar el DataFrame en un archivo CSV
url_descriptions_all.to_csv(metadata_csv, index=False)

Now we want to see how many error have meen obtained during the request of the html because different reasons

In [33]:
errores_metadata = url_descriptions_all
errores_metadata['errores'] = url_descriptions_all['metadata'].apply(lambda x: str(x).lower().find('error'))

In [37]:
errores_metadata['errores'].value_counts(normalize=True)

errores
-1    0.766667
 8    0.233333
Name: proportion, dtype: float64

# We obtain a correct summarize of the organization role with the meta tag in 76% of the cases.

### OBTAIN ALL THE HTML OF THE WEB

In [42]:
html_extractor = extractors.ArticleExtractor()
html_extractor.get_content_from_url('https://www.carlyle.com/our-firm')

"Sorry, your browser doesn't support embedded videos\nWho We Are\nOur Firm\nCarlyle is one of the world’s largest and most diversified global investment firms, with $382 billion of assets under management across 3 business segments and 600 investment vehicles. Founded in 1987 in Washington, DC, our global team today is comprised of more than 2,200 professionals operating in 28 offices across 4 continents. Together, across our firm, our mission is to drive long-term value for our investors, companies, shareholders, people and communities.\nOur Business Segments\nGLOBAL PRIVATE EQUITY\nDirectory\nOur Approach\nSince our founding over 30 years ago, we have always looked to create lasting partnerships across all of our businesses. We work with our partners to find solutions that drive sustainable value and impact over the long term. We have built trust and credibility by leveraging our global scale, industry expertise and diverse insights to deliver better solutions and build better busine

In [None]:
html_extractor

In [26]:
# Ejemplo: Extraer todos los enlaces de la página
links = html.find_all('a')

# Imprimir los enlaces encontrados
for link in links:
    print(link.get('href'))

/?lang=en
/?lang=fr
/document-library/?document_categories=spec-sheets
/company/careers/
/contact/
/resources/general-form/
/lighting-contact-form/
/consumer-lighting-form/
/where-to-buy/
/where-to-buy/commercial/
/where-to-buy/petroleum/
/products/intelligent-lighting/connected-max-smart-products/#where-to-buy
tel:800-236-6800
tel:800-473-1234
/
/products/
/products/outdoor/
/products/outdoor/area/
/products/outdoor/canopy-and-soffit/
/products/outdoor/street-and-roadway/
/products/outdoor/decorative-street-and-roadway/
/products/outdoor/parking-structure/
/products/outdoor/bollards-and-pathway/
/products/outdoor/flood/
/products/outdoor/wall-mount/
/products/outdoor/vapor-tight/
/products/outdoor/accessories-outdoor/
/products/outdoor/accessories-outdoor/?product_categories=poles-tenons-brackets
/products/indoor/
/products/indoor/troffers/
/products/indoor/high-bay-low-bay/
/products/indoor/dynamic-skylight/
/products/indoor/specification-linear/
/products/indoor/surface-ambient/
/pr

In [19]:
html_compelte = html.prettify()

In [26]:
# Returns the text from a HTML file
def parse_html(html_path):
    # Text extraction with boilerpy3
    html_extractor = extractors.ArticleExtractor()
    return html_extractor.get_content(html_path)

In [13]:
# Returns the text from a HTML file
def parse_html_url(html_path):
    # Text extraction with boilerpy3
    html_extractor = extractors.ArticleExtractor()
    return html_extractor.get_content_from_url(html_path)

In [14]:
# Extracts the text from all html files in a specified directory
def html_to_text(html):
    parsed_text = parse_html_url(html)
    return parsed_text

In [22]:
def obtain_menu_link(html):
    # Ejemplo: Extraer todos los enlaces de los menus de la pagina
    links = html.find_all('li')
    print(links)
    # Imprimir los enlaces encontrados
    for link in links:
        if link.get('class') == ['menu-item']: # nos quedamos con los de tipo menu
            item = link.find_all('a')
            item_links = [a['href'] for a in item]
    return item_links[0] 

In [None]:
urls_extractor = data.sample(n=15, random_state=1).reset_index(drop=True)

In [24]:
url_base = 'https://'
html_text = np.empty(15, dtype=object)
for i, url in urls_extractor.iterrows():
    print(url['Company_NAME'])
    print(url['URL'])
    url_complete = url_base + url['URL']
    html = html_request(url_complete)
    item_links = obtain_menu_link(html)
    if str(html).lower().find('error!:') != -1:
        print(html)
    else:
        html_compelte = html.prettify()
        html_text[i] = html_to_text(html_compelte)
    for item_link in item_links: 
        print(url_complete + item_link)
        html = html_request(url_complete + item_link)
        if str(html).lower().find('error!:') != -1:
            print(html)
        else:
            html_compelte = html.prettify()
            html_text[i] = html_text[i] + html_to_text(html_compelte)

IDEAL INDUSTRIES LIGHTING LLC
www.creelighting.com
200
[<li class="css-pr10xp ezh7jmf0"><style data-emotion="css 1ny9qib">.css-1ny9qib{-webkit-transition:border-bottom 200ms ease-in-out 0ms;transition:border-bottom 200ms ease-in-out 0ms;font-size:1.8rem;color:#2C2C2C;position:relative;}.css-1ny9qib:hover,.css-1ny9qib:focus{-webkit-text-decoration:underline;text-decoration:underline;}</style><a class="css-1ny9qib ezh7jmf2" data-label="Products" href="/products/outdoor/area/">Area</a></li>, <li class="css-pr10xp ezh7jmf0"><a class="css-1ny9qib ezh7jmf2" data-label="Products" href="/products/outdoor/canopy-and-soffit/">Canopy &amp; Soffit</a></li>, <li class="css-pr10xp ezh7jmf0"><a class="css-1ny9qib ezh7jmf2" data-label="Products" href="/products/outdoor/street-and-roadway/">Street &amp; Roadway</a></li>, <li class="css-pr10xp ezh7jmf0"><a class="css-1ny9qib ezh7jmf2" data-label="Products" href="/products/outdoor/decorative-street-and-roadway/">Decorative Street &amp; Roadway</a></li>, 

UnboundLocalError: cannot access local variable 'item_links' where it is not associated with a value

In [18]:
# Ejemplo: Extraer todos los enlaces de los menus de la pagina
links = html.find_all('li')

# Imprimir los enlaces encontrados
for link in links:
    if link.get('class') == ['menu-item']:
        item = link.find_all('a')
        item_links = [a['href'] for a in item]
        print(item_links[0])

/our-firm
/our-firm/global-private-equity
/our-firm/global-credit
/our-firm/global-investment-solutions
/global-insights
/esg
/diversity
/careers
/news-media
/contact-us
https://lpconnect.carlyle.com/
http://ir.carlyle.com/
/notices-and-disclaimers
/notices-and-disclaimers#notice-regarding-fake-news-and-fraudulent-activity
/notices-and-disclaimers#transparency--reporting
/notices-and-disclaimers#cookies-policy
/notices-and-disclaimers#privacy-notice
/notices-and-disclaimers#terms-of-use-policy


In [28]:
text = html_to_text(html_compelte)

In [29]:
text

"Sorry, your browser doesn't support embedded videos\nAnatomy of a Deal\nAnatomy of a Private Equity Deal: ESG integration in action\nIntegrating ESG data, analysis, and action helps us drive value across four major components of our equity investment process\nInvestment Deal Sourcing\nIn a changing world we are constantly evaluating our investment opportunity set, led by the thematic expertise of our specialized investment teams. Increasingly, ESG and impact themes are helping our investors assess investment opportunities from shifts such as:\nEmerging growth markets – for example, technologies driving better health outcomes at a lower cost, such as One Medical’s platform, described here\nMarket disruptions – the electrification of the vehicle fleet, for example, as seen through our Axletech investment in last year’s report here\nChanging consumer preferences – growing demand for sustainable and transparent goods and services, as demonstrated through Weiman’s growth in greener cleanin

In [40]:
urls_extractor = data.sample(n=15, random_state=1).reset_index(drop=True)

In [41]:
urls_extractor.head()

Unnamed: 0,Company_NAME,SIC1,URL
0,IDEAL INDUSTRIES LIGHTING LLC,3641,www.creelighting.com
1,"GOODWILL INDUSTRIES OF ORANGE COUNTY, CALIFORNIA",5932,www.ocgoodwill.org
2,KKR GROUP FINANCE CO. VI LLC,6282,www.kkr.com
3,WINK TO WEBSTER PIPELINE LLC,1623,www.winktowebsterpipeline.com
4,"AUTOMATIC DATA PROCESSING INSURANCE AGENCY, INC.",7374,insurance.adp.com


In [45]:
url_base = 'https://'
html_text = np.empty(15, dtype=object)
for i, url in urls_extractor.iterrows():
    print(url['Company_NAME'])
    url_complete = url_base + url['URL']
    html = html_request(url_complete)
    if str(html).lower().find('error!:') != -1:
        print(html)
    else:
        html_compelte = html.prettify()
        html_text[i] = html_to_text(html_compelte)

IDEAL INDUSTRIES LIGHTING LLC
200
GOODWILL INDUSTRIES OF ORANGE COUNTY, CALIFORNIA
Request_error!: 403 Client Error: Forbidden for url: https://www.ocgoodwill.org/
KKR GROUP FINANCE CO. VI LLC
200
WINK TO WEBSTER PIPELINE LLC
Request_error!: HTTPSConnectionPool(host='www.winktowebsterpipeline.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff9c14c4690>: Failed to establish a new connection: [Errno 111] Connection refused'))
AUTOMATIC DATA PROCESSING INSURANCE AGENCY, INC.
200
OIL STATES INDUSTRIES US, INC.
200
AMERICAN RETIREMENT ASSOCIATION




200
BECTON DICKINSON LUXEMBOURG III LLC
200
CARLYLE ASIA REAL ESTATE II GP, L.P.
200
ROC NATION ADVERTISING LLC
200
RAH SACO-CH, LLC
200
NEXEO PLASTICS, LLC
200
ALLIED GROUP, LLC
200
INDY ASBURY CHEV LLC
200
MONGODB, INC.
200


In [52]:
html_text[5]

'More News »\nOffshore/Manufactured Products\nDesigns, manufactures and sells capital equipment utilized on floating production systems, subsea pipelines, offshore drilling rigs and vessels, and for industrial applications along with shorter-cycle products used in land applications.\nLearn More »\nWell Site Services\nProvides service equipment and personnel primarily for completion and production operations throughout the active regions of the United States, Gulf of Mexico, and internationally, along with land drilling services in the U.S. Rocky Mountain region.\n'

I dind't develop so much si option as it has too much noise and i don't know if it could be neccesary to analize all that info to extract some value data from this