In [1]:
import requests
from bs4 import BeautifulSoup # para analizar el HTML
import pandas as pd
import numpy as np

### OBTAIN THE METADATA OF THE PAGE
The metadata is a simple description of the page.

In [2]:
def html_request(url: str):
    try:
        # Realizar la solicitud HTTP
        response = requests.get(url)
        response.raise_for_status()  # Lanzar una excepción para errores HTTP 

        # Verificar si la solicitud fue exitosa (código de estado 200)
        print(response.status_code)
        if response.status_code == 200:
            # Parsear el contenido HTML de la página
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        else:
            print(f'Error al realizar la solicitud. Código de estado: {response.status_code}')
            return f'ERROR: {response.status_code}'
    
    except requests.exceptions.RequestException as e:
        # Capturar excepción de solicitud y devolver un mensaje de error
        return f'Request_error: {e}'

    except Exception as e:
        # Capturar cualquier otra excepción y devolver un mensaje de error
        return f'Unexpected_error: {e}'

In [3]:
def obtain_metadata(html):
    if html is None or not isinstance(html, BeautifulSoup):
        return html
    
    meta_tags = html.find_all('meta')  

    # Inicializar un diccionario para almacenar la metadata
    metadata = {}

    # Iterar sobre las etiquetas meta y extraer información relevante
    for tag in meta_tags:
        nombre = tag.get('name', '').lower()
        propiedad = tag.get('property', '').lower()
        contenido = tag.get('content', '')

        if nombre or propiedad:
            # Almacenar la información en el diccionario de metadata
            metadata[nombre or propiedad] = contenido

    return metadata

In [4]:
csv_file = '/home/unai/datasets/POC Description of operations - Sheet3.csv'
# Leer el archivo CSV y crear un DataFrame
data = pd.read_csv(csv_file)

In [5]:
data.head()

Unnamed: 0,Company_NAME,SIC1,URL
0,"BRINK'S UKRAINE, INC.",7381,us.brinks.com
1,"AES CALGARY, INC.",4911,www.aes.com
2,AURORA AES HOLDINGS INC,4911,www.aes.com
3,"AES CENTRAL AMERICAN HOLDINGS, INC.",4911,www.aes.com
4,"AES CARBON HOLDINGS, LLC",4911,www.aes.com


In [None]:
url = 'https://'
url = url + 'www.carlyle.com'
html = html_request(url)
type(html)

In [None]:
metadata = obtain_metadata(html)
# Imprimir la metadata obtenida
for clave, valor in metadata.items():
    print(f'{clave}: {valor}')

description: With $382 billion of assets under management, Carlyle’s purpose is to invest wisely and create value on behalf of our investors, portfolio companies, and communities.
abstract: With $382 billion of assets under management, Carlyle’s purpose is to invest wisely and create value on behalf of our investors, portfolio companies, and communities.
robots: index, follow
google-site-verification: 2Y2hej7s3nXX9HyOiRt8ZoZ5bImc4N2_iigmai9tSpY
generator: Drupal 9 (https://www.drupal.org)
mobileoptimized: width
handheldfriendly: true
viewport: width=device-width, initial-scale=1.0


Here in an example, we can see the info obtained inside the meta tag of the html. As we can see the description field, summarizes in a single line the objectives of the organization

In [6]:
urls = data['URL'].sample(n=10, random_state=1)

In [24]:
all_urls = data['URL']

In [28]:
url_descriptions_all = {}
url_descriptions_all = pd.DataFrame({'URL': all_urls})
url_base = 'http://'
for url in all_urls:
    print(url)
    url_completa = url_base + url
    html = html_request(url_completa)
    metadata = obtain_metadata(html)
    if type(metadata) != str:
        print(metadata.get('description'))
        url_descriptions_all.loc[url_descriptions_all['URL'] == url, 'metadata'] = metadata.get('description')
    else:
        url_descriptions_all.loc[url_descriptions_all['URL'] == url, 'metadata'] = metadata

us.brinks.com
200
When you choose Brink’s, you’ll help your business save time and money while optimizing your operations, protecting your funds, managing your cash.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global en

In [29]:
url_descriptions_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URL       300 non-null    object
 1   metadata  267 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB


As we can see 30 organizations hadn't meta tag inside their html

In [30]:
metadata_csv = 'metadata_empresas.csv'

# Guardar el DataFrame en un archivo CSV
url_descriptions_all.to_csv(metadata_csv, index=False)

Now we want to see how many error have meen obtained during the request of the html because different reasons

In [33]:
errores_metadata = url_descriptions_all
errores_metadata['errores'] = url_descriptions_all['metadata'].apply(lambda x: str(x).lower().find('error'))

In [37]:
errores_metadata['errores'].value_counts(normalize=True)

errores
-1    0.766667
 8    0.233333
Name: proportion, dtype: float64

# We obtain a correct summarize of the organization role with the meta tag in 76% of the cases.

### OBTAIN ALL THE HTML OF THE WEB

In [None]:
# Ejemplo: Extraer todos los enlaces de la página
links = html.find_all('a')

# Imprimir los enlaces encontrados
for link in links:
    print(link.get('href'))

In [16]:
print(html.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
 <head>
  <link href="https://www.carlyle.com/themes/carlyle_2020/favicons/favicon-96x96.png" rel="icon" sizes="96x96" type="image/png"/>
  <link href="https://www.carlyle.com/themes/carlyle_2020/favicons/favicon.ico" rel="shortcut icon"/>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"abf569a42c",applicationID:"65780628"};;/*! For license information please see nr-loader-rum-1.248.0.min.js.LICENSE.txt */
(()=>{var e,t,n={234:(e,t,n)=>{"us

I dind't develop so much si option as it has too much noise and i don't know if it could be neccesary to analize all that info to extract some value data from this