In [1]:
import requests
from bs4 import BeautifulSoup # para analizar el HTML
import pandas as pd
import numpy as np
from boilerpy3 import extractors
import html2text
import re
import time
from openai import OpenAI

### OBTAIN THE METADATA OF THE PAGE
The metadata is a simple description of the page.

First we scrape the url of the company to extract the html in raw

In [2]:
def html_request(url: str):
    try:
        # Realizar la solicitud HTTP
        response = requests.get(url)
        response.raise_for_status()  # Lanzar una excepción para errores HTTP 

        # Verificar si la solicitud fue exitosa (código de estado 200)
        print(response.status_code)
        if response.status_code == 200:
            # Parsear el contenido HTML de la página
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        else:
            print(f'Error al realizar la solicitud. Código de estado: {response.status_code}')
            return f'ERROR!: {response.status_code}'
    
    except requests.exceptions.RequestException as e:
        # Capturar excepción de solicitud y devolver un mensaje de error
        return f'Request_error!: {e}'

    except Exception as e:
        # Capturar cualquier otra excepción y devolver un mensaje de error
        return f'Unexpected_error!: {e}'

We filter the html to obtain the tag meta where we obtain a little description of the url 

In [8]:
def obtain_metadata(html):
    if html is None or not isinstance(html, BeautifulSoup):
        return html
    
    meta_tags = html.find_all('meta')  

    # Inicializar un diccionario para almacenar la metadata
    metadata = {}

    # Iterar sobre las etiquetas meta y extraer información relevante
    for tag in meta_tags:
        nombre = tag.get('name', '').lower()
        propiedad = tag.get('property', '').lower()
        contenido = tag.get('content', '')

        if nombre or propiedad:
            # Almacenar la información en el diccionario de metadata
            metadata[nombre or propiedad] = contenido

    return metadata

In [4]:
csv_file = '/home/unai/datasets/POC Description of operations - Sheet3.csv'
# Leer el archivo CSV y crear un DataFrame
data = pd.read_csv(csv_file)

In [None]:
data.head()

Unnamed: 0,Company_NAME,SIC1,URL,WEB,wikipedia_v1,wikipedia_v2,wikipedia_v3,WEB_2
0,"BRINK'S UKRAINE, INC.",7381,us.brinks.com,see my product options\nThe way in which you h...,,Joseph Robinette Biden Jr. ( BY-dən; born Nov...,,# Navigation\n\n * Skip to Content \n\n## A...
1,"AES CALGARY, INC.",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...,TransAlta Corporation (formerly Calgary Power ...,TransAlta Corporation (formerly Calgary Power ...,,Skip to main content\n\n[ ![Home](/themes/cust...
2,AURORA AES HOLDINGS INC,4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...,This is a list of companies in the United Stat...,This is a list of companies in the United Stat...,,Skip to main content\n\n[ ![Home](/themes/cust...
3,"AES CENTRAL AMERICAN HOLDINGS, INC.",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...,Union Pacific Corporation is a publicly traded...,Union Pacific Corporation is a publicly traded...,,Skip to main content\n\n[ ![Home](/themes/cust...
4,"AES CARBON HOLDINGS, LLC",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...,The Abu Dhabi National Oil Company (Arabic: شر...,The Abu Dhabi National Oil Company (Arabic: شر...,,Skip to main content\n\n[ ![Home](/themes/cust...


In [6]:
data.loc[data['URL'] == 'www.columbuslibrary.org']

Unnamed: 0,Company_NAME,SIC1,URL,WEB,wikipedia_v1,wikipedia_v2,wikipedia_v3,WEB_2
190,COLUMBUS METROPOLITAN LIBRARY,8231,www.columbuslibrary.org,"Tuesday, Dec. 5 | 7 p.m.\nTune in to watch Sta...",Downtown Columbus is the central business dist...,Downtown Columbus is the central business dist...,,


In [10]:
url = 'https://'
url = url + 'www.columbuslibrary.org'  #'www.carlyle.com', 'www.creelighting.com'
html = html_request(url)

200


In [11]:
metadata = obtain_metadata(html)
# Imprimir la metadata obtenida
for clave, valor in metadata.items():
    print(f'{clave}: {valor}')

viewport: width=device-width, initial-scale=1
robots: index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1
description: Columbus Metropolitan Library offers helpful services and resources to the Columbus, Ohio community. Browse our library, find information, and access our tools today!
og:locale: en_US
og:type: website
og:title: Columbus Metropolitan Library
og:description: Columbus Metropolitan Library offers helpful services and resources to the Columbus, Ohio community. Browse our library, find information, and access our tools today!
og:url: https://www.columbuslibrary.org/
og:site_name: Columbus Metropolitan Library
article:publisher: https://www.facebook.com/columbuslibrary/
article:modified_time: 2023-12-11T12:25:43+00:00
og:image: https://www.columbuslibrary.org/wp-content/uploads/2023/02/150_orange_nt-1024x734.png
twitter:card: summary_large_image
twitter:site: @columbuslibrary
generator: Elementor 3.18.2; features: e_dom_optimization, e_optimized_asset

Here in an example, we can see the info obtained inside the meta tag of the html. As we can see the description field, summarizes in a single line the objectives of the organization

### Verify the type of response from the metadata in 10 companies

In [None]:
urls = data['URL'].sample(n=10, random_state=1)

In [None]:
all_urls = data['URL']

In [None]:
url_descriptions_all = {}
url_descriptions_all = pd.DataFrame({'URL': all_urls})
url_base = 'http://'
for url in all_urls:
    print(url)
    url_completa = url_base + url
    html = html_request(url_completa)
    metadata = obtain_metadata(html)
    if type(metadata) != str:
        print(metadata.get('description'))
        url_descriptions_all.loc[url_descriptions_all['URL'] == url, 'metadata'] = metadata.get('description')
    else:
        url_descriptions_all.loc[url_descriptions_all['URL'] == url, 'metadata'] = metadata

us.brinks.com
200
When you choose Brink’s, you’ll help your business save time and money while optimizing your operations, protecting your funds, managing your cash.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global energy company that creates greener, smarter and innovative energy solutions. Together, we can accelerate the future of energy.
www.aes.com
200
AES is a global en

It looks good for a first approach, but a bit simple

In [None]:
url_descriptions_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URL       300 non-null    object
 1   metadata  267 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB


As we can see 33 organizations hadn't meta tag inside their html

In [None]:
metadata_csv = 'metadata_empresas.csv'

# Guardar el DataFrame en un archivo CSV
url_descriptions_all.to_csv(metadata_csv, index=False)

Now we want to see how many error have meen obtained during the request of the html because different reasons

In [None]:
errores_metadata = url_descriptions_all
errores_metadata['errores'] = url_descriptions_all['metadata'].apply(lambda x: str(x).lower().find('error'))

In [None]:
errores_metadata['errores'].value_counts(normalize=True)

errores
-1    0.766667
 8    0.233333
Name: proportion, dtype: float64

## INSIGHTS
- We obtain a correct summarize of the organization role with the meta tag in 76% of the cases.
- The company is the one that takes charge of writing the meta field. This implies that it can be something subjective

# OBTAIN DESCRIPTION BASED ON THE HTML TEXT AND A SUMMARIZER (CHATGPT)

In [12]:
html_extractor = extractors.ArticleExtractor()
html_extractor.get_content_from_url('https://www.carlyle.com/our-firm')

"Sorry, your browser doesn't support embedded videos\nWho We Are\nOur Firm\nCarlyle is one of the world’s largest and most diversified global investment firms, with $382 billion of assets under management across 3 business segments and 600 investment vehicles. Founded in 1987 in Washington, DC, our global team today is comprised of more than 2,200 professionals operating in 28 offices across 4 continents. Together, across our firm, our mission is to drive long-term value for our investors, companies, shareholders, people and communities.\nOur Business Segments\nGLOBAL PRIVATE EQUITY\nDirectory\nOur Approach\nSince our founding over 30 years ago, we have always looked to create lasting partnerships across all of our businesses. We work with our partners to find solutions that drive sustainable value and impact over the long term. We have built trust and credibility by leveraging our global scale, industry expertise and diverse insights to deliver better solutions and build better busine

This is a way of extracting only the text of the html with the boilerpy3 library

### We try implementing spyder with the scraping (extract new info from the links inside the web)

In [None]:
# Ejemplo: Extraer todos los enlaces de la página
links = html.find_all('a')

# Imprimir los enlaces encontrados
href_values = [link.get('href') for link in links]
href_array = np.array(href_values, dtype=object)
href_array

array(['/?lang=en', '/?lang=fr',
       '/document-library/?document_categories=spec-sheets',
       '/company/careers/', '/contact/', '/resources/general-form/',
       '/lighting-contact-form/', '/consumer-lighting-form/',
       '/where-to-buy/', '/where-to-buy/commercial/',
       '/where-to-buy/petroleum/',
       '/products/intelligent-lighting/connected-max-smart-products/#where-to-buy',
       'tel:800-236-6800', 'tel:800-473-1234', '/', '/products/',
       '/products/outdoor/', '/products/outdoor/area/',
       '/products/outdoor/canopy-and-soffit/',
       '/products/outdoor/street-and-roadway/',
       '/products/outdoor/decorative-street-and-roadway/',
       '/products/outdoor/parking-structure/',
       '/products/outdoor/bollards-and-pathway/',
       '/products/outdoor/flood/', '/products/outdoor/wall-mount/',
       '/products/outdoor/vapor-tight/',
       '/products/outdoor/accessories-outdoor/',
       '/products/outdoor/accessories-outdoor/?product_categories=poles

In [None]:
filtro = np.array(['www.hilton.com' in href if href is not None else False for href in href_array])

# Solo links relacionados con la pagina
href_filtrados = href_array[filtro]

In [None]:
len(href_filtrados)

42

In [None]:
href_filtrados

array(['https://www.hilton.com/en/',
       'https://www.hilton.com/en/locations/?cid=OH,WW,LocationsNav,MULTIPR,Header,Home,Brand',
       'https://www.hilton.com/en/offers/?cid=OH,WW,OffersNav,MULTIPR,Header,Home,Brand',
       'https://www.hilton.com/en/events/?cid=OH,WW,MeetingsEventsNav,MULTIPR,Header,Home,Brand',
       'https://www.hilton.com/en/hilton-honors/credit-cards/?cid=OH,WW,CobrandNav,MULTIPR,Header,Home,Brand',
       'https://www.hilton.com/en/hilton-honors/join/',
       'https://www.hilton.com/en/locations/?cid=OH,WW,DreamingLocations,MULTIPR,imageHeadliner,Home,Brand',
       'https://www.hilton.com/en/hotels/pdxbeqq-the-benson-portland/?cid=OH,MB,EntHPCarouselTagFriendsPDXBEQQ,MultiBR,Carousel,Home,SingleLink,i82345',
       'https://www.hilton.com/en/p/winter-travel/?cid=OH,WW,FriendsStayDec,MULTIPR,brandscarousel,Home,Brand',
       'https://www.hilton.com/en/beach/?cid=OH,WW,Beach,MULTIPR,gridthreesixnine2,Home,Brand',
       'https://www.hilton.com/en/p/pools/

In [13]:
# Ejemplo: Extraer todos los enlaces de los menus de la pagina
links = html.find_all('li')

# Imprimir los enlaces encontrados
for link in links:
    if link.get('class') == ['menu-item']:
        item = link.find_all('a')
        item_links = [a['href'] for a in item]
        print(item_links[0])

In [None]:
def obtain_menu_link(html):
    # Ejemplo: Extraer todos los enlaces de los menus de la pagina
    links = html.find_all('li')
    print(links)
    # Imprimir los enlaces encontrados
    for link in links:
        if link.get('class') == ['menu-item']: # nos quedamos con los de tipo menu
            item = link.find_all('a')
            item_links = [a['href'] for a in item]
    return item_links[0] 

## INSIGHTS:
- Bit difficult to implement, because there are too many references inside a web
- We could pick a few random links to get new information. This could also introduce some bias into the summarizer.
- Each html has it way of implementing the menu option of the header of the web. Not always does it with the menu item tag, sometimes is done with JS, which makes it more difficult to filter the search of important links

In [None]:
def get_html_text(html):
    for script in html(["script", "style"]):
        script.extract() 
        
    text = html.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text

Another way of extracting the text from the html

In [None]:
get_html_text(html)

"Columbus Metropolitan Library\nSkip to content\nFAQs\nLocations\nContact Us\nMy Account\nFAQs\nLocations\nContact Us\nMy Account\nBooks & eContent\nExplore\nBookseContent\nFind your next great read or binge-worthy show.\nWe have it all – books, eBooks, streaming movies, music, TV shows and more.\nEvents\nWhat’s Happening\nPrograms & EventsVirtual Events (Crowdcast)Sesquicentennial Author Series (special-events)Battelle Author SeriesCulture Pass\nSesquicentennial Author Series\nOur series brings engaging national authors to you, free and open to all.Choose the Sesquicentennial Author Series (special-events) link under What’s Happening to see the lineup.\nKids & Teens\nKids, Teens & Teachers\nReady for KindergartenSchool HelpTeensTeacher ResourcesDial-A-Story\nSchool Help\nK-12 Students: Get free after-school help with your schoolwork in our School Help Centers.Choose the School Help link under\xa0Kids, Teens & Teachers.\nLEARN MORE\nAdults\nPrograms & Resources\nJob & Career HelpAdult 

In [18]:
def get_html_text2(html):
    h = html2text.HTML2Text()
    h.ignore_links = True # False para los links
    texto = h.handle(html.prettify())
    solo_letras = re.sub(r'[^a-zA-Z\s]', '', texto)
    texto_sin_links = re.sub(r'\b(.*image.*|.*png.*|.*http.*)\b', '', solo_letras)
    return texto_sin_links

Best way of extracting text (also the refs could be extracted). It returns a better format than the other options

In [15]:
urls_extractor = data.sample(n=15, random_state=1).reset_index(drop=True)

In [16]:
urls_extractor.head()

Unnamed: 0,Company_NAME,SIC1,URL,WEB,wikipedia_v1,wikipedia_v2,wikipedia_v3,WEB_2
0,IDEAL INDUSTRIES LIGHTING LLC,3641,www.creelighting.com,Search Spec Sheets & Design Files\nLight the W...,,Philippe Starck (French pronunciation: [filip ...,,
1,"GOODWILL INDUSTRIES OF ORANGE COUNTY, CALIFORNIA",5932,www.ocgoodwill.org,,"Goodwill Industries International Inc., often ...","Goodwill Industries International Inc., often ...",,
2,KKR GROUP FINANCE CO. VI LLC,6282,www.kkr.com,"As of September 30, 2023\nExplore our shared s...",,Blackstone Inc. is an American alternative inv...,,
3,WINK TO WEBSTER PIPELINE LLC,1623,www.winktowebsterpipeline.com,,"Delek US Holdings, Inc. is a diversified downs...","Delek US Holdings, Inc. is a diversified downs...",,
4,"AUTOMATIC DATA PROCESSING INSURANCE AGENCY, INC.",7374,insurance.adp.com,Talk to Sales 1-855-237-5335 Start Quote\nFind...,UNIVAC (Universal Automatic Computer) was a li...,Big data primarily refers to data sets that ar...,,


In [20]:
url_base = 'https://'
html_text = np.empty(15, dtype=object)
for i, url in urls_extractor.iterrows():
    print(url['Company_NAME'])
    url_complete = url_base + url['URL']
    html = html_request(url_complete)
    if str(html).lower().find('error!:') != -1:
        print(html)
    else:
        html_text[i] = get_html_text2(html)

IDEAL INDUSTRIES LIGHTING LLC
200
GOODWILL INDUSTRIES OF ORANGE COUNTY, CALIFORNIA
Request_error!: 403 Client Error: Forbidden for url: https://www.ocgoodwill.org/
KKR GROUP FINANCE CO. VI LLC
200
WINK TO WEBSTER PIPELINE LLC
Request_error!: 403 Client Error: Forbidden for url: https://winktowebsterpipeline.com/
AUTOMATIC DATA PROCESSING INSURANCE AGENCY, INC.
200
OIL STATES INDUSTRIES US, INC.
200
AMERICAN RETIREMENT ASSOCIATION
200
BECTON DICKINSON LUXEMBOURG III LLC
200
CARLYLE ASIA REAL ESTATE II GP, L.P.
200
ROC NATION ADVERTISING LLC
200
RAH SACO-CH, LLC
200
NEXEO PLASTICS, LLC
200
ALLIED GROUP, LLC
200
INDY ASBURY CHEV LLC
200
MONGODB, INC.
200


In [21]:
html_text[5]

'\n\nNYSE  OIS    scriptsNASDAQarrowdownRedgif \n\n   Contact Us \n    \n      \n  \n  \n  \n\n   About \n     Company Overview \n     Management Team \n     Board of Directors \n     Operations Map \n   Operations \n     OffshoreManufactured Products \n     Well Site Services \n     Downhole Technologies \n     Location Listings \n   Investors \n     Stock Information \n     SEC Filings \n     News Releases \n     Events  Presentations \n     Annual Reports  Presentations \n     Proxy Materials \n     Reconciliation of NonGAAP Financial Measures \n     Analyst Coverage \n     Investor FAQ \n     Contact Us \n   News \n     Press Releases \n   Corporate Governance \n     Management Team \n     Board of Directors \n     Committee Composition \n     Sustainability \n     FAQ \n   Careers \n     Training and Development \n     Benefits \n     Company Culture and Workforce Diversity \n     Equal Opportunity Employer \n     Career Opportunities \n     Operations Map \n\nRenewablesBannerImag

Final example of the text that we obtain

# IMPLEMENTING THE SUMMARIZER WITH CHATGPT API

In [None]:
data.head()

Unnamed: 0,Company_NAME,SIC1,URL,WEB,wikipedia_v1,wikipedia_v2,wikipedia_v3,WEB_2
0,"BRINK'S UKRAINE, INC.",7381,us.brinks.com,see my product options\nThe way in which you h...,,Joseph Robinette Biden Jr. ( BY-dən; born Nov...,,# Navigation\n\n * Skip to Content \n\n## A...
1,"AES CALGARY, INC.",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...,TransAlta Corporation (formerly Calgary Power ...,TransAlta Corporation (formerly Calgary Power ...,,Skip to main content\n\n[ ![Home](/themes/cust...
2,AURORA AES HOLDINGS INC,4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...,This is a list of companies in the United Stat...,This is a list of companies in the United Stat...,,Skip to main content\n\n[ ![Home](/themes/cust...
3,"AES CENTRAL AMERICAN HOLDINGS, INC.",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...,Union Pacific Corporation is a publicly traded...,Union Pacific Corporation is a publicly traded...,,Skip to main content\n\n[ ![Home](/themes/cust...
4,"AES CARBON HOLDINGS, LLC",4911,www.aes.com,e n e r g y\nAccelerating the future of\nenerg...,The Abu Dhabi National Oil Company (Arabic: شر...,The Abu Dhabi National Oil Company (Arabic: شر...,,Skip to main content\n\n[ ![Home](/themes/cust...


In [None]:
def api_chatgpt(text: str, company_name: str):
    api_key = "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"
    client = OpenAI(api_key=api_key)

    message_content = f'''me haces un resumen de este html de la empresa {company_name}, 
                    en un maximo de 3 lineas que resuma a que se dedica dicha empresa por favor.
                      Respira profundamente y trabajo en este problema paso a paso:''' + text
    # Enviar solicitud de completación de chat
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0301",
        messages=[{"role": "user", "content": message_content}],
        stream=False,
    )

    choices = response.choices
    if choices:
        first_choice = choices[0]
        message = first_choice.message
        content = message.content
        return content
    else:
        return None

In [None]:
data_notnull = data.dropna(subset=['WEB'])

In [None]:
data_notnull = data_notnull[['Company_NAME', 'URL']]
data_notnull['Description'] = np.nan

In [None]:
data_15 = data_notnull[['Company_NAME', 'URL']]

# Seleccionar 15 filas aleatorias
filas_aleatorias = np.random.choice(data_15.index, 15, replace=False)
data_15 = data_15.loc[filas_aleatorias]
data_15['Description'] = np.nan

In [None]:
url_base = 'https://'
for i, fila in data_notnull.iterrows():
    if i > 72:
        print(fila['Company_NAME'])
        print(i)
        url = url_base + fila['URL']
        html = html_request(url)
        text = get_html_text2(html)
        if len(text) >= 4096:
            text = text[:4096]
        description = api_chatgpt(text, fila['Company_NAME'])
        data_notnull.at[i, 'Description'] = description
    
        # Esperar 1 minuto
        time.sleep(30)  # 60 segundos = 1 minuto

BECTON DICKINSON LUXEMBOURG III LLC
73
200
BW/IP NEW MEXICO, INC.
74
200
BET DOCUMENTARIES, LLC
75
200
BET LIVE FROM LA, LLC
76
200
BET OH DRAMA , LLC
77
200
ASSURANT NEW VENTURES, INCORPORATED
78
200
AMSCAN NM LAND, LLC
81
200
BET ST LLC
84
200
ARBOUR ELDER SERVICES, INC.
85
200
GREEN MOUNTAIN ENERGY SUN CLUB
86
200
CABOT US INVESTMENTS LLC
87
200
BIOGEN HOLDING I LLC
88
200
BEECH OVENS LLC
90
200
LIVE VENTURES INCORPORATED
91
200
BELDEN CDT INTERNATIONAL INC.
92
200
BLACK & DECKER INVESTMENTS (AUSTRALIA) LIMITED
97
200
BLACK & DECKER DE PANAMA LLC
98
200
ASSURANT SOLUTIONS HOLDING PUERTO RICO, INC.
99
200
CARESPOT OF ORLANDO/HSI URGENT CARE, LLC
100
200
BIOGEN THERAPEUTICS INC.
101
200
BIOGEN SRO INC.
102
200
BIOMET MANUFACTURING, LLC
103
200
BIOMET INTERNATIONAL ORTHOPEDICS, LLC
104
200
AUTOMATIC DATA PROCESSING INSURANCE AGENCY, INC.
106
200
ARCOLA SECURITIES, INC.
107
200
BOSTITCH-HOLDING, L.L.C.
108
200
CARLYLE CAVALIER GP, L.L.C.
110
200
GREEN PLAINS SHENANDOAH LLC FKA GPRE SHEN

In [None]:
csv_file = "/home/unai/datasets/All_Description_Unai.csv"
data_notnull.to_csv(csv_file, index=False, sep=",")

In [None]:
data_notnull.info()

<class 'pandas.core.frame.DataFrame'>
Index: 210 entries, 0 to 299
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company_NAME  210 non-null    object
 1   URL           210 non-null    object
 2   Description   210 non-null    object
dtypes: object(3)
memory usage: 14.7+ KB
