# Comunidad
## Emprendedores
### https://www.emprendedores.es/seccion/guia-juridica-fiscal/

In [1]:
# Módulos
import pandas as pd
import requests as r
from bs4 import BeautifulSoup as bs4
import csv

In [2]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one emprendedores.es article card.

    Each field is looked up independently; when its tag or attribute is
    missing, a placeholder string is returned for that field instead of
    raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; only its
        ``find`` method is used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text) or TypeError (subscript); a tag without the
    # requested attribute raises KeyError. Anything else (for example
    # KeyboardInterrupt) is allowed to propagate.
    missing = (AttributeError, TypeError, KeyError)

    comunidad = 'Emprendedores'
    try:
        title = new_meta.find('h2', {'class': 'full-item-title item-title entry-title'}).get_text()
    except missing:
        title = 'No title found'
    try:
        link = new_meta.find('a', {'rel': 'bookmark'})['href']
    except missing:
        link = 'No link found'
    try:
        description = new_meta.find('div', {'class': 'full-item-dek item-dek'}).get_text().strip('\n')
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [3]:
def scrape_news_page_meta(news_page_html, news_df):
    """
    Find every article on an emprendedores.es listing page and append one
    row per article to the DataFrame.

    Arguments:
    'news_page_html': HTTP response of the listing page; its ``text``
        attribute is parsed as HTML
    'news_df': DataFrame that holds the news collected from previous pages

    Outputs:
    'news_df': DataFrame with this page's news appended; the input is
        returned unchanged when the page has no matching article
    """
    page_soup = bs4(news_page_html.text, 'lxml')

    # Collect every row first and concatenate once: calling pd.concat per
    # article copies the whole accumulated frame on each iteration.
    rows = [scrape_new_card(card)
            for card in page_soup.find_all('article', {"class": "full-item"})]
    if not rows:
        return news_df

    page_df = pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])
    return pd.concat([news_df, page_df], ignore_index=True)

In [4]:
# Create the empty DataFrame that the scraping cells append to
df_columns = ['Comunidad', 'Titulos', 'Links', 'Descripciones']
news_df = pd.DataFrame(columns=df_columns)

# Scrape listing pages 1 to 5 of the section (five pages in total)
for page in range(1, 6):
    # timeout: without it a stalled connection would block the cell forever
    news_page_html = r.get(f'https://www.emprendedores.es/seccion/guia-juridica-fiscal/page/{page}', timeout=30)
    news_df = scrape_news_page_meta(news_page_html, news_df)

In [5]:
news_df.shape  # (rows, columns) currently held in the DataFrame

(50, 4)

### https://www.portafolio.co/negocios/emprendimiento

In [6]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one portafolio.co article block.

    Each field is looked up independently; when its tag or attribute is
    missing, a placeholder string is returned for that field instead of
    raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; only its
        ``find`` method is used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text) or TypeError (subscript); a tag without the
    # requested attribute raises KeyError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError)

    comunidad = 'Emprendedores'
    try:
        title = new_meta.find('h3', {'class': 'listing-title'}).get_text().strip('\n')
    except missing:
        title = 'No title found'
    try:
        # The article URL is read from the itemid attribute of the <meta> tag
        link = new_meta.find('meta', {'itemprop': 'mainEntityOfPage'})['itemid']
    except missing:
        link = 'No link found'
    try:
        description = new_meta.find('div', {'class': 'listing-epigraph'}).get_text().strip('\n')
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [7]:
def scrape_news_page_meta(news_page_html, news_df):
    """
    Find every NewsArticle block on a portafolio.co listing page and append
    one row per article to the DataFrame.

    Arguments:
    'news_page_html': HTTP response of the listing page; its ``text``
        attribute is parsed as HTML
    'news_df': DataFrame that holds the news collected so far

    Outputs:
    'news_df': DataFrame with this page's news appended; the input is
        returned unchanged when the page has no matching article
    """
    page_soup = bs4(news_page_html.text, 'lxml')

    # Collect every row first and concatenate once: calling pd.concat per
    # article copies the whole accumulated frame on each iteration.
    rows = [scrape_new_card(card)
            for card in page_soup.find_all('div', {"itemtype": "https://schema.org/NewsArticle"})]
    if not rows:
        return news_df

    page_df = pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])
    return pd.concat([news_df, page_df], ignore_index=True)

In [8]:
# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Scrape listing pages 1 to 5
for page in range(1, 6):
    # timeout: without it a stalled connection would block the cell forever
    news_page_html = r.get(f'https://www.portafolio.co/page/emprendimiento-1.html?page={page}&settings=%7B%22subcategoryFilter%22%3Afalse%2C%22viewSetting%22%3Afalse%7D', timeout=30)
    news_df = scrape_news_page_meta(news_page_html, news_df)

In [9]:
news_df.shape  # (rows, columns) accumulated so far

(95, 4)

### https://www.expansion.com/expansion-empleo/emprendedores.html

In [10]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one expansion.com cover card.

    Title and link are read from the card. The description is read from the
    linked article page (its standfirst paragraph), which costs one extra
    HTTP request per card. A field that cannot be obtained is replaced by a
    placeholder string instead of raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; only its
        ``find`` method is used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text) or TypeError (subscript); a tag without the
    # requested attribute raises KeyError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError)

    comunidad = 'Emprendedores'
    try:
        title = new_meta.find('h2', {'class': 'ue-c-cover-content__headline'}).get_text().strip('\n')
    except missing:
        title = 'No title found'
    try:
        link = new_meta.find('a', {'class': 'ue-c-cover-content__link'})['href']
    except missing:
        link = 'No link found'
    try:
        # timeout: without it a stalled connection would block forever.
        # An unusable URL (such as the placeholder link above) makes requests
        # raise a RequestException, which is handled like a failed download.
        description_html = r.get(link, timeout=30)
        description_soup = bs4(description_html.text, 'lxml')
        description = description_soup.find('p', {'class': 'ue-c-article__standfirst'}).get_text().strip('\n')
    except (r.RequestException, AttributeError):
        description = 'Description not found'
    return comunidad, title, link, description

In [11]:
# timeout: without it a stalled connection would block the cell forever
news_page_html = r.get('https://www.expansion.com/expansion-empleo/emprendedores.html', timeout=30)
page_soup = bs4(news_page_html.text, 'lxml')

# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Collect every row first and concatenate once instead of once per article.
rows = [scrape_new_card(card)
        for card in page_soup.find_all('div', {"class": "ue-l-cover-grid__unit ue-l-cover-grid__unit--no-grow"})]
if rows:
    news_df = pd.concat(
        [news_df, pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])],
        ignore_index=True,
    )

In [12]:
news_df.shape  # (rows, columns) accumulated so far

(130, 4)

### https://www.eleconomista.es/pymes/

In [13]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one eleconomista.es article block.

    Each field is looked up independently; when its tag or attribute is
    missing, a placeholder string is returned for that field instead of
    raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; only its
        ``find`` method is used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text) or TypeError (subscript); a tag without the
    # requested attribute raises KeyError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError)

    comunidad = 'Emprendedores'
    try:
        title = new_meta.find('div', {'class': 'articleHeadline'}).get_text().strip('\n')
    except missing:
        title = 'No title found'
    try:
        link = new_meta.find('a', {'itemprop': 'mainEntityOfPage'})['href']
    except missing:
        link = 'No link found'
    try:
        description = new_meta.find('p', {'class': 'articleText'}).get_text().strip('\n')
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [14]:
# timeout: without it a stalled connection would block the cell forever
news_page_html = r.get('https://www.eleconomista.es/pymes/', timeout=30)
page_soup = bs4(news_page_html.text, 'lxml')

# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Collect every row first and concatenate once instead of once per article.
rows = [scrape_new_card(card)
        for card in page_soup.find_all('div', {"itemtype": "http://schema.org/NewsArticle"})]
if rows:
    news_df = pd.concat(
        [news_df, pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])],
        ignore_index=True,
    )

In [15]:
news_df.shape  # (rows, columns) accumulated so far

(151, 4)

# Comunidad
## Empresa y discapacidad
### https://www.larepublica.co/responsabilidad-social

In [16]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one larepublica.co news row.

    Each field is looked up independently; when its tag or attribute is
    missing, a placeholder string is returned for that field instead of
    raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; its
        ``select`` and ``find`` methods are used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text) or TypeError (subscript); a tag without the
    # requested attribute raises KeyError and a too-short result list raises
    # IndexError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError, IndexError)

    comunidad = 'Empresa y discapacidad'
    try:
        # The title is taken from the third element returned by select().
        # NOTE(review): select() takes a CSS selector; its second positional
        # argument is not an attribute filter, so the class dict presumably
        # does not restrict the match. Confirm against the page whether
        # find_all('a', {...}) was intended before changing it.
        title = new_meta.select('a', {'class': 'responsabilidad-socialSect'})[2].get_text()
    except missing:
        title = 'No title found'
    try:
        link = new_meta.find('a', {'class': 'kicker responsabilidad-socialSect'})['href']
    except missing:
        link = 'No link found'
    try:
        description = new_meta.find('p').get_text().strip('\n')
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [17]:
# timeout: without it a stalled connection would block the cell forever
news_page_html = r.get('https://www.larepublica.co/responsabilidad-social', timeout=30)
page_soup = bs4(news_page_html.text, 'lxml')

# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Collect every row first and concatenate once instead of once per article.
rows = [scrape_new_card(card)
        for card in page_soup.find_all('div', {"class": "row news H_img_V_Title_Lead m-0"})]
if rows:
    news_df = pd.concat(
        [news_df, pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])],
        ignore_index=True,
    )

In [18]:
news_df.shape  # (rows, columns) accumulated so far

(173, 4)

### https://blogs.iadb.org/ideas-que-cuentan/es/

In [19]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one blogs.iadb.org article.

    Title and link both come from the same title anchor; the description is
    the first paragraph inside the entry-content block. A field that cannot
    be found is replaced by a placeholder string instead of raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; only its
        ``find`` method is used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text/.find) or TypeError (subscript); a tag without
    # the requested attribute raises KeyError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError)

    comunidad = 'Empresa y discapacidad'
    # Looked up once: title and link are read from the same anchor
    anchor = new_meta.find('a', {'class': 'entry-title-link'})
    try:
        title = anchor.get_text()
    except missing:
        title = 'No title found'
    try:
        link = anchor['href']
    except missing:
        link = 'No link found'
    try:
        window_descrip = new_meta.find('div', {'class': 'entry-content'})
        description = window_descrip.find('p').get_text()
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [20]:
def scrape_news_page_meta(news_page_html, news_df):
    """
    Find every <article> on a blogs.iadb.org listing page and append one row
    per article to the DataFrame.

    Arguments:
    'news_page_html': HTTP response of the listing page; its ``text``
        attribute is parsed as HTML
    'news_df': DataFrame that holds the news collected so far

    Outputs:
    'news_df': DataFrame with this page's news appended; the input is
        returned unchanged when the page has no <article> element
    """
    page_soup = bs4(news_page_html.text, 'lxml')

    # Collect every row first and concatenate once: calling pd.concat per
    # article copies the whole accumulated frame on each iteration.
    rows = [scrape_new_card(card) for card in page_soup.find_all('article')]
    if not rows:
        return news_df

    page_df = pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])
    return pd.concat([news_df, page_df], ignore_index=True)

In [21]:
# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Scrape listing pages 1 to 10
for page in range(1, 11):
    # timeout: without it a stalled connection would block the cell forever
    news_page_html = r.get(f'https://blogs.iadb.org/ideas-que-cuentan/es/page/{page}', timeout=30)
    news_df = scrape_news_page_meta(news_page_html, news_df)

In [22]:
news_df.shape  # (rows, columns) accumulated so far

(223, 4)

### https://www.weforum.org/topics/education/

In [23]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one weforum.org topic card.

    Title and link come from the anchor inside the card's <h5>; the
    description is the second <p> of the card. A field that cannot be found
    is replaced by a placeholder string instead of raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; its
        ``find`` and ``select`` methods are used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text/.find) or TypeError (subscript); a tag without
    # the requested attribute raises KeyError and a too-short result list
    # raises IndexError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError, IndexError)

    comunidad = 'Empresa y discapacidad'
    # May be None when the card has no <h5>; the lookups below then fall
    # back to their placeholders.
    box = new_meta.find('h5')
    try:
        title = box.find('a').get_text()
    except missing:
        title = 'No title found'
    try:
        link = box.find('a')['href']
    except missing:
        link = 'No link found'
    try:
        description = new_meta.select('p')[1].get_text().strip('\n')
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [24]:
# timeout: without it a stalled connection would block the cell forever
news_page_html = r.get('https://www.weforum.org/topics/education/', timeout=30)
page_soup = bs4(news_page_html.text, 'lxml')

# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Collect every row first and concatenate once instead of once per article.
rows = [scrape_new_card(card)
        for card in page_soup.find_all('div', {'class': 'wef-1bygggf'})]
if rows:
    news_df = pd.concat(
        [news_df, pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])],
        ignore_index=True,
    )

In [25]:
news_df.shape  # (rows, columns) accumulated so far

(228, 4)

# Comunidad
## Mipymes
### https://www.bbva.com/es/especiales/bbva-compartiendo-conocimiento/

In [26]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one bbva.com article.

    Title and link are both attributes of the same podcast anchor
    (``data-podcastname`` and ``href``). A field that cannot be found is
    replaced by a placeholder string instead of raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; only its
        ``find`` method is used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text) or TypeError (subscript); a tag without the
    # requested attribute raises KeyError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError)

    comunidad = 'Mipymes'
    # Looked up once: title and link are read from the same anchor
    podcast = new_meta.find('a', {'class': 'datalayer_podcast'})
    try:
        title = podcast['data-podcastname']
    except missing:
        title = 'No title found'
    try:
        link = podcast['href']
    except missing:
        link = 'No link found'
    try:
        description = new_meta.find('p', {'class': 'notTexto'}).get_text().strip('\n')
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [27]:
# timeout: without it a stalled connection would block the cell forever
news_page_html = r.get('https://www.bbva.com/es/especiales/bbva-compartiendo-conocimiento/', timeout=30)
page_soup = bs4(news_page_html.text, 'lxml')

# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Collect every row first and concatenate once instead of once per article.
rows = [scrape_new_card(card) for card in page_soup.find_all('article')]
if rows:
    news_df = pd.concat(
        [news_df, pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])],
        ignore_index=True,
    )

In [28]:
news_df.shape  # (rows, columns) accumulated so far

(259, 4)

### https://recruitingdaily.com/articles/

In [29]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one recruitingdaily.com article card.

    The link is the ``href`` of the first anchor in the card and the
    description is its first paragraph. A field that cannot be found is
    replaced by a placeholder string instead of raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; only its
        ``find`` method is used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text) or TypeError (subscript); a tag without the
    # requested attribute raises KeyError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError)

    comunidad = 'Mipymes'
    try:
        title = new_meta.find('h4', {'class': 'card-title'}).get_text()
    except missing:
        title = 'No title found'
    try:
        link = new_meta.find('a')['href']
    except missing:
        link = 'No link found'
    try:
        description = new_meta.find('p').get_text().strip('\n')
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [30]:
def scrape_news_page_meta(news_page_html, news_df):
    """
    Find every wide card on a recruitingdaily.com listing page and append one
    row per card to the DataFrame.

    Arguments:
    'news_page_html': HTTP response of the listing page; its ``text``
        attribute is parsed as HTML
    'news_df': DataFrame that holds the news collected so far

    Outputs:
    'news_df': DataFrame with this page's news appended; the input is
        returned unchanged when the page has no matching card
    """
    page_soup = bs4(news_page_html.text, 'lxml')

    # Collect every row first and concatenate once: calling pd.concat per
    # article copies the whole accumulated frame on each iteration.
    rows = [scrape_new_card(card)
            for card in page_soup.find_all('div', {'class': 'card card-wide'})]
    if not rows:
        return news_df

    page_df = pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])
    return pd.concat([news_df, page_df], ignore_index=True)

In [31]:
# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Scrape listing pages 1 to 5
for page in range(1, 6):
    # timeout: without it a stalled connection would block the cell forever
    news_page_html = r.get(f'https://recruitingdaily.com/articles/page/{page}', timeout=30)
    news_df = scrape_news_page_meta(news_page_html, news_df)

In [32]:
news_df.shape  # (rows, columns) accumulated so far

(334, 4)

### https://www.portafolio.co/innovacion

In [33]:
def scrape_new_card(new_meta):
    """
    Extract the DataFrame fields from one portafolio.co article block,
    labelled with the 'Mipymes' community.

    Each field is looked up independently; when its tag or attribute is
    missing, a placeholder string is returned for that field instead of
    raising.

    Arguments:
    'new_meta': Section of the page that contains one news item; only its
        ``find`` method is used

    Outputs:
    'comunidad, title, link, description': Tuple of four strings
    """
    # A lookup that matches nothing yields None, so a missing tag surfaces as
    # AttributeError (.get_text) or TypeError (subscript); a tag without the
    # requested attribute raises KeyError. Anything else propagates.
    missing = (AttributeError, TypeError, KeyError)

    comunidad = 'Mipymes'
    try:
        title = new_meta.find('h3', {'class': 'listing-title'}).get_text().strip('\n')
    except missing:
        title = 'No title found'
    try:
        # The article URL is read from the itemid attribute of the <meta> tag
        link = new_meta.find('meta', {'itemprop': 'mainEntityOfPage'})['itemid']
    except missing:
        link = 'No link found'
    try:
        description = new_meta.find('div', {'class': 'listing-epigraph'}).get_text().strip('\n')
    except missing:
        description = 'Description not found'
    return comunidad, title, link, description

In [34]:
def scrape_news_page_meta(news_page_html, news_df):
    """
    Find every NewsArticle block on a portafolio.co listing page and append
    one row per article to the DataFrame.

    Arguments:
    'news_page_html': HTTP response of the listing page; its ``text``
        attribute is parsed as HTML
    'news_df': DataFrame that holds the news collected so far

    Outputs:
    'news_df': DataFrame with this page's news appended; the input is
        returned unchanged when the page has no matching article
    """
    page_soup = bs4(news_page_html.text, 'lxml')

    # Collect every row first and concatenate once: calling pd.concat per
    # article copies the whole accumulated frame on each iteration.
    rows = [scrape_new_card(card)
            for card in page_soup.find_all('div', {"itemtype": "https://schema.org/NewsArticle"})]
    if not rows:
        return news_df

    page_df = pd.DataFrame(rows, columns=['Comunidad', 'Titulos', 'Links', 'Descripciones'])
    return pd.concat([news_df, page_df], ignore_index=True)

In [35]:
# news_df is deliberately not re-created here, so the rows from this source
# are added to the ones already collected.

# Scrape listing pages 1 to 5
for page in range(1, 6):
    # timeout: without it a stalled connection would block the cell forever
    news_page_html = r.get(f'https://www.portafolio.co/page/innovacion-1.html?page={page}', timeout=30)
    news_df = scrape_news_page_meta(news_page_html, news_df)

In [36]:
news_df.shape  # (rows, columns) accumulated so far

(379, 4)

# Dataframe completo

In [37]:
news_df.shape  # (rows, columns) of the complete DataFrame

(379, 4)

In [38]:
news_df.tail()  # Preview the last rows of the complete DataFrame

Unnamed: 0,Comunidad,Titulos,Links,Descripciones
374,Mipymes,Emprendimiento colombiano de cannabis lanza su...,https://www.portafolio.co/innovacion/greenlab-...,"Se trata de Greenlab, que pondrá a la venta 5...."
375,Mipymes,¿Está América Latina preparada para el interne...,https://www.portafolio.co/innovacion/esta-amer...,La penetración del IoT requiere de infraestruc...
376,Mipymes,Sector 'proptech' en Colombia captó más de US$...,https://www.portafolio.co/innovacion/sector-pr...,Durante el 2022 se espera aumentar de manera r...
377,Mipymes,Transferencias inmediatas por Transfiya aument...,https://www.portafolio.co/innovacion/transfere...,El servicio desarrollado por ACH Colombia tien...
378,Mipymes,WhatsApp planea mejorar la función ‘Eliminar m...,https://www.portafolio.co/innovacion/whatsapp-...,La aplicación de mensajería instantánea está t...


In [48]:
news_df.to_excel("News_whatsapp.xlsx", index=False)  # Write the complete DataFrame to an Excel file, without the index
news_df.to_pickle("./News_whatsapp.pkl")  # Write the complete DataFrame to a pickle file

### https://www.hrexchangenetwork.com/

In [40]:
# Disabled scraper for hrexchangenetwork.com: the definition below sits inside
# a bare string literal, so running the cell defines nothing.
'''
def scrape_new_card(new_meta):
    comunidad = 'Mipymes'
    try:
        title = new_meta.find('h3', {'class': 'article-title'}).get_text().strip('\n')
    except:
        title = 'No title found'
    try:
        link = new_meta.find('a')['href']
    except:
        link = 'No link found'
    try:
        description = new_meta.find('div', {'class':'entry-content mt-3'}).get_text().strip('\n')
    except:
        description = 'Description not found'
    return comunidad, title, link, description
'''

"\ndef scrape_new_card(new_meta):\n    comunidad = 'Mipymes'\n    try:\n        title = new_meta.find('h3', {'class': 'article-title'}).get_text().strip('\n')\n    except:\n        title = 'No title found'\n    try:\n        link = new_meta.find('a')['href']\n    except:\n        link = 'No link found'\n    try:\n        description = new_meta.find('div', {'class':'entry-content mt-3'}).get_text().strip('\n')\n    except:\n        description = 'Description not found'\n    return comunidad, title, link, description\n"

In [41]:
# Disabled scraping cell for hrexchangenetwork.com: the code below sits inside
# a bare string literal, so running the cell executes none of it.
# NOTE: if re-enabled as written it re-creates news_df as an empty DataFrame,
# discarding the rows collected from the other sources.
'''
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}
news_page_html = r.get('https://www.hrexchangenetwork.com/', headers=headers)
page_soup = bs4(news_page_html.text, 'lxml')

df_columns = ['Comunidad', 'Titulos', 'Links', 'Descripciones']
news_df = pd.DataFrame(columns = df_columns)


for new in page_soup.find_all('article',{"class":"border-bottom border-hr-secondary"}):
    comunidad, title, link, description = scrape_new_card(new)    
    news_dict = {'Comunidad': comunidad,
                'Titulos': [title],
                'Links': [link],
                'Descripciones': [description]}
    n_df = pd.DataFrame.from_dict(news_dict)
    news_df= pd.concat([news_df, n_df], ignore_index=True)
'''

'\nheaders = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}\nnews_page_html = r.get(\'https://www.hrexchangenetwork.com/\', headers=headers)\npage_soup = bs4(news_page_html.text, \'lxml\')\n\ndf_columns = [\'Comunidad\', \'Titulos\', \'Links\', \'Descripciones\']\nnews_df = pd.DataFrame(columns = df_columns)\n\n\nfor new in page_soup.find_all(\'article\',{"class":"border-bottom border-hr-secondary"}):\n    comunidad, title, link, description = scrape_new_card(new)    \n    news_dict = {\'Comunidad\': comunidad,\n                \'Titulos\': [title],\n                \'Links\': [link],\n                \'Descripciones\': [description]}\n    n_df = pd.DataFrame.from_dict(news_dict)\n    news_df= pd.concat([news_df, n_df], ignore_index=True)\n'

In [42]:
# Disabled debugging aid: the loop below sits inside a bare string literal, so
# running the cell does not execute it. If enabled it would print every
# element returned by page_soup.find_all().
'''
for new in page_soup.find_all():
    print(new)
'''

'\nfor new in page_soup.find_all():\n    print(new)\n'