In [None]:
import requests

In [None]:
from bs4 import BeautifulSoup


In [None]:
def get_menu_links(soup):
    '''Collect the link targets of the EFECTOCOCUYO.COM main menu.

    Args:
        soup: Parsed page (BeautifulSoup-like object) to search.

    Returns:
        list: The ``href`` of every ``<a>`` found inside the ``<div>`` whose
        class attribute is ``"list mtopmob"``, in the order ``find_all``
        yields them. Anchors with a missing or empty ``href`` are skipped,
        and an empty list is returned when the container is not present.
    '''
    menu = soup.find("div", attrs={"class":"list mtopmob"})
    if menu is None:
        # find() yields None when nothing matches; report "no links"
        # instead of failing with AttributeError on the next call.
        return []

    hrefs = (anchor.get("href") for anchor in menu.find_all("a"))
    # An anchor without href would put None in the list, which is not a
    # fetchable URL.
    return [href for href in hrefs if href]


In [None]:
def get_feature_links(soup):
    '''Collect the article links shown in a page's featured-news carousel.

    Args:
        soup: Parsed page (BeautifulSoup-like object) to search.

    Returns:
        list: ``href`` values of the carousel anchors matched by the CSS
        selector below. Anchors with a missing or empty ``href`` are
        skipped, and an empty list is returned when the page has no
        ``div`` with class ``carousel-inner``.
    '''
    carousel = soup.find("div", attrs={"class":"carousel-inner"})
    if carousel is None:
        # Not every page is guaranteed to have a carousel; find() returns
        # None in that case and .select() on it would raise.
        return []

    anchors = carousel.select("#carouselExampleControls > div > div.carousel-item > div.contentImage.bloque-destacado > div > a")
    hrefs = (anchor.get("href") for anchor in anchors)
    return [href for href in hrefs if href]

def get_big_news(soup):
    '''Return the link of the page's single "big news" item.

    Args:
        soup: Parsed page (BeautifulSoup-like object) to search.

    Returns:
        The ``href`` of the first ``<a>`` inside the ``<div>`` whose class
        attribute is ``"contenttext text-center"``, or ``None`` when the
        page has no such block or the block contains no anchor.
    '''
    block = soup.find("div", attrs={"class":"contenttext text-center"})
    # Both lookups yield None when nothing matches, so check each step
    # instead of letting attribute access on None raise.
    anchor = block.a if block is not None else None
    if anchor is None:
        return None

    return anchor.get("href")

def get_small_news(soup):
    '''Collect the links of the smaller news boxes listed on a page.

    Args:
        soup: Parsed page (BeautifulSoup-like object) to search.

    Returns:
        list: ``href`` values of the anchors matched by the CSS selector
        below, without missing or empty entries. An empty list is returned
        when the ``div`` with class ``col col-sm-9`` is absent or when
        extraction fails; failures are reported with ``print``.
    '''
    try:
        column = soup.find("div", attrs={"class":"col col-sm-9"})
        if column is None:
            return []

        anchors = column.select("body > section.home.d-none.d-sm-block.d-md-block.d-lg-block.d-xl-block > div.container > div.row > div.col.col-sm-9 > div > div > div > div.contentbox.text-center > a:nth-child(2)")
        hrefs = (anchor.get("href") for anchor in anchors)
        return [href for href in hrefs if href]
    except Exception as e:
        # Best effort: report the problem and return no links. The exception
        # object must not end up in the result, because every entry of the
        # returned list is later fetched as a URL.
        print(f"Error: {e}")
        return []

# Try the three extractors on one section page.
# `cocuyo_politics_soup` is built here so this cell runs on a fresh kernel;
# no other visible cell defines it.
# NOTE(review): the section URL is inferred from the "/politica/" article URL
# used further down in this notebook — confirm it is the intended page.
cocuyo_politics_response = requests.get("https://efectococuyo.com/politica/")
cocuyo_politics_soup = BeautifulSoup(cocuyo_politics_response.text, "html.parser")

print(get_feature_links(cocuyo_politics_soup), end="\n\n")
print(get_big_news(cocuyo_politics_soup), end="\n\n")
print(get_small_news(cocuyo_politics_soup))

In [None]:
# Sample article URL used to try out get_news_info in this cell.
url_news = "https://efectococuyo.com/politica/carmen-b-fernandez-urge-punto-de-encuentro-entre-propuestas-de-capriles-y-guaido-conlaluz/"

def get_news_info(url_news, timeout=30):
    '''Download one EFECTOCOCUYO.COM article and extract title, date and body.

    Args:
        url_news: URL of the article page.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled connection blocks the caller
            indefinitely.

    Returns:
        dict: ``{"title": ..., "date": ..., "body": ...}`` with the text of
        the matching elements. An empty dict is returned when the request
        fails, the status code is not 200, or any of the three elements is
        missing, so a record is either complete or absent. Failures are
        reported with ``print`` and never raised.
    '''
    try:
        news = requests.get(url_news, timeout=timeout)
        if news.status_code != 200:
            print(f"Error: {url_news} not found")
            return {}

        soup_news = BeautifulSoup(news.text, "html.parser")
        title = soup_news.find("h1", attrs={"class":"fontbree"})

        date = soup_news.find("h3")
        if date is not None:
            # The first <h3> also holds <span> children (category and time,
            # per the original author's note); detach them so only the date
            # text is left.
            for element in date.find_all("span"):
                element.extract()

        body_news = soup_news.find("div", attrs={"class":"col-8 col-sm-7 col-lg-9"})

        found = (("title", title), ("date", date), ("body", body_news))
        missing = [name for name, node in found if node is None]
        if missing:
            # Name the missing pieces instead of surfacing an opaque
            # "'NoneType' object has no attribute 'text'".
            print(f"Error: {url_news} is missing {', '.join(missing)}")
            return {}

        return {
            "title": title.text,
            "date": date.text,
            "body": body_news.text,
        }
    except Exception as e:
        # Best effort by design: one bad article must not stop a crawl.
        print(f"Error: {e}")
        return {}

def pretty(d, indent=0):
    '''Print a dict as a tab-indented outline.

    Every key goes on its own line, prefixed by ``indent`` tabs. A value that
    is itself a dict is printed recursively one level deeper; any other value
    is printed on the next line with one additional tab.
    '''
    for name, content in d.items():
        print('\t' * indent + str(name))
        if not isinstance(content, dict):
            print('\t' * (indent + 1) + str(content))
            continue
        pretty(content, indent + 1)

# Fetch the sample article and print each extracted field under its key.
pretty(get_news_info(url_news))



In [None]:
# Download and parse the home page; scrape_cocuyo reads its main menu to find
# the section pages to crawl.
url = "https://efectococuyo.com/"
try:
    cocuyo = requests.get(url)
    cocuyo_soup = BeautifulSoup(cocuyo.text, "html.parser")
except Exception as e:
    print(f"----- ERROR getting the main soup {url}: {e} ---------")
    # Stop here: swallowing the error would leave `cocuyo_soup` undefined (or
    # stale from an earlier run of this cell), and the failure would only
    # resurface later, far from its cause.
    raise

def scrape_cocuyo(cocuyo_soup):
    '''Scrape the articles linked from every EFECTOCOCUYO.COM menu page.

    The menu links found in ``cocuyo_soup`` are fetched one by one. From each
    page that answers with status 200, the featured, big and small news links
    are collected. Every collected article is then downloaded with
    ``get_news_info``.

    Args:
        cocuyo_soup: Parsed home page (BeautifulSoup-like object).

    Returns:
        list: One dict per article that could be extracted, in the order the
        links were found. Articles for which ``get_news_info`` returns an
        empty dict are left out.
    '''
    extractors = (get_feature_links, get_big_news, get_small_news)
    notes = []

    for menu_link in get_menu_links(cocuyo_soup):
        try:
            link_response = requests.get(menu_link)
        except Exception as e:
            print(f"----> Error getting response {menu_link}: {e}")
            # One unreachable menu entry must not discard the remaining ones.
            continue

        if link_response.status_code != 200:
            continue

        soup = BeautifulSoup(link_response.text, "html.parser")
        # Run each extractor on its own so a page lacking one kind of block
        # still contributes the links from the others.
        for extractor in extractors:
            try:
                found = extractor(soup)
            except Exception as e:
                print(f"----> Error extracting links from {menu_link}: {e}")
                continue

            # get_big_news yields a single link, the other two yield lists.
            if not isinstance(found, (list, tuple)):
                found = [found]
            # Keep real URLs only: anything else (None, error objects) cannot
            # be fetched.
            notes.extend(item for item in found if isinstance(item, str))

    all_data = []
    for i, url in enumerate(notes, 1):
        print(f"Scrapping note{i} from {len(notes)}")
        info = get_news_info(url)
        # An empty dict means extraction failed; it would only add a blank
        # row to the resulting table.
        if info:
            all_data.append(info)

    return all_data


In [None]:
# Run the full crawl: one HTTP request per menu page plus one per article link.
data = scrape_cocuyo(cocuyo_soup)


In [None]:
# Number of records returned by scrape_cocuyo.
len(data)

In [None]:
import pandas as pd

In [None]:
# Build a table from the list of article dicts; columns come from the dict keys.
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the scraped table.
df.head()

In [None]:
# Save the scraped notes as CSV, using a path relative to the working directory.
# NOTE(review): the date is hardcoded in the file name, and to_csv writes the
# DataFrame index as an extra first column by default — confirm both are intended.
df.to_csv("efecto_cocuyo_notes_20200915.csv")