# We are going to see how to scrape content from Wikipedia. Specifically, we are going to obtain the list of the most relevant festivities in Spain, the links where we can find more information about them, and a brief description of each one.

In [31]:
#  Load packages
import requests
from bs4 import BeautifulSoup

# Define the page to scrape
url_base = "https://en.wikipedia.org/wiki/Tourism_in_Spain"

# Request the page; the timeout keeps the cell from hanging on a stalled connection
request_url = requests.get(url_base, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page
request_url.raise_for_status()

# Get HTML (raw bytes of the response body)
html = request_url.content

# Convert HTML to BeautifulSoup object.
soup = BeautifulSoup(html, "html.parser")

# Export HTML to a file (prettify('utf-8') yields bytes, hence binary mode)
with open('response_wiki.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [32]:
# Collect every <p> element once, then pull the anchors out of the
# 36th- and 35th-from-last paragraphs
paragraphs = soup.find_all('p')
links1 = paragraphs[-36].find_all('a')
links2 = paragraphs[-35].find_all('a')

### We now have the links from two paragraphs, but they also include the names of Spanish towns and regions. We only need the festival names, so we are going to delete the links that point to places in Spain.

In [33]:
# Positions, within the untouched paragraph, of the links that point to places
# rather than festivals: Almonte, Huelva, Pamplona, Buñol, Valencia, Zaragoza
place_positions1 = {2, 3, 5, 8, 9, 11}

# Re-read the paragraph's links from `soup` rather than deleting from `links1`
# in place, so that re-running this cell always gives the same result.
paragraph_links1 = soup.find_all('p')[-36].find_all('a')
assert len(paragraph_links1) > max(place_positions1), "paragraph has fewer links than expected"

# Keep every link that is not at a place position
links1 = [link for pos, link in enumerate(paragraph_links1) if pos not in place_positions1]
links1

[<a href="/wiki/Seville_Fair" title="Seville Fair">Seville Fair</a>,
 <a href="/wiki/Romer%C3%ADa_de_El_Roc%C3%ADo" title="Romería de El Rocío">Romería de El Rocío</a>,
 <a class="mw-redirect" href="/wiki/Running_of_the_Bulls" title="Running of the Bulls">Running of the Bulls</a>,
 <a class="mw-redirect" href="/wiki/Fallas" title="Fallas">Fallas</a>,
 <a class="mw-redirect" href="/wiki/Tomatina" title="Tomatina">Tomatina</a>,
 <a href="/wiki/Fiestas_del_Pilar" title="Fiestas del Pilar">Fiestas del Pilar</a>]

In [34]:
# Positions, within the untouched paragraph, of the links that point to places
# rather than festivals: Canary Islands, Cádiz, Valladolid
place_positions2 = {1, 3, 7}

# Re-read the paragraph's links from `soup` rather than deleting from `links2`
# in place, so that re-running this cell always gives the same result.
paragraph_links2 = soup.find_all('p')[-35].find_all('a')
assert len(paragraph_links2) > max(place_positions2), "paragraph has fewer links than expected"

# Keep every link that is not at a place position
links2 = [link for pos, link in enumerate(paragraph_links2) if pos not in place_positions2]
links2

[<a href="/wiki/Carnival" title="Carnival">Carnival</a>,
 <a href="/wiki/Carnival_of_Santa_Cruz_de_Tenerife" title="Carnival of Santa Cruz de Tenerife">Carnival of Santa Cruz de Tenerife</a>,
 <a href="/wiki/San_Sebasti%C3%A1n_International_Film_Festival" title="San Sebastián International Film Festival">San Sebastián International Film Festival</a>,
 <a class="mw-redirect" href="/wiki/M%C3%A1laga_Spanish_Film_Festival" title="Málaga Spanish Film Festival">Málaga Spanish Film Festival</a>,
 <a class="mw-redirect" href="/wiki/Seminci" title="Seminci">Seminci</a>,
 <a href="/wiki/Sitges_Film_Festival" title="Sitges Film Festival">Sitges Film Festival</a>,
 <a href="/wiki/S%C3%B3nar" title="Sónar">Sónar</a>,
 <a href="/wiki/Festival_Internacional_de_Benic%C3%A0ssim" title="Festival Internacional de Benicàssim">FIB</a>,
 <a href="/wiki/Festimad" title="Festimad">Festimad</a>,
 <a href="/wiki/Primavera_Sound" title="Primavera Sound">Primavera Sound</a>,
 <a href="/wiki/Bilbao_BBK_Live" titl

In [44]:
from urllib.parse import urljoin

def _attribute_of_each(anchors, attribute):
    """List the value of `attribute` for every anchor, in order."""
    return list(map(lambda anchor: anchor.get(attribute), anchors))


def _to_absolute(relative_urls):
    """Resolve every URL against `url_base`."""
    return [urljoin(url_base, relative) for relative in relative_urls]


def _only_wikipedia(urls):
    """Keep the URLs that contain 'wikipedia.org'."""
    return list(filter(lambda address: 'wikipedia.org' in address, urls))


# Holiday names: the 'title' attribute of each 'a' element
n_links1 = _attribute_of_each(links1, 'title')
n_links2 = _attribute_of_each(links2, 'title')

# Relative URLs: the 'href' attribute of each 'a' element
url_rel_links1 = _attribute_of_each(links1, 'href')
url_rel_links2 = _attribute_of_each(links2, 'href')

# Relative -> absolute
url_complet_links1 = _to_absolute(url_rel_links1)
url_complet_links2 = _to_absolute(url_rel_links2)

# Internal (Wikipedia) URLs only, first paragraph followed by second
links_int1 = _only_wikipedia(url_complet_links1)
links_int2 = _only_wikipedia(url_complet_links2)
links_int = links_int1 + links_int2
links_int

['https://en.wikipedia.org/wiki/Seville_Fair',
 'https://en.wikipedia.org/wiki/Romer%C3%ADa_de_El_Roc%C3%ADo',
 'https://en.wikipedia.org/wiki/Running_of_the_Bulls',
 'https://en.wikipedia.org/wiki/Fallas',
 'https://en.wikipedia.org/wiki/Tomatina',
 'https://en.wikipedia.org/wiki/Fiestas_del_Pilar',
 'https://en.wikipedia.org/wiki/Carnival',
 'https://en.wikipedia.org/wiki/Carnival_of_Santa_Cruz_de_Tenerife',
 'https://en.wikipedia.org/wiki/San_Sebasti%C3%A1n_International_Film_Festival',
 'https://en.wikipedia.org/wiki/M%C3%A1laga_Spanish_Film_Festival',
 'https://en.wikipedia.org/wiki/Seminci',
 'https://en.wikipedia.org/wiki/Sitges_Film_Festival',
 'https://en.wikipedia.org/wiki/S%C3%B3nar',
 'https://en.wikipedia.org/wiki/Festival_Internacional_de_Benic%C3%A0ssim',
 'https://en.wikipedia.org/wiki/Festimad',
 'https://en.wikipedia.org/wiki/Primavera_Sound',
 'https://en.wikipedia.org/wiki/Bilbao_BBK_Live']

In [45]:
# Festival names from both paragraphs, first paragraph first
name_links = [*n_links1, *n_links2]
name_links

['Seville Fair',
 'Romería de El Rocío',
 'Running of the Bulls',
 'Fallas',
 'Tomatina',
 'Fiestas del Pilar',
 'Carnival',
 'Carnival of Santa Cruz de Tenerife',
 'San Sebastián International Film Festival',
 'Málaga Spanish Film Festival',
 'Seminci',
 'Sitges Film Festival',
 'Sónar',
 'Festival Internacional de Benicàssim',
 'Festimad',
 'Primavera Sound',
 'Bilbao BBK Live']

### To obtain a brief description of each festivity linked above, we are going to scrape each of those URLs and take the first paragraph of the page (we assume that this is where the concept is introduced).

In [49]:
import time

def first_paragraph_text(page_soup):
    """Return the text of the first non-blank <p> in `page_soup`, or None if there is none."""
    for paragraph in page_soup.find_all("p"):
        # Skip placeholder paragraphs that contain only whitespace
        if paragraph.text.strip():
            return paragraph.text
    return None


# One entry per URL in links_int, in the same order.
# None marks a page that could not be fetched or has no text paragraph,
# so the list stays aligned with links_int.
description = []

# Loop to scrape each link (i is the 1-based position of the URL)
for i, url in enumerate(links_int, start=1):
    # Pause between requests
    time.sleep(1)
    # connect to every webpage; the timeout avoids hanging on a stalled connection
    p_resp = requests.get(url, timeout=30)

    # checking if the request is successful
    if p_resp.status_code != 200:
        print('Status code {0}: Skipping URL #{1}: {2}'.format(p_resp.status_code, i, url)) # print the error
        # Keep a placeholder so later URLs are not paired with the wrong description
        description.append(None)
        continue

    print('URL #{0}: {1}'.format(i, url))    # print the iteration number along with the url

    # get HTML
    p_html = p_resp.content
    # convert HTML to BeautifulSoup object
    p_soup = BeautifulSoup(p_html, 'lxml')

    # Add the first paragraph that actually contains text to the paragraph list
    description.append(first_paragraph_text(p_soup))

URL #1: https://en.wikipedia.org/wiki/Seville_Fair
URL #2: https://en.wikipedia.org/wiki/Romer%C3%ADa_de_El_Roc%C3%ADo
URL #3: https://en.wikipedia.org/wiki/Running_of_the_Bulls
URL #4: https://en.wikipedia.org/wiki/Fallas
URL #5: https://en.wikipedia.org/wiki/Tomatina
URL #6: https://en.wikipedia.org/wiki/Fiestas_del_Pilar
URL #7: https://en.wikipedia.org/wiki/Carnival
URL #8: https://en.wikipedia.org/wiki/Carnival_of_Santa_Cruz_de_Tenerife
URL #9: https://en.wikipedia.org/wiki/San_Sebasti%C3%A1n_International_Film_Festival
URL #10: https://en.wikipedia.org/wiki/M%C3%A1laga_Spanish_Film_Festival
URL #11: https://en.wikipedia.org/wiki/Seminci
URL #12: https://en.wikipedia.org/wiki/Sitges_Film_Festival
URL #13: https://en.wikipedia.org/wiki/S%C3%B3nar
URL #14: https://en.wikipedia.org/wiki/Festival_Internacional_de_Benic%C3%A0ssim
URL #15: https://en.wikipedia.org/wiki/Festimad
URL #16: https://en.wikipedia.org/wiki/Primavera_Sound
URL #17: https://en.wikipedia.org/wiki/Bilbao_BBK_Live


In [50]:
# zip() stops at the shorter input, so if a URL was skipped without a description
# the remaining URLs would be silently paired with the wrong text. Check first.
assert len(links_int) == len(description), (
    "links_int and description differ in length; URLs and descriptions are misaligned"
)

# Map each festival URL to its brief description
url_and_description = dict(zip(links_int, description))
url_and_description

{'https://en.wikipedia.org/wiki/Seville_Fair': 'The Seville Fair (officially and in Spanish: Feria de Abril de Sevilla, "Seville April Fair") is held in the Andalusian capital of Seville, Spain. The fair generally begins two weeks after the Semana Santa, or Easter Holy Week. \n',
 'https://en.wikipedia.org/wiki/Romer%C3%ADa_de_El_Roc%C3%ADo': 'The Romería de El Rocío is a procession/pilgrimage on the second day of Pentecost to the Hermitage of El Rocío in the countryside of Almonte, Province of Huelva, Andalucia, Spain, in honor of the Virgin of El Rocío.[1][2] In recent years the Romería has brought together roughly a million pilgrims each year.[3][4]\n',
 'https://en.wikipedia.org/wiki/Running_of_the_Bulls': "A running of the bulls (Spanish: encierro, from the verb encerrar, 'to corral, to enclose'; Occitan: abrivado, literally 'haste, momentum'; Catalan: correbous, 'run-bulls') is an event that involves running in front of a small group of bulls, typically six[1] but sometimes ten o