In [31]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [59]:
webpage = 'https://es.wikipedia.org/wiki/Anexo:Municipios_de_Espa%C3%B1a_por_poblaci%C3%B3n'

req = Request(webpage, headers={'User-Agent': 'Mozilla/5.0'})
raw_web = urlopen(req, timeout=10).read()

In [60]:
soup = BeautifulSoup(raw_web, 'html.parser')
tables = soup.find_all('table')
titulos_tablas = soup.find_all('span', attrs={"class": "mw-headline"})

In [61]:
print('Títulos de tablas: ', len(tables))
print('Tablas: ', len(tables))

Títulos de tablas:  7
Tablas:  7


In [62]:
head = []
rows_raw = []
for table in tables:
    head = table.find_all('th') 
    table_rows = table.find_all('tr')[1:] # We skip the first header as we saw we don't want it
    
    for fila in table_rows:
        cols = []
        for col in fila.find_all('td'):
            cols.append(col.text.strip())
        
        rows_raw.append(cols)
    

In [63]:
rows_raw[0]

['1',
 'Madrid',
 '3 280 782',
 'Madrid\xa0Madrid',
 'Comunidad de Madrid\xa0Comunidad de Madrid']

In [64]:
# Cleaning each row

for row in rows_raw:
    
    if len(row) == 4:
        row.append(row[3])

    row[0] = int(row[0])+1000
    row[2] = int("".join(str(row[2]).split(" ")))
    row[3] = row[3].split("\xa0")[0]
    row[4] = row[4].split("\xa0")[0]
    
rows_raw[0]

[1001, 'Madrid', 3280782, 'Madrid', 'Comunidad de Madrid']

In [65]:
rows_raw[12]

[1013, 'Valladolid', 295639, 'Valladolid', 'Castilla y León']

In [67]:
for fila in rows_raw:
    if len(fila) != len(head):
        print('Cols', len(fila))
        print(fila)
        print('---------------')

In [68]:
all_data = pd.DataFrame(rows_raw, columns=['ID', 'City', 'Population', 'Region', 'State'])

In [73]:
all_data.head(10)

Unnamed: 0,ID,City,Population,Region,State
0,1001,Madrid,3280782,Madrid,Comunidad de Madrid
1,1002,Barcelona,1636193,Barcelona,Cataluña
2,1003,Valencia,792492,ValenciaValencia,Comunidad
3,1004,Sevilla,681998,Sevilla,Andalucía
4,1005,Zaragoza,673010,Zaragoza,Aragón
5,1006,Málaga,579076,Málaga,Andalucía
6,1007,Murcia,462979,Región de Murcia,Región de Murcia
7,1008,Palma,415940,Islas Baleares,Islas Baleares
8,1009,Las Palmas de Gran Canaria,378797,Las PalmasLas Palmas,Canarias
9,1010,Bilbao,344127,Vizcaya,País Vasco


At this point, we have successfully scraped all the relevant data from Wikipedia.

Let's now proceed to create each of the requested tables.

In [78]:
# Generate the State table

state_id = 1001
states_raw = all_data['State'].drop_duplicates()

states_list = []

for state in states_raw:
    
    states_list.append({"id": state_id, "name": state, "country": "ES"})
    
    state_id += 1

states = pd.DataFrame(states_list, columns = states_list[0].keys())

In [80]:
states.head(20)

Unnamed: 0,id,name,country
0,1001,Comunidad de Madrid,ES
1,1002,Cataluña,ES
2,1003,Comunidad,ES
3,1004,Andalucía,ES
4,1005,Aragón,ES
5,1006,Región de Murcia,ES
6,1007,Islas Baleares,ES
7,1008,Canarias,ES
8,1009,País Vasco,ES
9,1010,Castilla y León,ES


In [81]:
# Generar la tabla region

region_id = 1001
regions_raw = all_data[['Region','State']].drop_duplicates().reset_index(drop=True)

regions_list = []

for i in regions_raw.index:
    
    region = regions_raw.iloc[i].to_dict()
    
    state_id = states[states['name'] == region['State']].iloc[0]['id']
    
    regions_list.append({"id":region_id, "stateID": state_id, "name": region['Region']})
    
    region_id += 1
    
regions = pd.DataFrame(regions_list, columns=regions_list[0].keys())

regions.head(50)

Unnamed: 0,id,stateID,name
0,1001,1001,Madrid
1,1002,1002,Barcelona
2,1003,1003,ValenciaValencia
3,1004,1004,Sevilla
4,1005,1005,Zaragoza
5,1006,1004,Málaga
6,1007,1006,Región de Murcia
7,1008,1007,Islas Baleares
8,1009,1008,Las PalmasLas Palmas
9,1010,1009,Vizcaya


In [83]:
# Generate the poblaciones table

cities_list = []

for ciudad in rows_raw:
    region_id = regions[regions['name'] == ciudad[3]].iloc[0]['id']
    
    cities_list.append({"id": ciudad[0], "regionID": region_id, "name": ciudad[1], "population": ciudad[2]})
    

cities = pd.DataFrame(cities_list, columns=cities_list[0].keys())


In [84]:
cities.head()

Unnamed: 0,id,regionID,name,population
0,1001,1001,Madrid,3280782
1,1002,1002,Barcelona,1636193
2,1003,1003,Valencia,792492
3,1004,1004,Sevilla,681998
4,1005,1005,Zaragoza,673010


Finally, all that remains is the assembly of the database with the relationships between the tables, which falls under the storage part. I will leave this task for you to handle!

😘