In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import re

# 1 Extracting City Pages from National Page

In [1]:
# Listing page for every Spanish town/city registered for the 2024 edition
base_url = "https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES"

# Download the national listing page. The timeout keeps a stalled connection
# from hanging the kernel, and raise_for_status() stops the cell on a 4xx/5xx
# reply instead of silently parsing an error page into an empty city list.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Every anchor tag that actually carries an href attribute
city_links = soup.find_all('a', href=True)

# Keep only links whose href contains a "city=" query parameter; urljoin
# resolves any relative href against the listing URL.
city_urls = [
    urljoin(base_url, link['href'])
    for link in city_links
    if 'city=' in link['href']
]

# Display the list of city URLs
print(city_urls)


['https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=84iGWBpo', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=IT3H5sPq', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=HxJh6PWs', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=es_jmrl0woo', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=YzuNnRu5', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=6DV7CWcv', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=es_m70tybwl', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=RQoH6noz', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=xVNF4F90', 'https://mobilityweek.eu/participating-towns-and-cities/?year=2024&country=ES&city=E16Ss1bl', 'https://mobilityweek.eu/participating-towns-and-citi

# 2 Extracting Activity and City Data from Raw HTML

In [None]:
def extract_activities(city_url, timeout=30):
    """Scrape one city's participation page into a plain dictionary.

    Parameters
    ----------
    city_url : str
        URL of the city page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot block
        the caller indefinitely. Defaults to 30 seconds.

    Returns
    -------
    dict
        ``city_name``  -- heading text, or ``"Unknown"`` if the heading is absent.
        ``population`` -- digits of the first number in the population row as a
        string (``.`` and ``,`` removed), or ``None`` if not found.
        ``latitude`` / ``longitude`` -- floats taken from the ``googleMaps(...)``
        call in an inline script, or ``None`` if not found or not numeric.
        ``activities`` -- list of ``{"name", "description"}`` dicts, one per
        activity box on the page (may be empty).

    Raises
    ------
    ValueError
        If ``city_url`` is not a string.
    requests.HTTPError
        If the server answers with a 4xx or 5xx status.
    """
    # Reject tuples or other structures before any network access
    if not isinstance(city_url, str):
        raise ValueError("city_url should be a string")

    response = requests.get(city_url, timeout=timeout)
    response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)

    soup = BeautifulSoup(response.content, 'html.parser')

    # City name
    city_heading = soup.find("h1", id="participation_page_heading")
    city_name = city_heading.text.strip() if city_heading else "Unknown"

    # Population: the inner "data" div may be missing even when the row
    # exists, so both lookups are checked before reading any text.
    population = None
    population_div = soup.find("div", class_="population city_data_row")
    data_div = population_div.find("div", class_="data") if population_div else None
    if data_div is not None:
        population_match = re.search(r'\d[\d,.]*', data_div.text.strip())
        if population_match:
            # Drop thousands separators; the value is kept as a string
            population = population_match.group().replace('.', '').replace(',', '')

    # Coordinates: read from the inline script that calls googleMaps(lat, lng, ...).
    # `string=` is the current name of the filter formerly spelled `text=`.
    latitude = longitude = None
    script_tag = soup.find("script", string=re.compile(r"googleMaps\(.+?\);"))
    if script_tag:
        coordinates = re.search(r"googleMaps\(([\d.-]+),\s*([\d.-]+)", script_tag.string)
        if coordinates:
            try:
                latitude, longitude = (
                    float(coordinates.group(1)),
                    float(coordinates.group(2)),
                )
            except ValueError:
                # [\d.-]+ also matches tokens such as "-" or "1.2.3"; treat
                # those as "no coordinates" (both values stay None).
                pass

    # Activities: one dictionary per activity box, with placeholder text
    # when a box lacks a name or a description element.
    activities = []
    for box in soup.find_all("div", class_="activity_details_box"):
        name_div = box.find("div", class_="activity_name")
        description_div = box.find("div", class_="activity_description")
        activities.append({
            "name": name_div.text.strip() if name_div else "Unnamed Activity",
            "description": description_div.text.strip() if description_div else "No description available"
        })

    return {
        "city_name": city_name,
        "population": population,
        "latitude": latitude,
        "longitude": longitude,
        "activities": activities
    }
    

In [None]:
data = []
failed_urls = []  # city pages that could not be downloaded; inspect or retry afterwards

# Visit each city page and flatten it into one record per activity.
# NOTE(review): the first entry of city_urls is skipped, exactly as before;
# the reason is not visible in this notebook -- confirm it is intentional.
for city_url in city_urls[1:]:
    try:
        city_info = extract_activities(city_url)
    except requests.RequestException as exc:
        # A single unreachable page should not discard everything scraped
        # so far: record it, report it, and move on to the next city.
        failed_urls.append(city_url)
        print(f"Skipping {city_url}: {exc}")
        continue

    # City-level fields are repeated on every activity row. A city with no
    # activities therefore contributes no rows at all.
    for activity in city_info['activities']:
        data.append({
            "city": city_info['city_name'],
            "population": city_info['population'],
            "latitude": city_info['latitude'],
            "longitude": city_info['longitude'],
            "activity_name": activity['name'],
            "activity_description": activity['description']
        })

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data)
df['ID'] = df.index
# The index is written as well, so the file carries an unnamed leading column
# (read back as 'Unnamed: 0') in addition to 'ID'.
df.to_csv('../data/activities.csv')

# Show the DataFrame
pd.set_option('display.max_rows', 100)
display(df)

# 3 Saving Data

In [5]:
# Read the per-activity export back from disk
df = pd.read_csv('../data/activities.csv')

# Remove the activity-specific fields and the two row-number columns so that
# only city, population, latitude and longitude remain.
non_city_columns = ['activity_name', 'activity_description', 'Unnamed: 0', 'ID']
df_city = df.drop(columns=non_city_columns)

# Collapse repeated rows: one row per distinct combination of the remaining
# columns, then save and show the result.
df_unique_cities = df_city.drop_duplicates()
df_unique_cities.to_csv('../data/city_data.csv', index=False)
display(df_unique_cities)

Unnamed: 0,city,population,latitude,longitude
0,"A Coruña, Spain",244850,43.355965,-8.397066
6,"Abrera, Spain",12821,41.517527,1.901780
9,"Àger, Spain",600,42.000793,0.761005
11,"Agramunt, Spain",5515,41.787187,1.097979
12,"Aiguaviva, Spain",758,41.937663,2.762086
...,...,...,...,...
1036,"Vinaixa, Spain",499,41.447165,0.972508
1037,"Vinebre, Spain",420,41.184852,0.588156
1038,"Vitoria-Gasteiz, Spain",258808,42.859166,-2.681792
1048,"Zarautz, Spain",23152,43.284771,-2.171397
