### World of Catan Wiki

In [None]:
# Imports

import requests
import bs4
import pickle

In [None]:
# Base URL of the Fandom wiki to scrape (the Catan wiki serves as the example)

domain_address = "https://catan.fandom.com"

# Path of the special page that links to all "basic" pages
# NOTE - large Fandom wikis paginate this list, so several subpages have to be visited

all_pages_list_subpage = "/wiki/Special:AllPages"

In [None]:
# Scrape the links to all "basic" pages from the Special:AllPages index
# NOTE - only this single index page is read; pagination is not followed here

# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error instead of parsing an error page
response = requests.get(domain_address + all_pages_list_subpage, timeout=30)
response.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, which can yield different trees on different machines
parsed = bs4.BeautifulSoup(response.text, 'html.parser')

# Collect all links
# (indexing [0] raises IndexError if the expected container div is missing)

div_with_list_of_links = parsed.find_all('div', {'class': 'mw-allpages-body'})[0]
list_of_pages = [
    a_element['href']
    for a_element in div_with_list_of_links.find_all('a', href=True)
]

In [None]:
# Show the collected page links as the cell output
list_of_pages

In [None]:
# Scrape the outgoing links from each web page to create a dictionary representing the graph
#
# Dictionary structure:
# - keys - website addresses
# - values - list of website addresses to which the given page links
#
# {k_1: [v_1_1, v_1_2, ...], k_2: [v_2_1, v_2_2, ...], ...}
#
# Graph interpretation:
# For each pair k_i: v_i_j, there is an edge in the graph k_i -> v_i_j
#
# Scraping constraints:
# - view only the main content of the site - <div class="mw-parser-output">...</div>
# - focus only on the link elements - <a href="..."></a>
# - collect only those links, which are contained in list_of_pages

# Set copy of list_of_pages so each membership test is O(1) instead of a list scan
known_pages = set(list_of_pages)

graph = dict()
for link in list_of_pages:
    graph[link] = list()
    # The timeout keeps one stalled request from blocking the whole crawl
    response = requests.get(domain_address + link, timeout=30)
    print(link, response.status_code)
    # Explicit parser: the result no longer depends on which parsers are installed
    parsed = bs4.BeautifulSoup(response.text, 'html.parser')
    content = parsed.find('div', {'class': 'mw-parser-output'})
    if content is None:
        # No main-content div on this page: keep the node with no outgoing
        # edges instead of failing with AttributeError on None
        continue
    # Tracks targets already recorded for this page; the list keeps first-seen order
    seen_targets = set()
    for a_element in content.find_all('a', href=True):
        # Drop the fragment and query string so the href is comparable with list_of_pages
        href = a_element['href'].split('#')[0].split('?')[0]
        if href in known_pages and href not in seen_targets:
            seen_targets.add(href)
            graph[link].append(href)

# Save the graph structure

with open('catan_links.pickle', 'wb') as file:
    pickle.dump(graph, file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Show the scraped link graph as the cell output
graph

### Tram network - ZTM Poznań

In [None]:
# Imports

import requests
import bs4
import pickle

# Tram numbers list

address_prefix = 'https://www.ztm.poznan.pl/pl/rozklad-jazdy/'
tram_list = [1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 'T12', 13, 14, 15, 18, 24, 98]

# Structure to collect the data:
# - keys - stop names
# - values - list of stop names that directly follow the key stop on some line

stops_connections = dict()

# Collect all schedules

for t in tram_list:
    address = address_prefix + str(t)
    # The timeout keeps one stalled request from blocking the whole loop
    response = requests.get(address, timeout=30)
    print(t, address, response.status_code)
    # Explicit parser: the result no longer depends on which parsers are installed
    parsed = bs4.BeautifulSoup(response.text, 'html.parser')
    # Each matching <ul> is walked on its own, so prev_stop is reset per list
    for stop_list in parsed.find_all('ul', {'class': 'line-direction__stops'}):
        prev_stop = None
        for stop_li in stop_list.find_all('li', {'class': 'show'}):
            for stop_el in stop_li.find_all('span', {'class': 'line-stop__name'}):
                stop_name = stop_el.get_text()
                # Add the edge prev_stop -> stop_name; skip the first stop of the
                # list and immediate repeats of the same name
                if prev_stop is not None and stop_name != prev_stop:
                    next_stops = stops_connections.setdefault(prev_stop, list())
                    if stop_name not in next_stops:
                        next_stops.append(stop_name)
                prev_stop = stop_name

# Save the graph structure

with open('tram_stops.pickle', 'wb') as file:
    pickle.dump(stops_connections, file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Show the collected stop connections as the cell output
stops_connections