# City scraper

This notebook scrapes Wikipedia pages, relying on patterns in their structured markup, in search of city coordinates. There are plenty of applications, but the most promising one is logistics.

In [1]:
import requests as req
from bs4 import BeautifulSoup

# Base URL of the English Wikipedia; request URLs are built as host + route.
host = 'https://en.wikipedia.org'
# Path prefix of a route; a page-specific tail such as '/Assis_Brasil'
# is appended to it.
head_route = '/wiki'


In [10]:
# Exploratory cell: download the municipality listing and show the HTML
# attributes of every element matched by the content-container selector.
tail_route = '/List_of_municipalities_of_Brazil'
route = head_route + tail_route

css_query = "#mw-content-text > div.mw-parser-output"

r = req.get(host + route)
soup = BeautifulSoup(r.text, 'html.parser')

# One attribute dict per matched element.
city_routes = [matched.attrs for matched in soup.select(css_query)]

print(city_routes)


[{'class': ['mw-parser-output']}]


In [7]:
import json

# Exploratory cell: download one city page and keep the text of its first
# <script> element in `scripts` for the parsing cell that follows.
tail_route = '/Assis_Brasil'
route = head_route + tail_route

url = host + route

r = req.get(url)
# Name the parser explicitly (as the other cells do) so the parse tree does
# not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(r.text, 'html.parser')

first_script = soup.find('script')
if first_script is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("no <script> element found in " + url)

scripts = first_script.text.strip()


In [8]:
import json

# Split the script text into ';'-separated statements and keep only those
# that break into exactly two pieces around '=' (left side, right side).
code_equalities = [
    pieces
    for pieces in (statement.split('=') for statement in scripts.split(';'))
    if len(pieces) == 2
]

# Distinct left-hand sides seen in the script.
lh_terms = list({lhs for lhs, _ in code_equalities})

# Map every left-hand side to the list of right-hand sides assigned to it.
lr_dict = {lh_term: [] for lh_term in lh_terms}
for lhs, rhs in code_equalities:
    lr_dict[lhs].append(rhs)

field_str = "wgCoordinates"

# Right-hand sides of RLCONF assignments that mention the coordinates field.
confs = [rh_term for rh_term in lr_dict['RLCONF'] if field_str in rh_term]

assert len(confs) == 1

conf = json.loads(confs[0])

coordinates = conf[field_str]
lat_lng = (coordinates['lat'], coordinates['lon'])

print(lat_lng)


(-10.940833333333334, -69.56694444444445)


In [5]:
# Base URL prepended to every route when building a request URL.
host = 'https://en.wikipedia.org'

def brazilian_city_routes():
    """Yield the ``href`` of each link in the municipality listing tables.

    Downloads the "List of municipalities of Brazil" page from ``host`` and
    yields, in document order, the ``href`` attribute of every ``<a>`` that
    is a direct child of a table cell inside the parsed content container.
    The values are yielded exactly as they appear in the HTML, without
    ``host`` prepended.
    """
    listing_uri = host + '/wiki' + '/List_of_municipalities_of_Brazil'
    link_selector = "#mw-content-text > div.mw-parser-output > table > tbody > tr > td > a"

    response = req.get(listing_uri)
    page = BeautifulSoup(response.text, 'html.parser')

    yield from (anchor.attrs['href'] for anchor in page.select(link_selector))

def get_latlng_tuple(url, timeout=30):
    """Download a page and return its coordinates as a ``(lat, lon)`` tuple.

    The text of the page's first ``<script>`` element is scanned for an
    ``RLCONF=<json>`` assignment whose JSON carries a ``wgCoordinates``
    entry with ``lat`` and ``lon`` keys.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before the request is
            aborted, so a single stalled connection cannot hang a crawl.

    Returns:
        ``(lat, lon)`` when exactly one matching assignment is found and it
        parses; otherwise the empty tuple ``()``.

    Raises:
        Whatever the HTTP client raises for network failures or timeouts;
        parsing problems never raise, they yield ``()``.
    """
    r = req.get(url, timeout=timeout)
    # Name the parser explicitly so results do not depend on which optional
    # parsers are installed.
    soup = BeautifulSoup(r.text, 'html.parser')

    first_script = soup.find('script')
    if first_script is None:
        # A page without any <script> simply has no coordinates to offer.
        return ()

    return _latlng_from_script(first_script.text.strip())


def _assignments_by_name(script_text):
    """Group the right-hand sides of ``name=value`` statements by name."""
    grouped = {}
    for statement in script_text.split(';'):
        # Split on the FIRST '=' only, so a value that itself contains '='
        # is kept whole instead of being discarded.
        name, separator, value = statement.partition('=')
        if not separator:
            continue
        grouped.setdefault(name.strip(), []).append(value)
    return grouped


def _latlng_from_script(script_text, field_str="wgCoordinates"):
    """Return ``(lat, lon)`` from the script's RLCONF assignment, or ``()``."""
    # Imported here so the helper does not rely on an import made elsewhere.
    import json

    candidates = [
        value
        for value in _assignments_by_name(script_text).get('RLCONF', [])
        if field_str in value
    ]
    if len(candidates) != 1:
        return ()

    try:
        coordinates = json.loads(candidates[0])[field_str]
        return (coordinates['lat'], coordinates['lon'])
    except (ValueError, KeyError, TypeError):
        # ValueError: not valid JSON (JSONDecodeError is a subclass).
        # KeyError/TypeError: the field is missing or not shaped as expected.
        return ()

In [6]:
import json

# For every route from the listing, fetch the page and print one line with
# the full URL, a space, and the coordinates tuple (or "()" when none found).
for route in brazilian_city_routes():
    uri = host + route

    coords = str(get_latlng_tuple(uri))

    # print's default separator is a single space, matching `uri + " " + coords`.
    print(uri, coords)
        