# Tutorial: Parsing OpenStreetMap

In [12]:
from pyrosm import get_data, OSM
from pyrosm.data import sources
import geopandas as gpd
import numpy as np
from collections import Counter

We can use a cool library called `pyrosm` to download data and parse it into geopandas. Here are the list of available countries in Europe, but also regions that can be downloaded from regions within countries (useful when countries are large).

In [2]:
print(sources.europe.available)

['albania', 'andorra', 'austria', 'azores', 'belarus', 'belgium', 'bosnia_herzegovina', 'bulgaria', 'croatia', 'cyprus', 'czech_republic', 'denmark', 'estonia', 'faroe_islands', 'finland', 'france', 'georgia', 'germany', 'great_britain', 'greece', 'hungary', 'iceland', 'ireland_and_northern_ireland', 'isle_of_man', 'italy', 'kosovo', 'latvia', 'liechtenstein', 'lithuania', 'luxembourg', 'macedonia', 'malta', 'moldova', 'monaco', 'montenegro', 'netherlands', 'norway', 'poland', 'portugal', 'romania', 'russia', 'serbia', 'slovakia', 'slovenia', 'spain', 'sweden', 'switzerland', 'turkey', 'ukraine']


In [3]:
print("All countries with sub-regions:", sources.subregions.available.keys())

All countries with sub-regions: dict_keys(['brazil', 'canada', 'france', 'germany', 'great_britain', 'italy', 'japan', 'netherlands', 'poland', 'russia', 'usa'])


In [4]:
sources.subregions.france.available

['alsace',
 'aquitaine',
 'auvergne',
 'basse_normandie',
 'bourgogne',
 'bretagne',
 'centre',
 'champagne_ardenne',
 'corse',
 'franche_comte',
 'guadeloupe',
 'guyane',
 'haute_normandie',
 'ile_de_france',
 'languedoc_roussillon',
 'limousin',
 'lorraine',
 'martinique',
 'mayotte',
 'midi_pyrenees',
 'nord_pas_de_calais',
 'pays_de_la_loire',
 'picardie',
 'poitou_charentes',
 'provence_alpes_cote_d_azur',
 'reunion',
 'rhone_alpes']

## Importing and filtering
It is pretty easy to download the OSM data for a given region.

In [5]:
fp = get_data("Malta",directory=r"C:\Users\miln\Desktop")

Downloaded Protobuf data 'malta-latest.osm.pbf' (4.61 MB) to:
'C:\Users\miln\Desktop\malta-latest.osm.pbf'


'C:\\Users\\miln\\Desktop\\malta-latest.osm.pbf'

You can load it into a gdf.

In [8]:
osm = OSM(fp)
buildings = osm.get_buildings()

We are only some of the columns. That's good because we are pretty much all what we need and not all the irrelevant ones.

In [9]:
buildings.columns

Index(['addr:city', 'addr:country', 'addr:housenumber', 'addr:housename',
       'addr:postcode', 'addr:street', 'email', 'name', 'opening_hours',
       'operator', 'phone', 'ref', 'website', 'building', 'amenity',
       'building:levels', 'building:material', 'building:use', 'craft',
       'height', 'internet_access', 'landuse', 'office', 'shop', 'source',
       'start_date', 'wikipedia', 'id', 'timestamp', 'version', 'tags',
       'osm_type', 'geometry', 'changeset'],
      dtype='object')

Let's keep only the actually relevant columns.

In [10]:
buildings = buildings[['id','height','building:levels','start_date','building','building:use','amenity','building:material','geometry']]

## Cleaning up columns

### Building heights
They come as strings and there is sometimes text in there. Let's remove it.

In [17]:
print(Counter(buildings['height']))

Counter({None: 16361, nan: 124, '11': 4, '22': 3, '15': 1, '20': 1, '98m': 1, '11.4': 1, '5': 1, '40': 1, '9': 1, '2': 1})


In [18]:
# cleaning heights
buildings['height'] = buildings['height'].str.extract('(\d+\.?(\d+)?)')
buildings['height'] = buildings['height'].astype(float)

Now we have clean floats that we can use to compute metrics etc.

In [20]:
buildings['height'].mean()

20.69333333333333

### Building age and floors
Let's do the same for those two.

In [33]:
# cleaning heights, levels and age
for col in ['building:levels','start_date']:
    buildings[col] = buildings[col].str.extract('(\d+\.?(\d+)?)')
    buildings[col] = buildings[col].astype(float)

### Building types
Here this is more difficult...

First, we have several variables that are useful (`building`,`building:use` and `amenity`). We need to merge these into one 'building type' column ultimately.

We have plenty of different values, actually many many, that we need to cluster... 

In [16]:
print(Counter(buildings['building']))

Counter({'yes': 14141, 'residential': 441, 'house': 429, 'apartments': 396, 'school': 182, 'church': 154, 'chapel': 132, 'greenhouse': 110, 'industrial': 101, 'commercial': 39, 'university': 36, 'roof': 32, 'hotel': 30, 'garages': 29, 'office': 25, 'shed': 21, 'hut': 21, 'retail': 18, 'construction': 18, 'hangar': 15, 'bunker': 14, 'hospital': 13, 'garage': 12, 'terrace': 12, 'farm': 7, 'public': 6, 'collapsed': 6, 'college': 5, 'cathedral': 4, 'ruins': 4, 'manufacture': 4, 'warehouse': 3, 'government': 3, 'farm_auxiliary': 3, 'kiosk': 3, 'service': 3, 'basilica': 2, 'toilets': 2, 'train_station': 2, 'convent': 2, 'windmill': 2, 'boathouse': 2, 'marquee': 2, 'grandstand': 2, 'stadium': 2, 'transportation': 1, 'sports_centre': 1, '2': 1, 'supermarket': 1, 'detached': 1, 'semidetached_house': 1, 'restaurant': 1, 'civic': 1, 'steps': 1, 'no': 1})


In [15]:
Counter(buildings['building:use'])

Counter({None: 16375, 'medical': 1, nan: 124})

Let's first harmonize all the different null values... 

In [21]:
buildings['building'].replace({'yes':None},inplace=True)
buildings[['building','building:use','amenity']] = buildings[['building','building:use','amenity']].replace({np.nan:None})

Let's create a new column that indicates whether we have at least a value for any of the three possible variables. 

In [26]:
def has_a_type_value(array):
    return(any(np.equal(array, None) == False))

In [27]:
type_cols_values = buildings[['building','building:use','amenity']].values
new_col = np.array(list(map(has_a_type_value,type_cols_values)))
buildings.insert(4,'has_type',new_col)

In [None]:
# TODO: cluster categories

Here is our final dataset that can be saved.

In [29]:
buildings.head()

Unnamed: 0,id,height,building:levels,start_date,has_type,building,building:use,amenity,building:material,geometry
0,15684588,,,,True,,,conference_centre,,"POLYGON ((14.51781 35.89993, 14.51844 35.89981..."
1,23352829,,,,True,cathedral,,place_of_worship,,"POLYGON ((14.51252 35.89737, 14.51247 35.89743..."
2,23589859,,,,False,,,,,"POLYGON ((14.32957 36.00682, 14.32983 36.00671..."
3,23738672,,,,False,,,,,"POLYGON ((14.30902 36.05544, 14.30907 36.05540..."
4,23858928,,,,True,church,,,,"POLYGON ((14.45655 35.83108, 14.45662 35.83114..."


## Compute summary statistics

We can now compute statistics!!

In [32]:
len(buildings)

16500

In [25]:
len(buildings['height'].dropna())

15

In [35]:
len(buildings['building:levels'].dropna())

160

In [36]:
len(buildings['start_date'].dropna())

29

In [31]:
len(buildings[buildings['has_type']==True])

2530