# GeoRSS xml format to GeoJSON conversion, 
## with metadata rescue

by Nikhil VJ, http://nikhilvj.co.in  
28 October 2018

- Made specifically to handle georss (.xml) downloads from Bhuvan
- Parses the metadata trapped in HTML description
- Tested to work with points, polygons, lines, and can handle a file that mixes these
- Does "right hand rule" fix for polygons (see https://mapster.me/right-hand-rule-geojson-fixer/)
- Saves an extra CSV with the metadata neatly laid out
- Saves numerical metadata as numbers instead of strings. This helps when you want to render features based on the numbers (example: choropleth maps). Can recognise numbers like "-3.43e-4".
- Strips out an HTML non-breaking space character (\xa0) that turned up in the metadata of one file

Modules you'll need:  
`feedparser geojson geojson_utils pandas bs4 lxml`

In [1]:
import os
from collections import OrderedDict

import feedparser
import geojson, geojson_utils
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Input GeoRSS files. Make sure each xml is there in the same folder.
# The cells below read file2; to convert a different file, change it where it is used.
file2 = 'asi-BLR_004_100M.xml' # polygon
file3 = 'asi-monuments2.xml' # points
file4 = 'asi-devanahallifort_LN.xml' # lines
file5 = 'basemap-MH_Vill.xml' # large, polygons

In [3]:
# Base name shared by the .geojson and .csv outputs.
# splitext drops the extension whatever its length, instead of assuming exactly 4 characters ('.xml').
filename = os.path.splitext(file2)[0]
filename

'asi-BLR_004_100M'

In [4]:
# Parse the GeoRSS file named by file2; the shapes are then read from d.entries.
d = feedparser.parse(file2)
# feedparser does most of the grunt work. converts georss objects into geojson objects,
# which can be handled by the geojson module

In [5]:
# Each entry of the parsed feed is one shape
shape_count = len(d.entries)
print('Number of shapes in this file:', shape_count)

Number of shapes in this file: 1


## R & D

In [6]:
# d.entries[0]

In [7]:
# d.entries[0].where

In [8]:
# d.entries[0].where.coordinates

In [9]:
# area.. from https://github.com/brandonxiang/geojson-python-utils
# only to be done in case of polygon
# if d.entries[0].where.type == 'Polygon':
#    geojson_utils.area(d.entries[0].where)

In [10]:
# geojson.Polygon(d.entries[0].where.coordinates)

In [11]:
# geojson_utils.area(geojson.Polygon(d.entries[0].where.coordinates))

In [12]:
# pgon = geojson.Polygon(d.entries[0].where.coordinates)

In [13]:
# if pgon.type == 'Polygon': print('yay')

In [14]:
# geojson.Feature(geometry=pgon, properties={"country": "Spain"})  

In [15]:
# d.entries[0].summary

In [16]:
# cleaning out \xa0 garbage chars
# d.entries[0].summary = d.entries[0].summary.replace('\xa0', "")

In [17]:
# its html. so lets use BeautifulSoup to scrape through it
# soup =  BeautifulSoup(d.entries[0].summary,'lxml')
# soup

In [18]:
# meta = soup.select('span')
# for n in range(int(len(meta)/2)):
#    # print(n)
#    print(meta[n*2].text, meta[n*2+1].text )

In [19]:
# get these into the properties, along with the title

In [20]:
# d.entries[0].title

In [21]:
# props = {'title': d.entries[0].title}

In [22]:
# for n in range(int(len(meta)/2)):
#    props[meta[n*2].text] = meta[n*2+1].text

In [23]:
# props

In [24]:
#geojson.Feature(geometry=d.entries[0].where, properties=props)

In [25]:
# feat1 = geojson.Feature(geometry=d.entries[0].where, properties=props)

In [26]:
# geojson.FeatureCollection([feat1])

In [27]:
# geojson.dumps(geojson.FeatureCollection([feat1]))

In [28]:
# yesss that's the desired output!

## functions

In [29]:
def checkNumber(value):
    """Convert a numeric-looking string to an int or float.

    Parameters
    ----------
    value : any
        The metadata value to inspect. Anything that is not a ``str`` is
        returned untouched.

    Returns
    -------
    int, float, or the original ``value``
        ``int`` for whole numbers (including negative ones such as '-5'),
        ``float`` for decimals and exponent notation such as '-3.4e-4',
        and the original value, unmodified, when it is not a usable number.
    """
    # idiot-proofing: if it's not a string to begin with, return it unopened
    if not isinstance(value, str):
        return value

    # Surrounding whitespace should not stop '  42 ' from being read as a number
    text = value.strip()

    # Cheap pre-filter: once at most one '.', one 'e' and two '-' are removed,
    # a number like -3.4e-4 leaves only digits. This also keeps out spellings
    # that float() would accept but that are not wanted here ('nan', 'inf', '1_000').
    digits_only = text.lower().replace('.', '', 1).replace('e', '', 1).replace('-', '', 2)
    if not digits_only.isdigit():
        return value

    # The pre-filter has false positives (e.g. '.e--3', '3-3'), so every
    # conversion is guarded and falls back to the original string.
    # int() is tried first so that whole numbers, negative ones included, stay integers.
    try:
        return int(text)
    except ValueError:
        pass

    try:
        number = float(text)
    except ValueError:
        return value

    # A literal such as '1e999' overflows to infinity, which JSON cannot
    # represent; keep the original text in that case.
    if number in (float('inf'), float('-inf')):
        return value
    return number

# PS: the digit check inside checkNumber has false positives such as '.e--3',
# which is why the conversion there sits in a try-except block.
# If no conversion works out, the original string comes back.
print('test:')
for sample in ('34', '3.5643', '3e7', '3.43e3', '3.43e-3', '3-3'):
    parsed = checkNumber(sample)
    print(parsed, type(parsed))

test:
34 <class 'int'>
3.5643 <class 'float'>
30000000.0 <class 'float'>
3430.0 <class 'float'>
0.00343 <class 'float'>
3-3 <class 'str'>


## Loop through the shapes
GeoJSON features collected in list "collector"

In [30]:
collector = []
count = 0
for entry in d.entries:
    # count holds the number of shapes finished so far
    if count % 100 == 1:
        print(count,'shapes processed.')

    # Properties start with the title; the rest is dug out of the
    # HTML-i-fied description stored under "summary".
    props = OrderedDict([('title', entry.title)])

    # \xa0 : html whitespace char that was cropping up. So zapped it.
    soup = BeautifulSoup(entry.summary.replace('\xa0', ""), 'lxml')
    meta = soup.select('span')

    # The spans are read as alternating key, value, key, value, ...
    # zip stops at the shorter slice, so a trailing span without a partner is ignored.
    for key_span, value_span in zip(meta[0::2], meta[1::2]):
        # stored as float or int if the string is a number
        props[key_span.text] = checkNumber(value_span.text)

    # Right hand rule fix, done only for Polygon: if the co-ords are in reversed
    # order, then the area is calculated negative, so the first ring is flipped.
    # area.. from https://github.com/brandonxiang/geojson-python-utils
    shape = entry.where
    if shape.type == 'Polygon' and geojson_utils.area(shape) < 0:
        shape.coordinates[0].reverse()

    # A geojson Feature brings together:
    #  - the geometry, which was already processed into geojson form by feedparser,
    #  - the properties, parsed out of the html block above,
    #  - an id: in some shapes the title was non-unique, and it is good to have
    #    a primary key to tie this to the CSV also being made.
    count += 1
    collector.append(geojson.Feature(geometry=shape, properties=props, id=count))

## geojson
Create FeatureCollection and 'dump' as a geojson file

In [31]:
# Bundle every feature into one FeatureCollection and write it out as text.
feature_collection = geojson.FeatureCollection(collector)

# GeoJSON is specified as UTF-8 (RFC 7946), so the encoding is stated explicitly
# rather than left to the platform default, which may not be able to encode
# non-ASCII characters if the serialised metadata contains any.
with open(filename+'.geojson', 'w', encoding='utf-8') as f:
    f.write(geojson.dumps(feature_collection, indent=2))

## saving the same as CSV table too

In [32]:
dfCollector = []
for feature in collector:
    # id goes in first, followed by the feature's properties in their stored order
    row = OrderedDict([('id', feature.id)])
    row.update(feature.properties)

    geometry = feature.geometry
    row['geometry_type'] = geometry.type

    # optional: comment out the line below if you don't want to export the geometry
    # (the list of latlongs, serialised to a string) in the CSV
    row['geometry_coordinates'] = geojson.dumps(geometry.coordinates)

    # Extra: for point coords, add lat and long columns so that the CSV is
    # easily mappable in geojson.io, google my maps etc
    if geometry.type == 'Point':
        row['latitude'] = geometry.coordinates[1]
        row['longitude'] = geometry.coordinates[0]

    dfCollector.append(row)

In [33]:
# One table row per collected record, indexed by the feature id
metadata_table = pd.DataFrame(dfCollector)
df = metadata_table.set_index('id')
df.head()

Unnamed: 0_level_0,title,Name,Shape_Leng,Shape_Area,MON_NO,No_,monumentna,MON_NUM,Location,District,BUFF_DIST,geometry_type,geometry_coordinates
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,BLR_004_100M.1,BLR_004,0.010207,7e-06,4,4.0,Fort,4,Devanahalli,Bangalore,100.0,Polygon,"[[[77.71140589580278, 13.250946251295886], [77..."


In [34]:
# df.iloc[0]['geometry_coordinates']
# confirming it's stored as a string

In [35]:
# Save the table out to a CSV named after the input file.
# Metadata can be easily analysed there, and if it's a point locations file
# then even the CSV can directly be mapped.
csv_path = filename + '.csv'
df.to_csv(csv_path)

In [36]:
# Show the column names of the exported table
df.columns

Index(['title', 'Name', 'Shape_Leng', 'Shape_Area', 'MON_NO', 'No_',
       'monumentna', 'MON_NUM', 'Location', 'District', 'BUFF_DIST',
       'geometry_type', 'geometry_coordinates'],
      dtype='object')

In [37]:
# Columns that are not metadata fields: the title and the geometry helper columns
non_meta_columns = ('title', 'geometry_type', 'geometry_coordinates', 'latitude', 'longitude')

# Everything else, in table order, is a metadata field
metaFields = []
for column in df.columns:
    if column not in non_meta_columns:
        metaFields.append(column)
metaFields

['Name',
 'Shape_Leng',
 'Shape_Area',
 'MON_NO',
 'No_',
 'monumentna',
 'MON_NUM',
 'Location',
 'District',
 'BUFF_DIST']