# Parsing ASI monuments XML data
Required modules: pandas, xmltodict, bs4 (installed as the `beautifulsoup4` package), lxml

In [1]:
import xmltodict
import pandas as pd
from collections import OrderedDict
from bs4 import BeautifulSoup

In [2]:
# Input file: the XML (RSS) export parsed by the cells below; the path is
# relative to the notebook's working directory.
file1 = 'asi-monuments2.xml'

In [3]:
# Read the whole document as UTF-8 text first, then turn it into nested
# mappings. attr_prefix='' keeps XML attribute names bare in the result keys
# (e.g. 'version' rather than a prefixed key).
with open(file1, encoding='utf8') as xml_file:
    xml_text = xml_file.read()
data = xmltodict.parse(xml_text, attr_prefix='')

In [4]:
# number of top-level keys in the parsed document
len(data)

1

In [5]:
# name(s) of the top-level key(s), i.e. the XML root element
data.keys()

odict_keys(['rss'])

In [6]:
# keys directly under the root 'rss' element
data.get('rss').keys()

odict_keys(['xmlns:georss', 'xmlns:atom', 'version', 'channel'])

In [7]:
# keys under rss -> channel; the records themselves are read from 'item' below
data.get('rss').get('channel').keys()

odict_keys(['title', 'description', 'link', 'atom:link', 'item'])

In [8]:
# keep a handle on the channel mapping for the inspection that follows
channel = data.get('rss').get('channel')

In [9]:
# number of keys in the channel mapping
len(channel)

5

In [10]:
# All <item> entries of the feed. xmltodict only produces a list when a tag
# repeats: a lone <item> comes back as a single mapping and a missing one as
# None. Normalise to a list so slicing, len() and the loop further down always
# see a sequence of items.
items = data.get('rss').get('channel').get('item')
if items is None:
    items = []
elif not isinstance(items, list):
    items = [items]

# peek at the first two records
items[:2]

[OrderedDict([('title', 'monuments2.fid-3f3c22b6_1667d4434db_44be'),
              ('link',
               'http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/reflect?format=application%2Fatom%2Bxml&layers=asi%3Amonuments2&featureid=monuments2.fid-3f3c22b6_1667d4434db_44be'),
              ('guid',
               'http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/reflect?format=application%2Fatom%2Bxml&layers=asi%3Amonuments2&featureid=monuments2.fid-3f3c22b6_1667d4434db_44be'),
              ('description',
               '<h4>monuments2</h4>\n\n<ul class="textattributes">\n  <li><strong><span class="atr-name">gid</span>:</strong> <span class="atr-value">12067</span></li>\n  <li><strong><span class="atr-name">monumentna</span>:</strong> <span class="atr-value">Shikargah Kusak</span></li>\n  <li><strong><span class="atr-name">descriptio</span>:</strong> <span class="atr-value">Shikargah Kusak</span></li>\n  <li><strong><span class="atr-name">location</span>:</strong> <span class="atr-value">Teen 

In [11]:
# How many records (feed items) do we have?
len(items)

4226

In [12]:
# Initialise the collector: a list that receives one row mapping per item
# in the loop further down.
collector = []

In [13]:
# Trial run on the first item before looping over all of them
item = items[0]
item

OrderedDict([('title', 'monuments2.fid-3f3c22b6_1667d4434db_44be'),
             ('link',
              'http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/reflect?format=application%2Fatom%2Bxml&layers=asi%3Amonuments2&featureid=monuments2.fid-3f3c22b6_1667d4434db_44be'),
             ('guid',
              'http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/reflect?format=application%2Fatom%2Bxml&layers=asi%3Amonuments2&featureid=monuments2.fid-3f3c22b6_1667d4434db_44be'),
             ('description',
              '<h4>monuments2</h4>\n\n<ul class="textattributes">\n  <li><strong><span class="atr-name">gid</span>:</strong> <span class="atr-value">12067</span></li>\n  <li><strong><span class="atr-name">monumentna</span>:</strong> <span class="atr-value">Shikargah Kusak</span></li>\n  <li><strong><span class="atr-name">descriptio</span>:</strong> <span class="atr-value">Shikargah Kusak</span></li>\n  <li><strong><span class="atr-name">location</span>:</strong> <span class="atr-value">Teen Murti B

In [14]:
# the description field is an HTML fragment holding the attribute list
item['description']

'<h4>monuments2</h4>\n\n<ul class="textattributes">\n  <li><strong><span class="atr-name">gid</span>:</strong> <span class="atr-value">12067</span></li>\n  <li><strong><span class="atr-name">monumentna</span>:</strong> <span class="atr-value">Shikargah Kusak</span></li>\n  <li><strong><span class="atr-name">descriptio</span>:</strong> <span class="atr-value">Shikargah Kusak</span></li>\n  <li><strong><span class="atr-name">location</span>:</strong> <span class="atr-value">Teen Murti Bhawan</span></li>\n  <li><strong><span class="atr-name">district</span>:</strong> <span class="atr-value">New Delhi</span></li>\n  <li><strong><span class="atr-name">state</span>:</strong> <span class="atr-value">Delhi</span></li>\n  <li><strong><span class="atr-name">asicircle</span>:</strong> <span class="atr-value">Delhi Mini Circle</span></li>\n  \n  <li><strong><span class="atr-name">photo</span>:</strong> <span class="atr-value">DLMDL135.jpg</span></li>\n  <li><strong><span class="atr-name">mon_num</

In [15]:
# The description is HTML, so parse it with BeautifulSoup (lxml parser)
# to pull the individual fields out.
soup =  BeautifulSoup(item['description'],'lxml')
soup

<html><body><h4>monuments2</h4>
<ul class="textattributes">
<li><strong><span class="atr-name">gid</span>:</strong> <span class="atr-value">12067</span></li>
<li><strong><span class="atr-name">monumentna</span>:</strong> <span class="atr-value">Shikargah Kusak</span></li>
<li><strong><span class="atr-name">descriptio</span>:</strong> <span class="atr-value">Shikargah Kusak</span></li>
<li><strong><span class="atr-name">location</span>:</strong> <span class="atr-value">Teen Murti Bhawan</span></li>
<li><strong><span class="atr-name">district</span>:</strong> <span class="atr-value">New Delhi</span></li>
<li><strong><span class="atr-name">state</span>:</strong> <span class="atr-value">Delhi</span></li>
<li><strong><span class="atr-name">asicircle</span>:</strong> <span class="atr-value">Delhi Mini Circle</span></li>
<li><strong><span class="atr-name">photo</span>:</strong> <span class="atr-value">DLMDL135.jpg</span></li>
<li><strong><span class="atr-name">mon_num</span>:</strong> <span c

In [16]:
# Collect every <span>; in this markup they alternate between an attribute
# name ("atr-name") and its value ("atr-value").
meta = soup.select('span')
meta

[<span class="atr-name">gid</span>,
 <span class="atr-value">12067</span>,
 <span class="atr-name">monumentna</span>,
 <span class="atr-value">Shikargah Kusak</span>,
 <span class="atr-name">descriptio</span>,
 <span class="atr-value">Shikargah Kusak</span>,
 <span class="atr-name">location</span>,
 <span class="atr-value">Teen Murti Bhawan</span>,
 <span class="atr-name">district</span>,
 <span class="atr-value">New Delhi</span>,
 <span class="atr-name">state</span>,
 <span class="atr-value">Delhi</span>,
 <span class="atr-name">asicircle</span>,
 <span class="atr-value">Delhi Mini Circle</span>,
 <span class="atr-name">photo</span>,
 <span class="atr-value">DLMDL135.jpg</span>,
 <span class="atr-name">mon_num</span>,
 <span class="atr-value">DLMDL135</span>,
 <span class="atr-name">pb_name</span>,
 <span class="atr-value">DLM135</span>]

In [17]:
# number of spans; expected to be even since they come as name/value pairs
len(meta)

20

In [18]:
# Walk the spans two at a time: even positions hold names, odd positions values.
for name_span, value_span in zip(meta[0::2], meta[1::2]):
    print(name_span.text, value_span.text)

gid 12067
monumentna Shikargah Kusak
descriptio Shikargah Kusak
location Teen Murti Bhawan
district New Delhi
state Delhi
asicircle Delhi Mini Circle
photo DLMDL135.jpg
mon_num DLMDL135
pb_name DLM135


## Now looping through all items in the XML

In [19]:
def _item_to_row(item):
    """Flatten one feed item into an ordered mapping of column name -> value.

    The fixed columns (title, link, guid, latitude, longitude) come straight
    from the item. Every remaining column is scraped from the HTML in the
    item's 'description', where <span> elements alternate between an
    attribute name and its value.

    Parameters
    ----------
    item : mapping
        One entry of ``items`` as produced by xmltodict.

    Returns
    -------
    OrderedDict
        The row, with keys in insertion order.
    """
    row = OrderedDict()
    row['title'] = item.get('title')
    row['link'] = item.get('link')
    row['guid'] = item.get('guid')

    # 'georss:point' is a single whitespace-separated string: split it once,
    # first token -> latitude, second token -> longitude (6 decimal places).
    point = item.get('georss:point').split()
    row['latitude'] = round(float(point[0]), 6)
    row['longitude'] = round(float(point[1]), 6)

    # Pair the spans positionally: even index = name, odd index = value.
    # A trailing unpaired span, if any, is ignored.
    soup = BeautifulSoup(item.get('description'), 'lxml')
    spans = soup.select('span')
    for name_span, value_span in zip(spans[0::2], spans[1::2]):
        row[name_span.text] = value_span.text

    return row


# Rebuild the list from scratch instead of appending to whatever is already
# there, so re-running this cell cannot duplicate rows in the final table.
collector = [_item_to_row(item) for item in items]

In [20]:
# one DataFrame row per collected item; columns follow the rows' key order
df = pd.DataFrame(collector)

In [21]:
# Preview only the first rows rather than rendering the whole table.
df.head(10)

Unnamed: 0,title,link,guid,latitude,longitude,gid,monumentna,descriptio,location,district,state,asicircle,photo,mon_num,pb_name
0,monuments2.fid-3f3c22b6_1667d4434db_44be,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.603323,77.198151,12067,Shikargah Kusak,Shikargah Kusak,Teen Murti Bhawan,New Delhi,Delhi,Delhi Mini Circle,DLMDL135.jpg,DLMDL135,DLM135
1,monuments2.fid-3f3c22b6_1667d4434db_44bf,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.581330,77.231309,12068,Unknown Tomb in the vicinitty of Jawahar Lal N...,Unknown Tomb in the vicinitty of Jawahar Lal N...,"Pragati Vihar, New Delhi",South,Delhi,Delhi Mini Circle,DLMDL174.jpg,DLMDL174,DLM174
2,monuments2.fid-3f3c22b6_1667d4434db_44c0,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.627473,77.216701,12069,Jantar Mantar,Jantar Mantar,Cannought Place,New Delhi,Delhi,Delhi Mini Circle,DLMDL023.jpg,DLMDL023,DLM023
3,monuments2.fid-3f3c22b6_1667d4434db_44c1,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.593306,77.250726,11299,"Humayun's tomb, its platforms","Humayun's tomb, its platforms",Nizamuddin,South East,Delhi,Delhi Circle,DELDL127A.JPG,DELDL127A,DL127A
4,monuments2.fid-3f3c22b6_1667d4434db_44c2,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.592463,77.247118,11300,"Humayun's tomb, its platforms, garden, enclosu...","Humayun's tomb, its platforms, garden, enclosu...",Nizamuddin,South East,Delhi,Delhi Circle,DELDL127B.JPG,DELDL127B,DL127B
5,monuments2.fid-3f3c22b6_1667d4434db_44c3,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.565007,77.187884,11301,"Unnamed tomb, Mohammad Pur Village","Unnamed tomb, Mohammad Pur Village",Mohammad Pur Village,South,Delhi,Delhi Circle,DELDL110.JPG,DELDL110,DL110
6,monuments2.fid-3f3c22b6_1667d4434db_44c4,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.588039,77.247928,11302,Tomb of Khan-i-Khana,Tomb of Khan-i-Khana,Nizamuddin,South East,Delhi,Delhi Circle,DELDL133.JPG,DELDL133,DL133
7,monuments2.fid-3f3c22b6_1667d4434db_44c5,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.531588,77.223234,11303,Sat pula,Sat pula,Khirki Village near Malviya Nagar,South,Delhi,Delhi Circle,DELDL087.JPG,DELDL087,DL087
8,monuments2.fid-3f3c22b6_1667d4434db_44c6,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.505062,77.267324,11304,"Walls, gate and bastions of Adilabad (Mohammad...","Walls, gate and bastions of Adilabad (Mohammad...",Tughlaqabad,South East,Delhi,Delhi Circle,DELDL145.JPG,DELDL145,DL145
9,monuments2.fid-3f3c22b6_1667d4434db_44c7,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,http://bhuvan5.nrsc.gov.in:80/bhuvan/asi/wms/r...,28.572319,77.222252,11305,Mosque attached to Mubarak shah Tomb,Mosque attached to Mubarak shah Tomb,Kotla Mubrakpur Village,South,Delhi,Delhi Circle,DELDL107.JPG,DELDL107,DL107


In [22]:
# Write the table to CSV; the DataFrame index is saved as a leading column
# labelled 'sr'.
df.to_csv('asi-monuments2.csv',index_label='sr')

## The data is now saved in asi-monuments2.csv
You can drag and drop the file onto geojson.io for a quick visualisation.