In [20]:
# https://docs.python.org/3.4/library/xml.etree.elementtree.html
import xml.etree.cElementTree as ET

import pandas as pd
import re
import pdb

In [21]:
# Load the XML data file into an ElementTree and grab its root element.
# An empty DataFrame is also created under the name `df`.
file = 'REF-1551442403585.xml'
tree = ET.parse(file)
df = pd.DataFrame()
root = tree.getroot()

In [22]:
# Print the 'foobar' attribute of every <type> element found directly under
# the root (Element.get yields None when the attribute is absent).
for type_elem in root.iterfind('type'):
    foobar_value = type_elem.get('foobar')
    print(foobar_value)

In [23]:
import xmltodict

In [24]:
# Parse a small inline XML sample to see how xmltodict represents a document:
# attributes become '@'-prefixed keys, repeated child elements become a list,
# and the text of an element that also has attributes is stored under '#text'.
# (The pasted REPL prompts were removed so the cell is plain, valid Python.)
doc = xmltodict.parse("""
<mydocument has="an attribute">
  <and>
    <many>elements</many>
    <many>more elements</many>
  </and>
  <plus a="complex">
    element as well
  </plus>
</mydocument>
""")

In [25]:
# XML attributes are exposed under '@'-prefixed keys.
doc['mydocument']['@has']

'an attribute'

In [26]:
# The two sibling <many> elements come back as a list of their texts.
doc['mydocument']['and']['many']

['elements', 'more elements']

In [28]:
doc

OrderedDict([('mydocument',
              OrderedDict([('@has', 'an attribute'),
                           ('and',
                            OrderedDict([('many',
                                          ['elements', 'more elements'])])),
                           ('plus',
                            OrderedDict([('@a', 'complex'),
                                         ('#text', 'element as well')]))]))])

In [29]:
# Attribute `a` of the <plus> element.
doc['mydocument']['plus']['@a']

'complex'

In [30]:
# Text content of <plus>; it lives under '#text' because the element also
# carries an attribute.
doc['mydocument']['plus']['#text']

'element as well'

## Let's try using the XML file

In [42]:
# Parse the resident data file into nested dictionaries.
# xmltodict.parse() expects XML content (or a file-like object), not a path,
# so the file has to be opened first. It is opened in binary mode and the
# handle is handed straight to the parser, so the XML parser decodes the bytes
# itself instead of relying on the platform's default text encoding.
with open('resident_data.xml', 'rb') as fd:
    doc = xmltodict.parse(fd)

In [43]:
doc

OrderedDict([('State',
              OrderedDict([('Resident',
                            [OrderedDict([('@Id', '100'),
                                          ('Name', 'Sample Name'),
                                          ('PhoneNumber', '1234567891'),
                                          ('EmailAddress',
                                           'sample_name@example.com'),
                                          ('Address',
                                           OrderedDict([('StreetLine1',
                                                         'Street Line1'),
                                                        ('City', 'City Name'),
                                                        ('StateCode', 'AE'),
                                                        ('PostalCode',
                                                         '12345')]))]),
                             OrderedDict([('@Id', '101'),
                                          ('Name', '

In [38]:
import json
import pandas
import requests
import xmltodict

In [39]:
# Fetch the basketball feed. The timeout stops the cell from hanging forever
# on an unresponsive server, and raise_for_status() reports an HTTP failure
# here instead of letting it surface later as a confusing XML parse error.
web_request = requests.get(
    u'http://xml.pinnaclesports.com/pinnacleFeed.aspx?sportType=Basketball',
    timeout=30,
)
web_request.raise_for_status()

In [40]:
# Make that unwieldy XML doc look like a native dictionary!
# NOTE(review): the recorded run of this cell failed with an ExpatError
# ("not well-formed"), i.e. the response body was not valid XML -- check what
# the endpoint actually returns before relying on `result`.
result = xmltodict.parse(web_request.text)

ExpatError: not well-formed (invalid token): line 9, column 82

In [None]:
# Pull the event records out of the parsed feed, falling back to empty
# containers when a level is missing, then round-trip them through JSON.
# The round trip turns the nested OrderedDicts into plain dicts/lists; it is
# not strictly necessary, but makes the structure easier to read.
feed_events = result.get('pinnacle_line_feed', {}).get(u'events', {}).get(u'event', [])
normal_dict = json.loads(json.dumps(feed_events))

In [50]:
# Now, make that dictionary into a dataframe.
# NOTE(review): `doc` is the parsed resident_data.xml at this point, so the
# result is a single 'State' column whose 'Resident' cell holds the whole list
# of records (see the transposed output below). Presumably a flat
# one-row-per-resident table was the goal -- confirm.
df = pd.DataFrame.from_dict(doc)

In [52]:
# Transpose so the single 'State' column is shown as one row.
df.T

Unnamed: 0,#text,Resident
State,.\n.\n.\n.,"[{'@Id': '100', 'Name': 'Sample Name', 'PhoneN..."


In [55]:
# Serialize the parsed document to a JSON string to inspect its full structure.
json.dumps(doc)

'{"State": {"Resident": [{"@Id": "100", "Name": "Sample Name", "PhoneNumber": "1234567891", "EmailAddress": "sample_name@example.com", "Address": {"StreetLine1": "Street Line1", "City": "City Name", "StateCode": "AE", "PostalCode": "12345"}}, {"@Id": "101", "Name": "Sample Name1", "PhoneNumber": "1234567891", "EmailAddress": "sample_name1@example.com", "Address": {"StreetLine1": "Current Address", "City": "Los Angeles", "StateCode": "CA", "PostalCode": "56666"}}], "#text": ".\\n.\\n.\\n."}}'

In [58]:
doc

OrderedDict([('State',
              OrderedDict([('Resident',
                            [OrderedDict([('@Id', '100'),
                                          ('Name', 'Sample Name'),
                                          ('PhoneNumber', '1234567891'),
                                          ('EmailAddress',
                                           'sample_name@example.com'),
                                          ('Address',
                                           OrderedDict([('StreetLine1',
                                                         'Street Line1'),
                                                        ('City', 'City Name'),
                                                        ('StateCode', 'AE'),
                                                        ('PostalCode',
                                                         '12345')]))]),
                             OrderedDict([('@Id', '101'),
                                          ('Name', '

In [None]:
# Scratch cell: construct an empty OrderedDict, the mapping type shown in the
# xmltodict outputs above. `collections` is imported here because no earlier
# cell imports it, so the bare call would raise NameError.
import collections

collections.OrderedDict()

In [60]:
import json
import pandas
import requests
import xmltodict

# Fetch the basketball feed. The timeout stops the cell from hanging on an
# unresponsive server, and raise_for_status() reports an HTTP failure here
# rather than as a confusing XML parse error further down.
web_request = requests.get(
    u'http://xml.pinnaclesports.com/pinnacleFeed.aspx?sportType=Basketball',
    timeout=30,
)
web_request.raise_for_status()

# Make that unwieldy XML doc look like a native dictionary!
# The raw bytes are parsed (not the re-decoded `.text`) so the XML parser
# applies the document's own encoding.
result = xmltodict.parse(web_request.content)

# Walk down to the event records. `or` fallbacks are used because a missing
# key and an empty element (which xmltodict maps to None) should both be
# treated as "no data".
feed = result.get('pinnacle_line_feed') or {}
events = (feed.get(u'events') or {}).get(u'event') or []

# Next, convert the nested OrderedDict to a real dict, which isn't strictly
#   necessary, but helps you visualize what the structure of the data looks like
normal_dict = json.loads(json.dumps(events))

# A lone <event> is returned as a single dict rather than a list; wrap it so
# the frame always has one row per event.
if isinstance(normal_dict, dict):
    normal_dict = [normal_dict]

# Now, make that dictionary into a dataframe
df = pandas.DataFrame.from_dict(normal_dict)

ExpatError: not well-formed (invalid token): line 9, column 82

# Epic method

In [None]:
# https://knanne.github.io/notebooks/pandas_process_XML.html

In [101]:
import requests

In [102]:
# Download the Atom feed; the timeout keeps the cell from blocking
# indefinitely if the server does not answer.
response = requests.get('https://knanne.github.io/feed.xml', timeout=30)

In [103]:
response.ok

True

In [104]:
import xmltodict
import json

In [105]:
%%time
# Parse the raw response bytes into nested dictionaries keyed by tag name.
d = xmltodict.parse(response.content)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.78 ms


In [68]:
d

OrderedDict([('feed',
              OrderedDict([('@xmlns', 'http://www.w3.org/2005/Atom'),
                           ('generator',
                            OrderedDict([('@uri', 'https://jekyllrb.com/'),
                                         ('@version', '3.7.4'),
                                         ('#text', 'Jekyll')])),
                           ('link',
                            [OrderedDict([('@href',
                                           'https://knanne.github.io/feed.xml'),
                                          ('@rel', 'self'),
                                          ('@type', 'application/atom+xml')]),
                             OrderedDict([('@href',
                                           'https://knanne.github.io/'),
                                          ('@rel', 'alternate'),
                                          ('@type', 'text/html')])]),
                           ('updated', '2018-11-25T16:32:05+00:00'),
                         

In [106]:
# Preview only the first 750 characters of the pretty-printed JSON, followed
# by three dotted lines to signal that the output was cut short.
pretty_json = json.dumps(d, indent=2)
print(pretty_json[:750])
print('.\n' * 3)

{
  "feed": {
    "@xmlns": "http://www.w3.org/2005/Atom",
    "generator": {
      "@uri": "https://jekyllrb.com/",
      "@version": "3.7.4",
      "#text": "Jekyll"
    },
    "link": [
      {
        "@href": "https://knanne.github.io/feed.xml",
        "@rel": "self",
        "@type": "application/atom+xml"
      },
      {
        "@href": "https://knanne.github.io/",
        "@rel": "alternate",
        "@type": "text/html"
      }
    ],
    "updated": "2018-11-25T16:32:05+00:00",
    "id": "https://knanne.github.io/feed.xml",
    "title": {
      "@type": "html",
      "#text": "knanne"
    },
    "subtitle": "Expert traveler, amateur farmer, beer / wine enthusiast - with a data science problem",
    "author": {
      "name": "Kai
.
.
.



In [107]:
import pandas as pd

In [109]:
# One row per feed entry; nested elements (title, link, author, ...) stay as
# dict/list objects inside their cells for now.
df = pd.DataFrame(data=d['feed']['entry'])

In [71]:
df.shape

(10, 9)

In [72]:
df.columns

Index(['title', 'link', 'published', 'updated', 'id', 'content', 'author',
       'category', 'summary'],
      dtype='object')

In [75]:
df.head()

Unnamed: 0,title,link,published,updated,id,content,author,category,summary
0,"{'@type': 'html', '#text': 'Notes On Excel'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-29T00:00:00+00:00,2018-05-29T00:00:00+00:00,https://knanne.github.io/posts/notes-on-excel,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},{'@term': 'excel'},"{'@type': 'html', '#text': 'Random notes on us..."
1,"{'@type': 'html', '#text': 'Notes On Shell'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-shell,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},"[{'@term': 'shell'}, {'@term': 'unix'}, {'@ter...","{'@type': 'html', '#text': 'Random notes on th..."
2,"{'@type': 'html', '#text': 'Notes On Ssh'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-ssh,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},{'@term': 'ssh'},"{'@type': 'html', '#text': 'Random notes on us..."
3,"{'@type': 'html', '#text': 'Notes On Pyspark'}",{'@href': 'https://knanne.github.io/posts/note...,2017-12-27T00:00:00+00:00,2017-12-27T00:00:00+00:00,https://knanne.github.io/posts/notes-on-pyspark,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},"[{'@term': 'pyspark'}, {'@term': 'spark'}, {'@...","{'@type': 'html', '#text': 'Random notes, link..."
4,"{'@type': 'html', '#text': 'Random Resources F...",{'@href': 'https://knanne.github.io/posts/rand...,2017-11-22T00:00:00+00:00,2017-11-22T00:00:00+00:00,https://knanne.github.io/posts/random-resource...,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},,"{'@type': 'html', '#text': 'Random notes, link..."


In [76]:
df['author']

0    {'name': 'Kain Nanne'}
1    {'name': 'Kain Nanne'}
2    {'name': 'Kain Nanne'}
3    {'name': 'Kain Nanne'}
4    {'name': 'Kain Nanne'}
5    {'name': 'Kain Nanne'}
6    {'name': 'Kain Nanne'}
7    {'name': 'Kain Nanne'}
8    {'name': 'Kain Nanne'}
9    {'name': 'Kain Nanne'}
Name: author, dtype: object

In [77]:
# Pull the 'name' value out of each author dict.
df['author'].map(lambda author: author.get('name'))

0    Kain Nanne
1    Kain Nanne
2    Kain Nanne
3    Kain Nanne
4    Kain Nanne
5    Kain Nanne
6    Kain Nanne
7    Kain Nanne
8    Kain Nanne
9    Kain Nanne
Name: author, dtype: object

In [79]:
# Expand each author dict into columns (one column per dict key; here just 'name').
df['author'].apply(pd.Series)

Unnamed: 0,name
0,Kain Nanne
1,Kain Nanne
2,Kain Nanne
3,Kain Nanne
4,Kain Nanne
5,Kain Nanne
6,Kain Nanne
7,Kain Nanne
8,Kain Nanne
9,Kain Nanne


In [81]:
# Expand the author dicts into their own column(s) and attach them to the
# frame, aligned on the row index.
author_columns = df['author'].apply(pd.Series)
df = df.join(author_columns, how='left')

In [83]:
df.shape

(10, 10)

In [84]:
df.head()

Unnamed: 0,title,link,published,updated,id,content,author,category,summary,name
0,"{'@type': 'html', '#text': 'Notes On Excel'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-29T00:00:00+00:00,2018-05-29T00:00:00+00:00,https://knanne.github.io/posts/notes-on-excel,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},{'@term': 'excel'},"{'@type': 'html', '#text': 'Random notes on us...",Kain Nanne
1,"{'@type': 'html', '#text': 'Notes On Shell'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-shell,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},"[{'@term': 'shell'}, {'@term': 'unix'}, {'@ter...","{'@type': 'html', '#text': 'Random notes on th...",Kain Nanne
2,"{'@type': 'html', '#text': 'Notes On Ssh'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-ssh,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},{'@term': 'ssh'},"{'@type': 'html', '#text': 'Random notes on us...",Kain Nanne
3,"{'@type': 'html', '#text': 'Notes On Pyspark'}",{'@href': 'https://knanne.github.io/posts/note...,2017-12-27T00:00:00+00:00,2017-12-27T00:00:00+00:00,https://knanne.github.io/posts/notes-on-pyspark,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},"[{'@term': 'pyspark'}, {'@term': 'spark'}, {'@...","{'@type': 'html', '#text': 'Random notes, link...",Kain Nanne
4,"{'@type': 'html', '#text': 'Random Resources F...",{'@href': 'https://knanne.github.io/posts/rand...,2017-11-22T00:00:00+00:00,2017-11-22T00:00:00+00:00,https://knanne.github.io/posts/random-resource...,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},,"{'@type': 'html', '#text': 'Random notes, link...",Kain Nanne


In [86]:
df.title.head()

0         {'@type': 'html', '#text': 'Notes On Excel'}
1         {'@type': 'html', '#text': 'Notes On Shell'}
2           {'@type': 'html', '#text': 'Notes On Ssh'}
3       {'@type': 'html', '#text': 'Notes On Pyspark'}
4    {'@type': 'html', '#text': 'Random Resources F...
Name: title, dtype: object

In [85]:
# Expand each title dict into columns: '@type' (attribute) and '#text' (the title itself).
df.title.apply(pd.Series)

Unnamed: 0,@type,#text
0,html,Notes On Excel
1,html,Notes On Shell
2,html,Notes On Ssh
3,html,Notes On Pyspark
4,html,Random Resources For Data Analysis
5,html,Preprocessing Data In Pandas
6,html,Notes On Regular Expression
7,html,Cleaning Data In Pandas
8,html,Automating Everything In Python
9,html,Calculations For Data Analysis


In [87]:
# Expand the title dicts into '@type' / '#text' columns and attach them to
# the frame, aligned on the row index.
title_columns = df.title.apply(pd.Series)
df = df.join(title_columns, how='left')

In [88]:
df.shape

(10, 12)

In [89]:
df.head()

Unnamed: 0,title,link,published,updated,id,content,author,category,summary,name,@type,#text
0,"{'@type': 'html', '#text': 'Notes On Excel'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-29T00:00:00+00:00,2018-05-29T00:00:00+00:00,https://knanne.github.io/posts/notes-on-excel,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},{'@term': 'excel'},"{'@type': 'html', '#text': 'Random notes on us...",Kain Nanne,html,Notes On Excel
1,"{'@type': 'html', '#text': 'Notes On Shell'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-shell,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},"[{'@term': 'shell'}, {'@term': 'unix'}, {'@ter...","{'@type': 'html', '#text': 'Random notes on th...",Kain Nanne,html,Notes On Shell
2,"{'@type': 'html', '#text': 'Notes On Ssh'}",{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-ssh,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},{'@term': 'ssh'},"{'@type': 'html', '#text': 'Random notes on us...",Kain Nanne,html,Notes On Ssh
3,"{'@type': 'html', '#text': 'Notes On Pyspark'}",{'@href': 'https://knanne.github.io/posts/note...,2017-12-27T00:00:00+00:00,2017-12-27T00:00:00+00:00,https://knanne.github.io/posts/notes-on-pyspark,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},"[{'@term': 'pyspark'}, {'@term': 'spark'}, {'@...","{'@type': 'html', '#text': 'Random notes, link...",Kain Nanne,html,Notes On Pyspark
4,"{'@type': 'html', '#text': 'Random Resources F...",{'@href': 'https://knanne.github.io/posts/rand...,2017-11-22T00:00:00+00:00,2017-11-22T00:00:00+00:00,https://knanne.github.io/posts/random-resource...,"{'@type': 'html', '@xml:base': 'https://knanne...",{'name': 'Kain Nanne'},,"{'@type': 'html', '#text': 'Random notes, link...",Kain Nanne,html,Random Resources For Data Analysis


In [90]:
# Swap the nested 'author' / 'title' dict columns for their flattened values:
# drop the originals, then give 'name' and '#text' the freed-up column names.
df = (
    df
    .drop(columns=['author', 'title'])
    .rename(columns={'name': 'author', '#text': 'title'})
)

In [92]:
df.category.head()

0                                   {'@term': 'excel'}
1    [{'@term': 'shell'}, {'@term': 'unix'}, {'@ter...
2                                     {'@term': 'ssh'}
3    [{'@term': 'pyspark'}, {'@term': 'spark'}, {'@...
4                                                  NaN
Name: category, dtype: object

In [93]:
# Naive expansion: a category cell is a single dict, a list of dicts, or NaN,
# so dict keys ('@term') and list positions (0, 1, ...) land in separate,
# misaligned columns -- see the output below.
df.category.apply(pd.Series)

  index = _union_indexes(indexes, sort=sort)
  result = result.union(other)
  result = result.union(other)


Unnamed: 0,@term,0,1,2,3
0,excel,,,,
1,,{'@term': 'shell'},{'@term': 'unix'},{'@term': 'bash'},
2,ssh,,,,
3,,{'@term': 'pyspark'},{'@term': 'spark'},{'@term': 'python'},{'@term': 'databricks'}
4,,,,,
5,,{'@term': 'python'},{'@term': 'pandas'},,
6,regex,,,,
7,,{'@term': 'python'},{'@term': 'pandas'},,
8,python,,,,
9,,{'@term': 'calculations'},{'@term': 'equations'},,


In [94]:
# Wrap lone values in a one-element list first, so every row expands
# positionally into columns 0, 1, 2, ...
df.category.apply(lambda cats: pd.Series(cats if isinstance(cats, list) else [cats]))

Unnamed: 0,0,1,2,3
0,{'@term': 'excel'},,,
1,{'@term': 'shell'},{'@term': 'unix'},{'@term': 'bash'},
2,{'@term': 'ssh'},,,
3,{'@term': 'pyspark'},{'@term': 'spark'},{'@term': 'python'},{'@term': 'databricks'}
4,,,,
5,{'@term': 'python'},{'@term': 'pandas'},,
6,{'@term': 'regex'},,,
7,{'@term': 'python'},{'@term': 'pandas'},,
8,{'@term': 'python'},,,
9,{'@term': 'calculations'},{'@term': 'equations'},,


In [95]:
# Build a long table with one row per (entry, category term), indexed by the
# entry's row label:
#   1. expand each category cell positionally (lone values are wrapped in a list),
#   2. stack the columns into a single Series,
#   3. keep just the '@term' string of each category dict,
#   4. move the list position out of the index into a 'term_index' column.
# In the recorded output, entry 4 (whose category is NaN) has no row.
exploded_column = (
    df.category
    .apply(lambda cats: pd.Series(cats if isinstance(cats, list) else [cats]))
    .stack()
    .rename('term')
    .apply(lambda cat: cat['@term'])
    .to_frame()
    .reset_index(level=1)
    .rename(columns={'level_1': 'term_index'})
)

In [96]:
exploded_column

Unnamed: 0,term_index,term
0,0,excel
1,0,shell
1,1,unix
1,2,bash
2,0,ssh
3,0,pyspark
3,1,spark
3,2,python
3,3,databricks
5,0,python


In [97]:
# Left-join the exploded terms back onto the entries by row index. Entries
# with several terms are repeated once per term (the recorded shape grows to
# 18 rows), and term_index shows up as float in the recorded output.
df = df.join(exploded_column, how='left')

In [98]:
df.shape

(18, 12)

In [99]:
df.head()

Unnamed: 0,link,published,updated,id,content,category,summary,author,@type,title,term_index,term
0,{'@href': 'https://knanne.github.io/posts/note...,2018-05-29T00:00:00+00:00,2018-05-29T00:00:00+00:00,https://knanne.github.io/posts/notes-on-excel,"{'@type': 'html', '@xml:base': 'https://knanne...",{'@term': 'excel'},"{'@type': 'html', '#text': 'Random notes on us...",Kain Nanne,html,Notes On Excel,0.0,excel
1,{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-shell,"{'@type': 'html', '@xml:base': 'https://knanne...","[{'@term': 'shell'}, {'@term': 'unix'}, {'@ter...","{'@type': 'html', '#text': 'Random notes on th...",Kain Nanne,html,Notes On Shell,0.0,shell
1,{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-shell,"{'@type': 'html', '@xml:base': 'https://knanne...","[{'@term': 'shell'}, {'@term': 'unix'}, {'@ter...","{'@type': 'html', '#text': 'Random notes on th...",Kain Nanne,html,Notes On Shell,1.0,unix
1,{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-shell,"{'@type': 'html', '@xml:base': 'https://knanne...","[{'@term': 'shell'}, {'@term': 'unix'}, {'@ter...","{'@type': 'html', '#text': 'Random notes on th...",Kain Nanne,html,Notes On Shell,2.0,bash
2,{'@href': 'https://knanne.github.io/posts/note...,2018-05-26T00:00:00+00:00,2018-05-26T00:00:00+00:00,https://knanne.github.io/posts/notes-on-ssh,"{'@type': 'html', '@xml:base': 'https://knanne...",{'@term': 'ssh'},"{'@type': 'html', '#text': 'Random notes on us...",Kain Nanne,html,Notes On Ssh,0.0,ssh
