#### Task: Parse XML dump and write info to a dataframe
### Desired Output Format
> chemical formula, property, value, units 
>
> NaCl, Molar mass, 58.44, g mol^-1

### Steps
1. Inspect structure of XML file
> `<title>` Aluminium antimonide `</title>`
>
> `<text>` _Section1={{Chembox Identifiers [...] Section8={{Chembox Related_ `</text>`
2. Note: chembox info does not always contain all sections
3. Note: there is no explicit tag for chemical formula, only chemical name
4. **674** chemicals in the XML dump ... **538** have chembox info in sections
5. **First: Read the XML File**

    _For each Chemical:_
    * Store the chemical name
    * Identify the sections in the chemical text
    * Extract each section's text 
    * Extract and store the info from each section's text
    * If section2, do something hacky to extract the chemical formula and store it
    * _Clean! Clean! Clean!_

In [1]:
import pandas as pd
import numpy as np
import re
from collections import OrderedDict
from xml.dom.minidom import parse
import xml.dom.minidom

class ChemBox(object):    
    
    def __init__(self, num_chemicals=0):
        self.num_chemicals = num_chemicals
        
        self.chem_table = []
        self.chem_dict = OrderedDict()
        self.chem_formula_lst = []        
    
    def chem_xml_parser(self, file_path, tag="page"):
        '''Input : File Path
           Output: Dictionary {Chem_Names: DOM Object}
        '''
        DOMTree = parse(file_path)
        chem_elements = DOMTree.getElementsByTagName(tag)
        self.num_chemicals = len(chem_elements)        
        name_lst = DOMTree.getElementsByTagName("title")

        for i in range(self.num_chemicals):
            name = name_lst[i].firstChild.data
            self.chem_dict[name] = chem_elements[i]
            
       
    def sections_map(self, chemical_text):
        '''Input : Chemical Text
           Output: List of sections identified in the text
        '''
        section_lst = re.compile(r'Section(\d*)=', flags=re.DOTALL).findall(chemical_text)
        num_sects = len(section_lst)
        
        sect_map = []
        
        if section_lst:
            
            for sect in range(num_sects):
                if sect != (num_sects - 1):
                    sect_map.append(['Section' + section_lst[sect], 'Section' + section_lst[sect+1]])
                else:
                    sect_map.append(['Section' + section_lst[sect], "\'\'\'"])
            return sect_map
        else:
            return False


    def extract_section(self, chemical_text, section_pair):
        '''Input : XML text for specific chemical
           Output: Chemical text for specified section
        '''
        if len(section_pair) != 2:
            return False
        sect_start, sect_end = section_pair
        
        # extract text between start & end !!!
        chemical_text = chemical_text.replace('\n\n', '\n')
        section_txt = re.compile(r'%s=(.+?)%s' %(sect_start, sect_end), flags=re.DOTALL).findall(chemical_text)[0]

        # the chemical formula is found in section2 = chemical properties
        if sect_start == "Section2":
            
            if 'Formula' in section_txt:

                if bool(re.search(r'Formula\s*=\s*\n', section_txt)):                    
                    # line under: 'Formula =\n NaCl'
                    chem_formula = re.compile(r'Formula\s*=\s*\n(.+?)\n', flags=re.DOTALL).findall(section_txt)[0].strip()
                else:
                    # same line: 'Formula = NaCl'
                    chem_formula = re.compile(r'Formula\s*=\s*(.+?)\n', flags=re.DOTALL).findall(section_txt)[0].strip()                                
            else:
                chem_formula = 'unknown'

            self.chem_formula_lst.append(chem_formula)
            section_txt = section_txt.replace(chem_formula, '').replace('Formula =', '').strip()

        section_txt = section_txt.replace('{{', '').replace('}}', '').strip()
        return section_txt
    
    
    def extract_chem_info(self, chemical, section_text):
        '''Input : Chemical name & text from specific section
           Output: Itemized chemical information
        '''
        chem_info_lst = section_text.split('\n') ####
        for line in chem_info_lst:
            line = line.strip()
            if line:
                if (line[0] == '|') & (' = ' in line) & ('_Ref' not in line):
                    prop, text_info = line.split('=', 1)
                    prop = prop.replace('|', '').strip() 
                    text_info = text_info.strip()                                            
                    self.chem_table.append([chemical, prop, text_info])


    def chemical_units(self, text):
        '''Input : Chemical Value/Property
           Output: Unit if available
        '''
        # regex search for chemical unit
        value = re.search(r'(\d+)(\.)(\d*) ' , text)
        
        if bool(value):    
            value = value.group(0)
            _, value, units = text.partition(value)
            return value.strip(), units.strip()        
        else:
            return text, ''
            
    def clean_text(self, text , pattern_lst):
        '''Input : Text & list of Patterns to check for
           Output: Clean Text
        '''
        for pattern in pattern_lst:
            if (pattern == '<\w+\ ?/>') | (pattern == '<ref(.+?)>'):
                text = re.sub(pattern, ' ', text)
            else:
                text = re.sub(pattern, '', text)
        text = text.strip()
        return text                

In [2]:
# Path of the XML dump read by this notebook
file_path = "Wikipedia-20161003174511.xml"

chembox = ChemBox()
chembox.chem_xml_parser(file_path) # parse the file; fills chembox.chem_dict and sets chembox.num_chemicals

# Report how many `page` elements were found (Python 2 print statement)
print 'Chemicals in XML File: %s' %chembox.num_chemicals

Chemicals in XML File: 674


In [3]:
print 'Dictionary of Chemicals & Their DOM Elements:\n'
for key, val in chembox.chem_dict.items()[:3]:
    print key, val
print '\t\t\t.'
print '\t\t\t.'
print '\t\t\t.'
for key, val in chembox.chem_dict.items()[-3:]:
    print key, val

Dictionary of Chemicals & Their DOM Elements:

Aluminium antimonide <DOM Element: page at 0x10b125b00>
Aluminium arsenate <DOM Element: page at 0x10b12af80>
Aluminium arsenide <DOM Element: page at 0x10b137518>
			.
			.
			.
Zirconium orthosilicate <DOM Element: page at 0x10dfed8c0>
Zirconium tetrafluoride <DOM Element: page at 0x10dff3ef0>
Zirconium tungstate <DOM Element: page at 0x10e800290>


In [4]:
empty = [] # List to store Chemicals w/o ChemBox info

for key, val in chembox.chem_dict.items():
    # extract chemical info text; a page without a <text> element, or with
    # an empty one (no child text node), is treated as having no text
    # instead of raising IndexError / AttributeError
    text_elements = val.getElementsByTagName("text")
    text_node = text_elements[0].firstChild if text_elements else None
    txt = text_node.data if text_node is not None else ''

    # evaluate the section split (False when the text has no sections)
    sections_map_lst = chembox.sections_map(txt)
    if not sections_map_lst:
        empty.append(key)
        continue

    # extract every section and itemize its '| property = value' lines
    for section_pair in sections_map_lst:
        section_txt = chembox.extract_section(txt, section_pair)
        chembox.extract_chem_info(key, section_txt)

In [5]:
# Report the chemicals for which sections_map found no 'SectionN=' markers
# (the trailing '\n' argument prints an extra blank line)
print 'Chemicals with No Info: %s' %len(empty), '\n'
print empty

Chemicals with No Info: 136 

[u'Aluminium boride', u'Aluminium potassium sulfate', u'Ammonium cerium(IV) nitrate', u'Ammonium tetrathiocyanatodiamminechromate(III)', u'Antimony hydride', u'Barium ferrite', u'Beryllium chloride', u'Bismuth(III) telluride', u'Boron oxide', u'Calcium oxychloride', u'Carbon tetrabromide', u'Carbonic acid', u'Carbonyl chloride', u'Carboplatin', u'Carborundum', u'Chlorine tetroxide', u'Chlorine trioxide', u'Chlorine', u'Chrome-alum', u'Cisplatin', u'Columbite', u'Copper oxychloride', u'Copper(II) sulfide', u'Dichlorine dioxide', u'Dichlorine tetroxide', u'Dysprosium oxide', u'Dysprosium titanate', u'Germanium(II) fluoride', u'Germanium(IV) fluoride', u'Germanium(II) chloride', u'Germanium(IV) chloride', u'Germanium(II) iodide', u'Germanium(IV) iodide', u'Germanium(II) oxide', u'Germanium(IV) oxide', u'Germanium(II) sulfide', u'Germanium(IV) sulfide', u'Germanium(IV) nitride', u'Gold(V) fluoride', u'Gold(I) iodide', u'Hafnium carbide', u'Hydroiodic acid', u'

In [6]:
# One row per extracted '| property = value' line; 'Text' holds the raw value
chembox_df = pd.DataFrame(chembox.chem_table, columns=['Chemical', 'Property', 'Text'])

# Strip references, tags, newlines and pipes from the raw values.
# NOTE(review): the trailing '(.*)' also removes everything after a tag on
# the same line, which looks like why 'g/cm<sup>3</sup>' ends up as 'g/cm'
# in the Density rows -- confirm whether that is intended.
patterns = ['<ref(.+?)>(.*)', '</?\w+>(.*)', '<\w+\ ?/>(.*)', '\n', '\|']
chembox_df['Text'] = chembox_df['Text'].apply(lambda raw: chembox.clean_text(raw, patterns))

# Split each cleaned value into a (value, units) pair
val_unit = [chembox.chemical_units(cell_text) for cell_text in chembox_df['Text']]
units_df = pd.DataFrame(val_unit, columns=['Value', 'Units'])
chembox_df = pd.concat([chembox_df, units_df], axis=1)

# Put the parsed columns ahead of the raw text
cols = ['Chemical', 'Property', 'Value', 'Units', 'Text']

chembox_df = chembox_df[cols]
chembox_df.head()

Unnamed: 0,Chemical,Property,Value,Units,Text
0,Aluminium antimonide,SMILES,[Al]#[Sb],,[Al]#[Sb]
1,Aluminium antimonide,ChemSpiderID,82452,,82452
2,Aluminium antimonide,InChI,1/Al.Sb/rAlSb/c1-2,,1/Al.Sb/rAlSb/c1-2
3,Aluminium antimonide,InChIKey,LVQULNGDVIKLPK-XFZGNPHIAJ,,LVQULNGDVIKLPK-XFZGNPHIAJ
4,Aluminium antimonide,StdInChI,1S/Al.Sb,,1S/Al.Sb


In [7]:
# (rows, columns) of the property table built above
print 'Chembox DF Shape: {}'.format(chembox_df.shape)

Chembox DF Shape: (14693, 5)


In [8]:
# Dataframe of chemical name & formula
name = np.atleast_2d(chembox_df['Chemical'].unique()).T
formula = np.atleast_2d(chembox.chem_formula_lst).T
chem_id = pd.DataFrame(np.hstack([name, formula]), columns=['Chemical', 'Formula'])

# cleaning data checking for certain patterns
patterns = ['<ref(.+?)>(.*)', '</?\w+>', '<\w+\ ?/>', '\n', '\[', '\]', '\|']
chem_id['Formula'] = chem_id['Formula'].apply(lambda x: x if not x else chembox.clean_text(x, patterns))
chem_id.head()

Unnamed: 0,Chemical,Formula
0,Aluminium antimonide,Al=1 Sb=1
1,Aluminium arsenate,AlAsO4
2,Aluminium arsenide,Al=1 As=1
3,Aluminium bromide,AlBr3 Al2Br6
4,Aluminium carbide,Al4C3


In [9]:
# (rows, columns) of the name/formula table
print 'Chemical_ID DF Shape: {}'.format(chem_id.shape)

Chemical_ID DF Shape: (538, 2)


In [10]:
unknown_formulas = filter(lambda x: x[1] == 'unknown', zip(chem_id.Chemical, chem_id.Formula))
print 'No. Chemicals Formulas TBD:', len(unknown_formulas)
unknown_formulas[:5]

No. Chemicals Formulas TBD: 80


[(u'Aluminium oxide', u'unknown'),
 (u'Ammonium chloride', u'unknown'),
 (u'Antimony pentachloride', u'unknown'),
 (u'Barium ferrate', u'unknown'),
 (u'Beryllium hydroxide', u'unknown')]

--------------------------
**Final DF**

In [11]:
# Merging the formula df to the chembox dataframe
chembox_df = pd.merge(left=chembox_df, right=chem_id, on='Chemical')

# Rearranging columns
cols = chembox_df.columns.tolist()
cols = cols[:1] + cols[-1:] + cols[1:-1]
chembox_df = chembox_df[cols]

chembox_df.head()

Unnamed: 0,Chemical,Formula,Property,Value,Units,Text
0,Aluminium antimonide,Al=1 Sb=1,SMILES,[Al]#[Sb],,[Al]#[Sb]
1,Aluminium antimonide,Al=1 Sb=1,ChemSpiderID,82452,,82452
2,Aluminium antimonide,Al=1 Sb=1,InChI,1/Al.Sb/rAlSb/c1-2,,1/Al.Sb/rAlSb/c1-2
3,Aluminium antimonide,Al=1 Sb=1,InChIKey,LVQULNGDVIKLPK-XFZGNPHIAJ,,LVQULNGDVIKLPK-XFZGNPHIAJ
4,Aluminium antimonide,Al=1 Sb=1,StdInChI,1S/Al.Sb,,1S/Al.Sb


In [12]:
# (rows, columns) of the merged table
print 'Chembox DF Shape: {}'.format(chembox_df.shape)

Chembox DF Shape: (14693, 6)


-------------
**"Eye Test" Inspection**

In [13]:
# Columns shown in the inspection cells below: everything except the raw 'Text'
cols = [col for col in chembox_df.columns if col != 'Text']

In [14]:
# First ten rows whose 'Units' entry is non-empty
mask = chembox_df['Units'] != ''
chembox_df.loc[mask, cols].head(10)

Unnamed: 0,Chemical,Formula,Property,Value,Units
9,Aluminium antimonide,Al=1 Sb=1,MolarMass,148.742,g/mol
11,Aluminium antimonide,Al=1 Sb=1,Density,4.26,g/cm
15,Aluminium antimonide,Al=1 Sb=1,BandGap,1.58,eV
20,Aluminium antimonide,Al=1 Sb=1,DeltaHf,50.4,kJ/mol
32,Aluminium arsenate,AlAsO4,MolarMass,165.901,g/mol
34,Aluminium arsenate,AlAsO4,Density,3.25,g/cm
39,Aluminium arsenate,AlAsO4,DeltaHf,1431.1,kJ/mol
40,Aluminium arsenate,AlAsO4,Entropy,145.6,J/mol K
50,Aluminium arsenide,Al=1 As=1,MolarMass,101.9031,g/mol
52,Aluminium arsenide,Al=1 As=1,Density,3.72,g/cm


In [15]:
# First ten rows for the 'MolarMass' property
mask = chembox_df['Property'] == 'MolarMass'
chembox_df.loc[mask, cols].head(10)

Unnamed: 0,Chemical,Formula,Property,Value,Units
9,Aluminium antimonide,Al=1 Sb=1,MolarMass,148.742,g/mol
32,Aluminium arsenate,AlAsO4,MolarMass,165.901,g/mol
50,Aluminium arsenide,Al=1 As=1,MolarMass,101.9031,g/mol
95,Aluminium bromide,AlBr3 Al2Br6,MolarMass,266.69,g/mol
124,Aluminium carbide,Al4C3,MolarMass,143.95853,g/mol
148,Aluminium iodide,AlI3,MolarMass,407.69495,g/mol (anhydrous)
169,Aluminium nitride,Al=1 N=1,MolarMass,40.9882,g/mol
235,Aluminium phosphide,Al=1 P=1,MolarMass,57.9552,g/mol
272,Aluminium chloride,AlCl3,MolarMass,133.34,g/mol (anhydrous)
324,Aluminium fluoride,AlF3,MolarMass,83.9767,g/mol (anhydrous)


In [16]:
# First ten rows for the 'Density' property
mask = chembox_df['Property'] == 'Density'
chembox_df.loc[mask, cols].head(10)

Unnamed: 0,Chemical,Formula,Property,Value,Units
11,Aluminium antimonide,Al=1 Sb=1,Density,4.26,g/cm
34,Aluminium arsenate,AlAsO4,Density,3.25,g/cm
52,Aluminium arsenide,Al=1 As=1,Density,3.72,g/cm
98,Aluminium bromide,AlBr3 Al2Br6,Density,3.205,g/cm
127,Aluminium carbide,Al4C3,Density,2.36,g/cm
150,Aluminium iodide,AlI3,Density,3.98,g/cm
171,Aluminium nitride,Al=1 N=1,Density,3.26,g/cm
201,Aluminium oxide,unknown,Density,4.1,g/cm
238,Aluminium phosphide,Al=1 P=1,Density,2.85,g/cm
274,Aluminium chloride,AlCl3,Density,2.48,g/cm


In [17]:
# First ten rows recorded for a single chemical: Aluminium antimonide
mask = chembox_df['Chemical'] == 'Aluminium antimonide'
chembox_df.loc[mask, cols].head(10)

Unnamed: 0,Chemical,Formula,Property,Value,Units
0,Aluminium antimonide,Al=1 Sb=1,SMILES,[Al]#[Sb],
1,Aluminium antimonide,Al=1 Sb=1,ChemSpiderID,82452,
2,Aluminium antimonide,Al=1 Sb=1,InChI,1/Al.Sb/rAlSb/c1-2,
3,Aluminium antimonide,Al=1 Sb=1,InChIKey,LVQULNGDVIKLPK-XFZGNPHIAJ,
4,Aluminium antimonide,Al=1 Sb=1,StdInChI,1S/Al.Sb,
5,Aluminium antimonide,Al=1 Sb=1,StdInChIKey,LVQULNGDVIKLPK-UHFFFAOYSA-N,
6,Aluminium antimonide,Al=1 Sb=1,CASNo,25152-52-7,
7,Aluminium antimonide,Al=1 Sb=1,PubChem,91307,
8,Aluminium antimonide,Al=1 Sb=1,EINECS,246-667-3,
9,Aluminium antimonide,Al=1 Sb=1,MolarMass,148.742,g/mol


In [18]:
# First ten rows recorded for a single chemical: Aluminium phosphide
mask = chembox_df['Chemical'] == 'Aluminium phosphide'
chembox_df.loc[mask, cols].head(10)

Unnamed: 0,Chemical,Formula,Property,Value,Units
224,Aluminium phosphide,Al=1 P=1,ChemSpiderID,28171,
225,Aluminium phosphide,Al=1 P=1,UNII,E23DR6L59S,
226,Aluminium phosphide,Al=1 P=1,InChI,1/Al.P/rAlP/c1-2,
227,Aluminium phosphide,Al=1 P=1,SMILES,[Al]#P,
228,Aluminium phosphide,Al=1 P=1,InChIKey,PPNXXZIBFHTHDM-LQQCNYPFAR,
229,Aluminium phosphide,Al=1 P=1,StdInChI,1S/Al.P,
230,Aluminium phosphide,Al=1 P=1,StdInChIKey,PPNXXZIBFHTHDM-UHFFFAOYSA-N,
231,Aluminium phosphide,Al=1 P=1,CASNo,20859-73-8,
232,Aluminium phosphide,Al=1 P=1,PubChem,30332,
233,Aluminium phosphide,Al=1 P=1,RTECS,BD1400000,


------------
**DataFrame to CSV**

In [19]:
# writing files to csv

# Slim export: every column except the chemical name and the raw text,
# i.e. columns = ['Formula', 'Property', 'Value', 'Units']
cols = [col for col in chembox_df.columns if col not in ('Chemical', 'Text')]
chembox_df[cols].to_csv('Chembox_Info.csv', index=False, encoding='utf-8')

# Full export: columns = ['Chemical', 'Formula', 'Property', 'Value', 'Units', 'Text']
chembox_df.to_csv('Chembox_Info_All.csv', index=False, encoding='utf-8')