### Initial round of testing for Naive Bayes and XML Identification

With a set of labelled responses (nb_training_201504\*.csv) and the set of all iso/ogc responses (nb_testing_201504\*.csv), we are trying to see if a naive bayes classifier can accurately identify a response as iso, as ogc, or as both iso and ogc. 

Steps:

from the training set:

* strip out root tags

* strip out namespaces

* add the label from the csv

from the test set:

* generate a sample set (sampled without replacement, skipping any digest that is already in the training set - my bad)

* strip out root tags

* strip out namespaces

* predict!


In [86]:
from lxml import etree
import csv
import os
import json
import random

# the xml processing: one parser instance, passed to every
# etree.fromstring call in this notebook
parser = etree.XMLParser(
    encoding='utf-8',
    remove_blank_text=True,
    remove_comments=True,
    recover=True,
)

def get_namespaces(xml):
    """Collect the namespaces declared anywhere in a parsed document.

    Namespaces returned for the root element are taken first, keyed by
    prefix, with the unprefixed (None) namespace stored under 'default'.
    URIs that only show up in the descendant query are then added under
    their own prefix, or under 'default<i>' when they have none (<i> is
    the position in the '//namespace::*' result).

    A descendant prefix that is already bound to a different URI gets a
    numeric suffix instead of overwriting the earlier entry, so no URI is
    dropped from the result.

    Parameters:
        xml: object exposing an lxml-style ``xpath`` method, or None.

    Returns:
        dict mapping prefix to namespace URI; {} when xml is None.
    """
    if xml is None:
        return {}

    document_namespaces = dict(xml.xpath('/*/namespace::*'))
    if None in document_namespaces:
        document_namespaces['default'] = document_namespaces.pop(None)

    # the same URI can come back many times from the descendant query;
    # a set keeps the "already recorded" check cheap
    known_uris = set(document_namespaces.values())

    # now run through any child namespace issues
    for i, (ns_prefix, uri) in enumerate(xml.xpath('//namespace::*')):
        if uri in known_uris:
            continue

        base_key = ns_prefix if ns_prefix else 'default%s' % i

        # never clobber an existing prefix -> uri binding: a prefix that is
        # re-declared with a new URI is stored under a suffixed key instead
        new_key = base_key
        suffix = i
        while new_key in document_namespaces:
            new_key = '%s%s' % (base_key, suffix)
            suffix += 1

        document_namespaces[new_key] = uri
        known_uris.add(uri)

    return document_namespaces

def prefix(tag, namespaces):
    """Rewrite a '{uri}local' tag as 'prefix:local'.

    Parameters:
        tag: element tag string, possibly carrying a '{uri}' part.
        namespaces: dict mapping prefix to namespace URI.

    Returns:
        The tag with every known '{uri}' replaced by 'prefix:'. An empty
        string is returned when the tag still contains both '{' and '}'
        afterwards, i.e. its namespace was not in ``namespaces``.
    """
    # .items() (rather than iteritems) runs on both Python 2 and 3; the loop
    # names are chosen so they do not shadow this function's own name
    for ns_prefix, uri in namespaces.items():
        tag = tag.replace('{%s}' % uri, ns_prefix + ':')

    if '{' in tag and '}' in tag:
        return ''

    return tag

# our file location
INPUT_DIR = '../testdata/solr_20150320/clean_20150325'
# sample size
SAMPLE_SIZE = 400

training_digests = []
training_data = []
with open('../testdata/nb_training_20150405.csv', 'rb') as csvfile:
    test_reader = csv.DictReader(csvfile, fieldnames=['label', 'digest'])
    
    for row in test_reader:
        # do the things
        if not os.path.exists(os.path.join(INPUT_DIR, row['digest'] + '_cleaned.json')):
            print 'no file:', os.path.join(INPUT_DIR, row['digest'] + '_cleaned.json')
            continue
        
        with open(os.path.join(INPUT_DIR, row['digest'] + '_cleaned.json'), 'r') as f:
            data = json.loads(f.read())
        
        content = data['content'].encode('unicode_escape')
        
        try:
            xml = etree.fromstring(content, parser=parser)
        except:
            print 'bad xml', row['digest']
            continue
        
        if xml is None:
            print 'wtf xml'
            continue
        
        ns = get_namespaces(xml)
        root = xml.xpath('/*')[0].tag
        prefixed_root = prefix(root, ns)
        
        features = [prefixed_root] + list(set(ns.values()))
    
        training_digests.append(row['digest'])
        training_data += [({feature: True}, row['label']) for feature in features]
    
print len(training_data)
print training_data[:5]
        
        

932
[({'default:WMS_Capabilities': True}, 'ogc'), ({'http://www.opengis.net/wms': True}, 'ogc'), ({'http://www.w3.org/XML/1998/namespace': True}, 'ogc'), ({'http://www.w3.org/1999/xlink': True}, 'ogc'), ({'http://www.w3.org/2001/XMLSchema-instance': True}, 'ogc')]


In [87]:
# let's put together our test data
# where we want some sampling mechanism

test_data = {}
test_digests = []
# this time, pull the digest list without processing anything
with open('../testdata/nb_testing_20150405.csv', 'rb') as csvfile:
    test_reader = csv.DictReader(csvfile, fieldnames=['label', 'digest'])
    
    for row in test_reader:
        # do the things
        if not os.path.exists(os.path.join(INPUT_DIR, row['digest'] + '_cleaned.json')):
            print 'no file:', os.path.join(INPUT_DIR, row['digest'] + '_cleaned.json')
            continue
        
        test_digests.append((row['digest'], row['label']))

# remove any digest found in our training set
test_digests = [t for t in test_digests if t[0] not in training_digests]

sample_digests = [test_digests[t] for t in random.sample(xrange(len(test_digests)), SAMPLE_SIZE)]

# and pulling the "text" - root tags and namespaces
# note: our initial test set is only ogc and iso
for digest, label in sample_digests:
    if not os.path.exists(os.path.join(INPUT_DIR, digest + '_cleaned.json')):
        print 'no file:', os.path.join(INPUT_DIR, digest + '_cleaned.json')
        continue

    with open(os.path.join(INPUT_DIR, digest + '_cleaned.json'), 'r') as f:
        data = json.loads(f.read())

    content = data['content'].encode('unicode_escape')

    try:
        xml = etree.fromstring(content, parser=parser)
    except:
        print 'bad xml', digest
        continue

    if xml is None:
        print 'wtf xml'
        continue

    ns = get_namespaces(xml)
    root = xml.xpath('/*')[0].tag
    prefixed_root = prefix(root, ns)
    
    features = [prefixed_root] + list(set(ns.values()))
    
    test_data[digest] = [({feature:True}, label) for feature in features]

print test_data.keys()[0], test_data[test_data.keys()[0]]

dfbf2b12ac70b05c416749ed229580f4 [({'gmi:MI_Metadata': True}, 'iso'), ({'http://www.isotc211.org/2005/gmi': True}, 'iso'), ({'http://www.opengis.net/gml/3.2': True}, 'iso'), ({'http://www.isotc211.org/2005/srv': True}, 'iso'), ({'http://www.w3.org/2001/XMLSchema-instance': True}, 'iso'), ({'http://www.isotc211.org/2005/gts': True}, 'iso'), ({'http://www.isotc211.org/2005/gmx': True}, 'iso'), ({'http://www.isotc211.org/2005/gmd': True}, 'iso'), ({'http://www.isotc211.org/2005/gsr': True}, 'iso'), ({'http://www.isotc211.org/2005/gss': True}, 'iso'), ({'http://www.isotc211.org/2005/gco': True}, 'iso'), ({'http://www.w3.org/XML/1998/namespace': True}, 'iso'), ({'http://www.w3.org/1999/xlink': True}, 'iso')]


In [88]:
# the naive bayes
#from sklearn.naive_bayes import GaussianNB

from nltk import NaiveBayesClassifier
import nltk.classify
from collections import Counter

classifier = NaiveBayesClassifier.train(training_data)

# print classifier.show_most_informative_features(200)

basic_classifier_results = {}

for k, v in test_data.iteritems():
    items = []
    for f, l in v:
        items.append(classifier.classify(f))
    accuracy = nltk.classify.accuracy(classifier, v)
    print k, [t[1] for t in test_digests if t[0] == k] 
    print '\t', accuracy, dict(Counter(items))  # , items
    
    basic_classifier_results[k] = {
        'accuracy': accuracy,
        'counts': dict(Counter(items)),
        'label': [t[1] for t in test_digests if t[0] == k][0]
    }



dfbf2b12ac70b05c416749ed229580f4 ['iso']
	1.0 {'iso': 13}
5ec8a5381d6da61813f7cf7b22ed7e7c ['iso']
	1.0 {'iso': 9}
cc7becfd02636988c07eb118542e3da4 ['iso']
	1.0 {'iso': 10}
1b22328f0d930b23963852e1d9425898 ['iso']
	1.0 {'iso': 12}
834734a779bfc874822e5152d95bb8f5 ['iso']
	1.0 {'iso': 12}
d9c88e4132b3afd8364fddd92a3a7156 ['iso']
	1.0 {'iso': 12}
006c19561ab047658c4d93cca2e26b65 ['iso']
	1.0 {'iso': 11}
0d6d51841b1dbbc0d37ab4f68f8b4a63 ['ogc']
	0.6 {'iso': 2, 'ogc': 3}
36ec09a019a0c0b60bcfa2757a29fa11 ['ogc']
	0.6 {'iso': 2, 'ogc': 3}
69652831ad3f5accd254b775be4c0cc4 ['iso']
	1.0 {'iso': 12}
f139042170e1a0f842c75f85b300e55a ['iso']
	1.0 {'iso': 12}
64e56fd8b99a26b2d821621cc5c82e9f ['iso']
	1.0 {'iso': 13}
71ba46afae15e2b15e8db810ea311325 ['iso']
	1.0 {'iso': 12}
d89bd2b83618d8f1172a00ba9980525d ['iso']
	1.0 {'iso': 12}
d4fc0f3102d12151839d5d31357c416f ['iso']
	1.0 {'iso': 11}
e3f08abc371f252f1ad51e2c13866746 ['iso']
	0.857142857143 {'iso': 6, 'ogc': 1}
8ed50a3a1b536b0fa631264d589ff1cf ['

In [89]:
# let's try with sets without namespace prefixes (so just the local-name())

def parse_feature(feature):
    """Strip the namespace prefix from a root-tag feature.

    Example: ({'default:WMS_Capabilities': True}, 'ogc')
          -> ({'WMS_Capabilities': True}, 'ogc')

    A key counts as a prefixed tag only when it has exactly one ':' and no
    '/'. Namespace URI features (http://..., https://..., urn:a:b, ...)
    therefore pass through untouched instead of being cut at their last
    colon.

    Parameters:
        feature: (feature_dict, label) pair; only the first key of
            feature_dict is inspected.

    Returns:
        (feature_dict, label) with the key reduced to its local name, or
        the original feature_dict when nothing needs stripping.
    """
    feature_set, label = feature[0], feature[1]
    if not feature_set:
        # nothing to inspect; hand the pair back as is
        return (feature_set, label)

    key = next(iter(feature_set))
    is_prefixed_tag = key.count(':') == 1 and '/' not in key
    if is_prefixed_tag:
        return ({key.split(':')[-1]: True}, label)
    return (feature_set, label)

# update the training data
training_data_noprefix = [parse_feature(f) for f in training_data]

# update the test data
test_data_noprefix = {}
for k, v in test_data.iteritems():
    test_data_noprefix[k] = [parse_feature(f) for f in v]

# retrain
classifier_noprefix = NaiveBayesClassifier.train(training_data_noprefix)

# print classifier_noprefix.show_most_informative_features(200)

noprefix_classifier_results = {}
for k, v in test_data_noprefix.iteritems():
    items = []
    for f, l in v:
        items.append(classifier_noprefix.classify(f))
    accuracy = nltk.classify.accuracy(classifier_noprefix, v)
    print k, [t[1] for t in test_digests if t[0] == k] 
    print '\t', accuracy, dict(Counter(items))
    
    noprefix_classifier_results[k] = {
        'accuracy': accuracy,
        'counts': dict(Counter(items)),
        'label': [t[1] for t in test_digests if t[0] == k][0]
    }

dfbf2b12ac70b05c416749ed229580f4 ['iso']
	1.0 {'iso': 13}
5ec8a5381d6da61813f7cf7b22ed7e7c ['iso']
	1.0 {'iso': 9}
cc7becfd02636988c07eb118542e3da4 ['iso']
	1.0 {'iso': 10}
1b22328f0d930b23963852e1d9425898 ['iso']
	1.0 {'iso': 12}
834734a779bfc874822e5152d95bb8f5 ['iso']
	1.0 {'iso': 12}
d9c88e4132b3afd8364fddd92a3a7156 ['iso']
	1.0 {'iso': 12}
006c19561ab047658c4d93cca2e26b65 ['iso']
	1.0 {'iso': 11}
0d6d51841b1dbbc0d37ab4f68f8b4a63 ['ogc']
	0.6 {'iso': 2, 'ogc': 3}
36ec09a019a0c0b60bcfa2757a29fa11 ['ogc']
	0.6 {'iso': 2, 'ogc': 3}
69652831ad3f5accd254b775be4c0cc4 ['iso']
	1.0 {'iso': 12}
f139042170e1a0f842c75f85b300e55a ['iso']
	1.0 {'iso': 12}
64e56fd8b99a26b2d821621cc5c82e9f ['iso']
	1.0 {'iso': 13}
71ba46afae15e2b15e8db810ea311325 ['iso']
	1.0 {'iso': 12}
17e74c319ac9fff41c1e501b256baaa7 ['iso']
	1.0 {'iso': 7}
d4fc0f3102d12151839d5d31357c416f ['iso']
	1.0 {'iso': 11}
e3f08abc371f252f1ad51e2c13866746 ['iso']
	0.857142857143 {'iso': 6, 'ogc': 1}
0c76c7942853763854ca62f53ee659a0 ['i

### let's have a think

it trains on a dict of term: true (common bag of words structure for document filtering in nb).

but i am using a bunch of single term dicts.

is it better/possible to use a dict of terms per label like the test data structure instead? would that change how uninformative the root tags are? (should those be informative anyway?)

In [90]:
# so let's try that


# redo the training data
combined_training_data = []
with open('../testdata/nb_training_20150405.csv', 'rb') as csvfile:
    test_reader = csv.DictReader(csvfile, fieldnames=['label', 'digest'])
    
    for row in test_reader:
        # do the things
        if not os.path.exists(os.path.join(INPUT_DIR, row['digest'] + '_cleaned.json')):
            print 'no file:', os.path.join(INPUT_DIR, row['digest'] + '_cleaned.json')
            continue
        
        with open(os.path.join(INPUT_DIR, row['digest'] + '_cleaned.json'), 'r') as f:
            data = json.loads(f.read())
        
        content = data['content'].encode('unicode_escape')
        
        try:
            xml = etree.fromstring(content, parser=parser)
        except:
            print 'bad xml', row['digest']
            continue
        
        if xml is None:
            print 'wtf xml'
            continue
        
        ns = get_namespaces(xml)
        root = xml.xpath('/*')[0].tag
        prefixed_root = prefix(root, ns)
        
        features = [prefixed_root] + list(set(ns.values()))
        
        feature_set = {feature: True for feature in features}
        combined_training_data.append((feature_set, row['label']))


# new classifier
combined_classifier = NaiveBayesClassifier.train(combined_training_data)

In [82]:
# rebuild the test data for the same
from itertools import chain

test_data_combined = {}
for k, v in test_data.items():
    # fold this digest's per-feature dicts into one feature set; every item
    # of a digest carries the same label, so the last one seen is kept
    new_features = {}
    label = ''
    for feature_dict, feature_label in v:
        new_features.update(feature_dict)
        label = feature_label

    test_data_combined[k] = [(new_features, label)]
    

In [91]:
# and go!
combined_classifier_results = {}
for k, v in test_data_combined.iteritems():
    items = []
    for f, l in v:
        items.append(combined_classifier.classify(f))
    accuracy = nltk.classify.accuracy(combined_classifier, v)
    print k, [t[1] for t in test_digests if t[0] == k] 
    print '\t', accuracy, dict(Counter(items))
    
    combined_classifier_results[k] = {
        'accuracy': accuracy,
        'counts': dict(Counter(items)),
        'label': [t[1] for t in test_digests if t[0] == k][0]
    }

f4d76d2eda56e753f9e2f5bf6ed494c3 ['iso']
	1.0 {'iso': 1}
8ed50a3a1b536b0fa631264d589ff1cf ['iso']
	1.0 {'iso': 1}
04329416be76a604cb938ebc53e8ff22 ['iso']
	1.0 {'iso': 1}
14e62f3bb8bd6d02ea39e3b3ca426fce ['ogc']
	1.0 {'ogc': 1}
3244eb4de702a5b31cd9d4a022e21cb0 ['iso']
	1.0 {'iso': 1}
d7be4975b1d88ae90bf8a66bbb97a3bc ['ogc']
	1.0 {'ogc': 1}
36ec09a019a0c0b60bcfa2757a29fa11 ['ogc']
	1.0 {'ogc': 1}
96207fa6bde0c91de959234e6d4a3c9f ['iso']
	1.0 {'iso': 1}
94f5cbba2f35a14ceecf52ae72d14322 ['iso']
	1.0 {'iso': 1}
6545a9985600eaf0707f83105b758abe ['ogc']
	1.0 {'ogc': 1}
9ffe2442ee41056f5a9e99a3e5501615 ['iso']
	1.0 {'iso': 1}
e39a3602828f281eef6f6ac4f3c2d9de ['ogc']
	1.0 {'ogc': 1}
02d372ce6f7682d992191c30c4d46821 ['iso']
	1.0 {'iso': 1}
e4a561e4cf0295e1b34cb7d9dd9fe648 ['iso']
	1.0 {'iso': 1}
4eab0f0503215e827f6b4976c1175724 ['ogc']
	1.0 {'ogc': 1}
82dbeaa8ceab42b52b558d130a84bdc1 ['iso']
	1.0 {'iso': 1}
667e87a7d7f37c5d0477f48a29df0e51 ['iso']
	1.0 {'iso': 1}
015c45caf7f2dcb0d21ee8eabd913db

okay then. 100% accuracy can't be good (i actually don't know what the sample looks like though).

let's see what happens when we just run one set (train iso, does it catch ogc?). i hope not given the previous results.

todo:

1. put together a relatively clean set of all things
2. with labels
3. run some sample against the iso/ogc combined model and see how that goes.
4. add a namespace stopwords corpus and exclude those from the modeling (no xlink paths, too common)
5. rerun the modeling.