In [7]:
import xml.etree.ElementTree as ET

In [8]:
tree = ET.parse('data/policy.xml')
root = tree.getroot()

In [9]:
print(root.tag)

POLICY


In [10]:
print(root.attrib)

{'policy_url': 'http://about.officemax.com/privacy/', 'website_index': '088', 'website_category': 'Shopping', 'modification_date': 'May 16, 2013', 'website_url': 'officemax.com'}


In [14]:
subtexts = []
for child in root:
    # print(child.tag, child.attrib)
    for c in child:
        # print(c.tag, c.attrib)
        # print(c.text)
        subtexts.append(c.text)

In [16]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

In [17]:
class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))
        # print self
        # print min_cut
        # print max_cut

    def _compute_frequencies(self, word_sent):
        freq = defaultdict(int)
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    freq[word] += 1
        max_freq = float(max(freq.values()))
        for word in freq.keys():
            freq[word] = freq[word] / max_freq
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        return freq

    def summarize(self, text, n):
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        rankings = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for word in sent:
                if word in self._freq:
                    # print self._freq[word]
                    # rankings = rankings + self._freq[word]
                    rankings[i] += self._freq[word]
                    # print rankings
                    # sent_idx = nlargest(n, rankings[i], key=rankings.get)
                    sent_idx = nlargest(n, rankings, key=rankings.get)
                    return [sents[j] for j in sent_idx]

In [18]:
fs = FrequencySummarizer()

In [23]:
print(subtexts[3])

Through its Web Sites, OfficeMax collects "Personally Identifiable Information" which is any information that can be used to specifically identify an individual, such as your name, mailing address, e-mail address, telephone number, social security number or credit card number. Users may provide us with Personally Identifiable Information and certain demographic information (such as gender, zip code, hometown or similar information) through Web Site features, including, surveys, online registration forms, sweepstakes entries and bulletin boards. Additionally, when you visit our Web Sites our servers automatically collect certain information from you, such as the number of pages viewed and accessed, browser type and IP address. Most of this information is non-personally identifiable information, but if we connect it or demographic information with Personally Identifiable Information, we will treat it as Personally Identifiable Information. We also collect purchasing information from you 

In [24]:
summary = fs.summarize(subtexts[3], 3)

In [25]:
summary

['Through its Web Sites, OfficeMax collects "Personally Identifiable Information" which is any information that can be used to specifically identify an individual, such as your name, mailing address, e-mail address, telephone number, social security number or credit card number.']