In [1]:
import os
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import re
import string

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the records of ``sequence`` while displaying a progress widget.

    A text label and an ``IntProgress`` bar are stacked in a ``VBox`` and
    displayed once. They are refreshed on the first record and afterwards on
    every ``every``-th record.

    Parameters
    ----------
    sequence : iterable
        The records to pass through.
    every : int, optional
        Refresh interval, in records. When omitted and the total is known,
        it is 1 for totals up to 200 and ``int(size / 200)`` otherwise.
        It must be given when the total is unknown.
    size : int, optional
        Total number of records. When omitted, ``len(sequence)`` is used if
        the sequence supports it.
    name : str
        Prefix used in the label text.

    Yields
    ------
    Each record of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If the total is unknown and ``every`` was not supplied.

    Anything raised while the loop is suspended or iterating is re-raised
    after the bar has been switched to the ``'danger'`` style.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # The total stays unknown only when no size was passed and len() fails.
    total_unknown = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            total_unknown = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if total_unknown:
        # No meaningful fraction to show: a full bar in the 'info' style.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            if count == 1 or count % every == 0:
                if total_unknown:
                    label.value = '{name}: {index} / ?'.format(name=name, index=count)
                else:
                    progress.value = count
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name, index=count, size=size
                    )
            yield item
    except:
        # Deliberately bare: whatever interrupts the loop turns the bar red
        # and is then propagated untouched.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = count
        # count is still 0 when nothing was yielded; the label shows '?' then.
        label.value = "{name}: {index}".format(name=name, index=str(count or '?'))

In [3]:
# Training split of the 20 Newsgroups corpus; the `remove` argument asks for
# headers, footers and quotes to be dropped from each post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Number of posts, taken from the start of the training split, that the rest
# of the notebook works on. Raise it to index a larger part of the corpus.
N_DOCS = 100
texts = newsgroups_train.data[:N_DOCS]

In [5]:
# Display the selected raw posts.
texts

[u'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 u"A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't 

In [6]:
# Characters removed from every post: ASCII punctuation and decimal digits.
dgts = string.digits
_DROP_CHARS_RE = re.compile('[' + re.escape(string.punctuation + dgts) + ']')
# Raw string so that `\s` reaches the regex engine as written.
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Return ``text`` without punctuation/digits and with whitespace collapsed.

    Every character of ``string.punctuation`` and ``string.digits`` is
    deleted, each run of whitespace is replaced by a single space, and
    leading/trailing whitespace is stripped.
    """
    without_symbols = _DROP_CHARS_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', without_symbols).strip()


# One cleaned string per entry of `texts`, in the same order.
texts_filt = [clean_text(text) for text in texts]

In [7]:
# Display the cleaned posts.
texts_filt

[u'I was wondering if anyone out there could enlighten me on this car I saw the other day It was a door sports car looked to be from the late s early s It was called a Bricklin The doors were really small In addition the front bumper was separate from the rest of the body This is all I know If anyone can tellme a model name engine specs years of production where this car is made history or whatever info you have on this funky looking car please email',
 u'A fair number of brave souls who upgraded their SI clock oscillator have shared their experiences for this poll Please send a brief message detailing your experiences with the procedure Top speed attained CPU rated speed add on cards and adapters heat sinks hour of usage per day floppy disk functionality with and m floppies are especially requested I will be summarizing in the next two days so please add to the network knowledge base if you have done the clock upgrade and havent answered this poll Thanks',
 u'well folks my mac plus fi

In [8]:
# Inverted index: lower-cased word -> ids (positions in `texts_filt`) of the
# texts in which the word occurs.
ii = {}

for i_t, text in enumerate(log_progress(texts_filt)):
    for w in text.lower().split():
        # setdefault creates the id list the first time a word is seen.
        ii.setdefault(w, []).append(i_t)

# A word repeated inside one text was recorded several times; np.unique turns
# each list into a sorted array without duplicates. Iterating over a snapshot
# of the keys works on both Python 2 and Python 3.
for w in list(ii):
    ii[w] = np.unique(ii[w])

In [9]:
# Display the word -> text-id index.
ii

{u'limited': array([39, 82, 87]),
 u'prescot': array([54]),
 u'todays': array([13, 43, 96]),
 u'pardon': array([70]),
 u'ciao': array([68]),
 u'nfo': array([90]),
 u'elvs': array([13]),
 u'child': array([11, 67]),
 u'desirable': array([54]),
 u'yellow': array([17]),
 u'hitch': array([86]),
 u'four': array([82, 96]),
 u'facilities': array([82, 96]),
 u'protest': array([40]),
 u'katyusha': array([33]),
 u'perverted': array([70]),
 u'limiter': array([56]),
 u'undersecretary': array([54]),
 u'captain': array([21]),
 u'hate': array([37, 39]),
 u'assembled': array([70]),
 u'pack': array([38, 39]),
 u'forget': array([28]),
 u'looking': array([ 0,  2, 25, 43, 90]),
 u'fronts': array([70]),
 u'replacements': array([30]),
 u'violate': array([85]),
 u'aug': array([59]),
 u'paris': array([59, 82]),
 u'stinky': array([9]),
 u'supported': array([70]),
 u'neighbours': array([31]),
 u'bike': array([10]),
 u'restriction': array([37]),
 u'swap': array([79]),
 u'under': array([17, 33, 40, 54, 58, 70, 76,

---

In [10]:
def search(query):
    """Return the cleaned texts that contain every indexed word of ``query``.

    The query is lower-cased (the keys of ``ii`` are lower-cased words) and
    split on whitespace. The ids of all texts in ``texts_filt`` form the
    initial candidate set, which is intersected with the id array of each
    query word found in ``ii``. Words that are not in the index are ignored,
    so an empty query returns every text.

    Parameters
    ----------
    query : str
        Whitespace-separated words, combined with AND semantics.

    Returns
    -------
    list
        Entries of ``texts_filt`` for the matching ids, in ascending id order.
    """
    # Every text is a candidate until a query word rules it out.
    hits = np.arange(len(texts_filt))
    for word in query.lower().split():
        postings = ii.get(word)
        if postings is None:
            # Word never seen while indexing: leave the candidates untouched.
            continue
        hits = np.intersect1d(hits, postings)
    return [texts_filt[i] for i in hits]

In [11]:
# Example query with the three words 'of', 'and' and 'or'.
search('of and or')

[u'well folks my mac plus finally gave up the ghost this weekend after starting life as a k way back in sooo im in the market for a new machine a bit sooner than i intended to be im looking into picking up a powerbook or maybe and have a bunch of questions that hopefully somebody can answer does anybody know any dirt on when the next round of powerbook introductions are expected id heard the c was supposed to make an appearence this summer but havent heard anymore on it and since i dont have access to macleak i was wondering if anybody out there had more info has anybody heard rumors about price drops to the powerbook line like the ones the duos just went through recently whats the impression of the display on the i could probably swing a if i got the Mb disk rather than the but i dont really have a feel for how much better the display is yea it looks great in the store but is that all wow or is it really that good could i solicit some opinions of people who use the and daytoday on if 