In [6]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
from collections import defaultdict
import string
from nltk.stem.porter import PorterStemmer

In [7]:
newsgroups = fetch_20newsgroups(categories=['comp.graphics','comp.os.ms-windows.misc','rec.autos','rec.sport.hockey'],
                                subset='all', shuffle=True, random_state=1)

In [14]:
print "%d documents" % len(newsgroups.data)
print "%d categories" % len(newsgroups.target_names)

3947 documents
4 categories


In [9]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems

In [10]:
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
tfidf_data = vectorizer.fit_transform(newsgroups.data)

In [28]:
query = 'hockey ball news'
tfidf_query = vectorizer.transform([query])

In [29]:
feature_names = vectorizer.get_feature_names()

for word in tfidf_query.nonzero()[1]:
    print feature_names[word], ' - ', tfidf_query[0, word]


news  -  0.5287165361
hockey  -  0.433434690162
ball  -  0.729789828525


#Print top_count texts simular to query sorted by cosine simularity

In [30]:
def query_results(query_string, top_count):
    tfidf_query = vectorizer.transform([query_string])
    cosine_similarities = defaultdict(float) #словарь всех дистанций
    count = 0
    for doc in tfidf_data: #for each document in corpus search cosin distance with query
        cosine_similarity = doc*(tfidf_query[0].transpose()) 
        if not cosine_similarity:
            cosine_similarity = 0.0
        else:
            # matrix -> number
            cosine_similarity = cosine_similarity[0,0]
        cosine_similarities[newsgroups.data[count]] = cosine_similarity
        count += 1
    for key, value in sorted(cosine_similarities.items(), reverse=True, key=lambda x:x[1])[:top_count]:
        print'Similarity value = ', value, '', key
        print '----------------------------------------------------------------------'

In [31]:
query_results("photoshot adobe", 2)

Similarity value =  0.463363771111  From: zstern@adobe.com (Zalman Stern)
Subject: Re: Adobe Photo Shop type software for Unix/X/Motif platforms?
Organization: Adobe Systems Incorporated
Lines: 24

Charles Boesel writes
> 
> In article <C5w8xB.Iv6@world.std.com>  
(sci.image.processing,comp.graphics), wdm@world.std.com (Wayne Michael)  
writes:
> >    I have been searching for a quality image enhancement and
> >    manipulation package for Unix/X/Motif platforms that is comparable
> >    to Adobe Photo Shop for the Mac. [stuff deleted]
> 
> I understand that Adobe is working on making Photoshop available for
> the SGI Indigo, but that is just "rumor" and I wouldn't bet on it
> until I see it. But they >are< going to release Illustrator for the SGI
> "real soon now."
> 

Illustrator for SGI is a shipping product. Adobe and SGI have announced that  
Photoshop is being ported to SGI machines. A simillar announcement has been  
made by Adobe and Sun for Sun platforms. No dates have been an

In [39]:
query_results("car drive", 2)

('Similarity value = ', 0.40246044928991215, '', u"From: apland@mala.bc.ca (Ron Apland)\nSubject: Re: DOS6 & Novell Netware\nOrganization: Malaspina College\nLines: 21\n\nIn article <1rre2d$26d@usenet.INS.CWRU.Edu>, cu826@cleveland.Freenet.Edu (Mahbub Anam) writes:\n> \n> I friend of mine installed dos6 at work and is hooked up to a Novell\n> network, running Netware 386 v.3.11.  The problem he's having is that the\n> doublespace program is using drive 'h' as the swap drive, which is\n> confliting with the networks mapped drives.\n> \n> Is there switch to place the swap drive to someother drive?\n> Please e-mail....thanks a bunch!!\n> -- \n> Mahbub\n> cu826@cleveland.freenet.edu\n\nDetermine the last dblspace drive required and set that as lastdrive in your\nconfig.sys  If you are using W4WW you might want to go a couple of drives higher\nto give you some shared drive space.  Reboot and Novell will set its first\ndrive one above the lastdrive in config.sys.  You might have to remap som