In [1]:
import string
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
news = fetch_20newsgroups(
    categories=['sci.space', 'talk.politics.guns', 'comp.graphics', 'comp.sys.ibm.pc.hardware'],
    subset='all',
    shuffle=True)

In [3]:
print 'documents :', len(news.data)
print 'categories:', len(news.target_names)
print news.target_names

documents : 3852
categories: 4
['comp.graphics', 'comp.sys.ibm.pc.hardware', 'sci.space', 'talk.politics.guns']


In [4]:
stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems

In [5]:
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
news_tfidf = vectorizer.fit_transform(news.data)

In [6]:
def query(subj, limit=5, head_only=True):
    subj_tfidf = vectorizer.transform([subj])
    similarities = []
    for i in xrange(news_tfidf.shape[0]):
        similarities.append((news.data[i], cosine_similarity(news_tfidf[i], subj_tfidf)))
    similarities = sorted(similarities, key=lambda x:x[1], reverse=True)[:limit]
    
    for x in similarities:
        if head_only:
            print x[1][0][0], x[0].split('\n')[1].replace('Subject', '')
        else:
            print '%s\n%s\n\n%s' % ('-' * 80, x[1][0][0], x[0])
        
    return similarities

In [7]:
query('space ship')
pass

0.267053115783 : Mothership for Flybys and cutting costs..
0.241652209146 : End of the Space Age
0.234101686336 : Space FAQ 13/15 - Interest Groups & Publications
0.21255184885 : Re: Over zealous shuttle critics
0.20899761061 : Re: Why we like DC-X (was Re: Shuttle 0-Defects & Bizarre? DC-X?)


In [8]:
query('buy video card', limit=10)
pass

0.390783875374 : Need Info on high quality video card
0.365647315047 : Which Video Card? (Please HELP)
0.352877034236 : Comments on an accelerated Video Card for ISA bus
0.319370283053 : Tseng Labs Video Card Problem
0.271432312829 : Re: Which high-performance VLB video card?
0.268664326459 : S3 video card at different address
0.254225285728 : L.B. vs VESA L.B. and ....
0.249478704132 : HELP: advice on what video system to buy
0.238845277788 : Winjet accelerator card
0.223342239226 : NTSC data to RGB ?  For Video Capture.


In [9]:
query('buy video card', limit=2, head_only=False)

--------------------------------------------------------------------------------
0.390783875374

From: gtj@goanna.cs.rmit.oz.au (Glenn T Jayaputera)
Subject: Need Info on high quality video card
Organization: RMIT Department of Computer Science
Lines: 10

Hi...I need some info on video card.  I am looking a video card that can
deliver a high quality picture.  I need the card to display images (well
for advertising company btw), so it must be rich with colors and the speed
must be fast too.

I am just wondering if somebody can advise me what to buy for such
application, and possible the address of the vendor.

thanks in advance
Glenn Jayaputera

--------------------------------------------------------------------------------
0.365647315047

From: tp892275@vine.canberra.edu.au (C. Mierzanowski)
Subject: Which Video Card? (Please HELP)
Organization: Info Sci & Eng, University of Canberra, AUSTRALIA
Lines: 13


I've got a 386 20Hz computer which is under warranty and my Trident
8900C video

[(u'From: gtj@goanna.cs.rmit.oz.au (Glenn T Jayaputera)\nSubject: Need Info on high quality video card\nOrganization: RMIT Department of Computer Science\nLines: 10\n\nHi...I need some info on video card.  I am looking a video card that can\ndeliver a high quality picture.  I need the card to display images (well\nfor advertising company btw), so it must be rich with colors and the speed\nmust be fast too.\n\nI am just wondering if somebody can advise me what to buy for such\napplication, and possible the address of the vendor.\n\nthanks in advance\nGlenn Jayaputera\n',
  array([[ 0.39078388]])),
 (u"From: tp892275@vine.canberra.edu.au (C. Mierzanowski)\nSubject: Which Video Card? (Please HELP)\nOrganization: Info Sci & Eng, University of Canberra, AUSTRALIA\nLines: 13\n\n\nI've got a 386 20Hz computer which is under warranty and my Trident\n8900C video card is starting to play-up (surprise, surprise). Therefore\nI'm going to try to exchange it for a better card.\n\nThe BIG Question is