In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import re

In [2]:
import nltk
from nltk.stem.snowball import SnowballStemmer
# One-time setup, left disabled: presumably needed only when the NLTK data used by
# the tokenizers in tokenize_and_stem is not installed yet — TODO confirm which
# resources are required.
#nltk.download()
# Shared English Snowball stemmer; tokenize_and_stem applies it to every kept token.
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their stems.

    The text is first split into sentences and each sentence into words, so
    that punctuation ends up as its own token. Tokens that contain no ASCII
    letter (numbers, bare punctuation) are dropped; every remaining token is
    stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        Stems of the kept tokens, in document order.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Skip tokens without any letter, e.g. "42" or "...".
            if re.search('[a-zA-Z]', word) is None:
                continue
            stems.append(stemmer.stem(word))
    return stems

In [3]:
# Load the four "sci.*" newsgroups; the fixed random_state keeps the shuffled
# order the same from run to run.
newsgroups = fetch_20newsgroups(
    categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
    shuffle=True,
    random_state=1
)
pprint(list(newsgroups.target_names))

['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']


In [4]:
# Number of posts loaded. A single pre-formatted argument prints the same text
# under the Python 2 print statement and the Python 3 print function.
print('Elements number: %d' % len(newsgroups.data))

Elements number: 2373


In [5]:
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True,
                             max_df=0.8, max_features=200000, tokenizer=tokenize_and_stem)
%time data = vectorizer.fit_transform(newsgroups.data)

CPU times: user 18.1 s, sys: 116 ms, total: 18.2 s
Wall time: 18.3 s


In [6]:
def get_subject(number):
    """Return the subject line of the post at index ``number``.

    Only the first five lines of the post are inspected. Lines containing
    'Subject' are kept and every other line is replaced by an empty string
    before the pieces are joined with single spaces, so the result usually
    carries leading and/or trailing spaces.

    Parameters
    ----------
    number : int
        Index into ``newsgroups.data``.

    Returns
    -------
    str
        The space-joined header fragments.
    """
    header_lines = newsgroups.data[number].split("\n")[:5]
    pieces = []
    for line in header_lines:
        if 'Subject' in line:
            pieces.append(line)
        else:
            pieces.append('')
    return " ".join(pieces)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# NOTE(review): this silences every warning for the rest of the session, not only
# the one this cell was presumably written for — consider a narrower filter.
warnings.filterwarnings('ignore')

# Dense copy of the TF-IDF matrix, indexed row by row inside do_query.
# NOTE(review): its size is (number of documents) x (vocabulary size), which can
# be large — confirm it fits in memory for bigger corpora.
data_array  = data.toarray()

def do_query(query, results_number=10):
    """Print the posts most similar to ``query``, best match first.

    The query is projected into the fitted TF-IDF space with ``vectorizer``
    and compared against every row of ``data`` by cosine similarity.

    Parameters
    ----------
    query : str
        Free-text search query.
    results_number : int, optional
        Maximum number of results to print (default 10).

    Returns
    -------
    None
        One line per match is written to stdout.
    """
    tfidf_query = vectorizer.transform((query,))

    # One vectorised call replaces the per-document loop. It avoids handing
    # 1-D rows to cosine_similarity (deprecated, then rejected, by
    # scikit-learn) and works on the sparse matrix directly, so no dense copy
    # of the corpus is needed. The result has one row per post and a single
    # column for the query.
    similarities = cosine_similarity(data, tfidf_query)

    results = [(float(row[0]), get_subject(news_count))
               for news_count, row in enumerate(similarities)]

    # Tuples sort by score first and by subject text on ties, both descending,
    # which is the ordering the original tuple sort produced.
    for score, subject in sorted(results, reverse=True)[:results_number]:
        # Joining with single spaces reproduces the separators of the Python 2
        # print statement while also running under Python 3.
        print(' '.join(['(similarity: ', '%5f' % score, ')        ', subject.lstrip()]))
    

In [12]:
# Single-term query; prints the 10 best matches (default results_number).
do_query("drugs")

(similarity:  0.337688 )         Subject: Re: Discussions on alt.psychoactives   
(similarity:  0.293077 )         Subject: Re: Once tapped, your code is no good any more.    
(similarity:  0.282552 )         Subject: Should patients read package inserts (PDR)?   
(similarity:  0.252110 )         Subject: Re: Once tapped, your code is no good any more.   
(similarity:  0.223682 )         Subject: Re: Would "clipper" make a good cover for other encryption method?   
(similarity:  0.193691 )         Subject: **Sorry folks** (read this)   
(similarity:  0.168738 )         Subject: Re: Clipper Chip. LONG follow up.   
(similarity:  0.166855 )         Subject: Re: tuberculosis   
(similarity:  0.154703 )         Subject: Re: fibromyalgia   
(similarity:  0.153091 )         Subject: Re: Altitude adjustment   


In [14]:
# Query that matches the name of one of the loaded newsgroups (sci.space).
do_query("space")

(similarity:  0.509334 )         Subject: End of the Space Age   
(similarity:  0.496821 )         Subject: Space FAQ 13/15 - Interest Groups & Publications   
(similarity:  0.365454 )         Subject: Space FAQ 08/15 - Addresses   
(similarity:  0.353916 )         Subject: Alaska Pipeline and Space Station!   
(similarity:  0.305852 )         Subject: A flawed propulsion system: Space Shuttle   
(similarity:  0.305026 )         Subject: Space FAQ 02/15 - Network Resources    
(similarity:  0.301404 )         Subject: End of the Space Age?   
(similarity:  0.299803 )         Subject: Space Clipper Launch Article   
(similarity:  0.296308 )         Subject: Space Clippers launched   
(similarity:  0.282328 )         Subject: Re: space food sticks   


In [15]:
# A term that is not specific to any one of the four loaded groups.
do_query("computer")

(similarity:  0.165520 )         Subject: Privacy & Anonymity on the Internet FAQ (2 of 3)   
(similarity:  0.160194 )         Subject: Screw the people, crypto is for hard-core hackers & spooks only   
(similarity:  0.139711 )         Subject: Re: Licensing of public key implementations   
(similarity:  0.139664 )         Subject: HELP: 20ma current loop to RS232 converter needed.   
(similarity:  0.131292 )         Subject: Re: The [secret] source of that announcement   
(similarity:  0.128254 )         Subject: Re: Don't fight Clipper Chip, subvert or replace it !   
(similarity:  0.121868 )         Subject: Re: Sunrise/ sunset times   
(similarity:  0.121007 )         Subject: Cryptography FAQ 10/10 - References   
(similarity:  0.120482 )         Subject: Re: text of White House announcement and Q&As on clipper chip encryption   
(similarity:  0.118645 )         Subject: Space FAQ 04/15 - Calculations   


In [20]:
# Two-word query: the whole string is vectorised as one document by do_query.
do_query("space food")

(similarity:  0.496701 )         Subject: Re: Is MSG sensitivity superstition?   
(similarity:  0.416468 )         Subject: Re: space food sticks   
(similarity:  0.383777 )         Subject: Re: Is MSG sensitivity superstition?   
(similarity:  0.359315 )         Subject: Re: Is MSG sensitivity superstition?   
(similarity:  0.357552 )         Subject: Re: space food sticks   
(similarity:  0.346709 )         Subject: Re: Barbecued foods and health risk   
(similarity:  0.319753 )         Subject: Re: space food sticks   
(similarity:  0.315675 )         Subject: Re: Is MSG sensitivity superstition?   
(similarity:  0.312388 )         Subject: Re: Is MSG sensitivity superstition?   
(similarity:  0.295430 )         Subject: End of the Space Age   
