In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
newsgroups = fetch_20newsgroups(categories=['comp.sys.ibm.pc.hardware','comp.os.ms-windows.misc','rec.autos',
                                                  'rec.sport.hockey', 'sci.space'], subset='all', shuffle=True, random_state=1)
print 'Names of topics: ', list(newsgroups.target_names)

Names of topics:  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'rec.autos', 'rec.sport.hockey', 'sci.space']


### Tokenizer

In [3]:
stemmer = SnowballStemmer('english').stem
def stem_tokenize(text):
    return [stemmer(i) for i in word_tokenize(text)]

## TF-IDF

In [4]:
tfidf = TfidfVectorizer(stop_words='english',analyzer='word', tokenizer=stem_tokenize)
newsgroups_tfidf = tfidf.fit_transform(newsgroups.data)

In [5]:
print 'topics: ', len(newsgroups.target_names)
print 'documents: ', newsgroups_tfidf.shape[0]

topics:  5
documents:  4943


In [11]:
def most_similar(query, first_n_documents):
    # compute TF-IDF for query
    query_tfidf = tfidf.transform([query])
    
    # print TF-IDF value for each word in query
    words = query.split(' ', 1)
    for k in range(0, len(words)):
        print words[k], ': ', query_tfidf.data[k]
    print '\n'
    
    # compute cosine similarity between query and each document in corpus
    cos_sim = {}
    count = 0
    for document in newsgroups_tfidf:
        cos_sim[newsgroups.data[count]] = cosine_similarity(document, query_tfidf)[0][0]
        count+=1
    
    # sort result by max cosine similarity
    top_document = sorted(cos_sim.values(), reverse=True)
    
    for i in range(0, first_n_documents):
        text = cos_sim.keys()[cos_sim.values().index(top_document[i])]
        str= text.split('\n')
        print 'Similarity: ', top_document[i], '\n'
        
        # print the first 5 string in each document
        for j in range(0,5):
            print str[j].encode("ascii", "ignore")
        print '-----------------------------------\n'

In [12]:
most_similar("Russia champion", 2)

Russia :  0.734449512124
champion :  0.678663329008


Similarity:  0.193366914083 

From: pgf@srl03.cacs.usl.edu (Phil G. Fraering)
Subject: Re: U.S. Government and Science and Technolgy Investment
Organization: Univ. of Southwestern Louisiana
Lines: 24

-----------------------------------

Similarity:  0.162946447611 

Organization: University of Maine System
From: The Always Fanatical: Patrick Ellis <IO11330@MAINE.MAINE.EDU>
Subject: Re: 1993 NHL Draft
 <1993Apr20.184627.4585@newshub.ariel.yorku.ca>
 <1993Apr21.064605.24531@CSD-NewsHost.Stanford.EDU>
-----------------------------------



In [13]:
most_similar("Spaceship", 3)

Spaceship :  1.0


Similarity:  0.303144022377 

From: higgins@fnalf.fnal.gov (Bill Higgins-- Beam Jockey)
Subject: The Dream Machines: book on vaporware spacecraft
Organization: Fermi National Accelerator Laboratory
Lines: 133
NNTP-Posting-Host: fnalf.fnal.gov
-----------------------------------

Similarity:  0.216501621336 

From: kudla@acm.rpi.edu (Robert Kudla)
Subject: Re: Diamond SS24X, Win 3.1, Mouse cursor
Nntp-Posting-Host: hermes.acm.rpi.edu
Lines: 16

-----------------------------------

Similarity:  0.14204429275 

From: kudla@acm.rpi.edu (Robert Kudla)
Keywords: Mislead, Misinform, Misdirect, COPY
Article-I.D.: rpi.y3g53tr
Lines: 40
-----------------------------------



In [15]:
most_similar("machine learning", 5)

machine :  0.6180784167
learning :  0.786116448632


Similarity:  0.235040845418 

Organization: University of Central Florida - Computer Services
From: Mark Woodruff <CDA90038@UCF1VM.BITNET>
Subject: Many people on one machine
Lines: 9

-----------------------------------

Similarity:  0.197225166317 

From: lukka@klaava.Helsinki.FI (Tuomas J Lukka)
Subject: Re: Challenge to Microsoft supporters.
Organization: University of Helsinki
Lines: 13

-----------------------------------

Similarity:  0.170669342726 

From: gryphon@openage.openage.com (The Golden Gryphon)
Subject: Re: Intel, the Pentium and Linux
Article-I.D.: openage.1993Apr04.175934.8526
Organization: Open Age, Inc.
Lines: 40
-----------------------------------

Similarity:  0.147139351263 

From: kentiler@matt.ksu.ksu.edu (Kent P. Iler)
Subject: Procomm Plus for windows problems....
Organization: Kansas State University
Lines: 10
NNTP-Posting-Host: matt.ksu.ksu.edu
-----------------------------------

Similarity:  0.1455981

In [23]:
most_similar("NHL stars", 5)

NHL :  0.751851563218
stars :  0.659332410008


Similarity:  0.349742578653 

From: Karim Edvard Ahmed <ka0k+@andrew.cmu.edu>
Subject: Re: Truly a sad day for hockey
Organization: Senior, Economics, Carnegie Mellon, Pittsburgh, PA
Lines: 17
NNTP-Posting-Host: po5.andrew.cmu.edu
-----------------------------------

Similarity:  0.304003654709 

From: alvstad@mari.acc-admin.stolaf.edu (Mad Dog)
Subject: Truly a sad day for hockey
Organization: St. Olaf College; Northfield, MN
Lines: 19

-----------------------------------

Similarity:  0.284191849585 

From: dwarf@bcarh601.bnr.ca (W. Jim Jordan)
Subject: Re: Truly a sad day for hockey
Nntp-Posting-Host: bcarh601
Organization: Bell-Northern Research Ltd., Ottawa, Ontario, Canada
Lines: 19
-----------------------------------

Similarity:  0.226217756345 

From: MLINDROOS@FINABO.ABO.FI (Marcus Lindroos INF)
Subject: Re: Too Many Europeans in NHL
In-Reply-To: rauser@fraser.sfu.ca's message of Tue, 6 Apr 1993 02:16:48 GMT
Organization: Abo Ak