In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = fetch_20newsgroups(categories=['comp.sys.mac.hardware', 'rec.sport.hockey', 'sci.space', 'rec.motorcycles'])

In [3]:
nltk.download('punkt')

stemmer = nltk.stem.porter.PorterStemmer()

def textTokenizer(text):
    return [stemmer.stem(i) for i in nltk.word_tokenize(text) if i not in string.punctuation]

# TF-IDF matrix for corpus 
tfidf = TfidfVectorizer(stop_words='english', tokenizer=textTokenizer)
corpus = tfidf.fit_transform(data.data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
query = 'macintosh and kepler program'
transformedQuery = tfidf.transform([query])
features = tfidf.get_feature_names()

for word in transformedQuery.nonzero()[1]:
    print(str(features[word]) + ': {}'.format(transformedQuery[0, word]))

program: 0.371651706838
macintosh: 0.468942808333
kepler: 0.801228838296


In [32]:
corpus_arr = corpus.toarray()

def cosine(query, count):
    print('The query string is "{}"'.format(query))
    cosine_similarity_results = {}
    tfidf_query = tfidf.transform([query]).toarray()
    
    for i in xrange(corpus.shape[0]):
        cosine_dist = cosine_similarity(corpus_arr[i].reshape(1, -1), tfidf_query)
        text_thematic = " ".join([x if 'Subject' in x else '' for x in data.data[i].split("\n")[:5]]) 
        cosine_similarity_results[text_thematic]=cosine_dist
        
    for key, value in sorted(cosine_similarity_results.items(), reverse=True, key=lambda x:x[1])[:count]:
        print('Cosine: {} - document "{}"'.format(value, key))

In [33]:
cosine('powerfull engine' ,10)

The query string is "powerfull engine"
Cosine: [[ 0.23247986]] - document " Subject: Re: "Jump Starting" a Mac II   "
Cosine: [[ 0.22288427]] - document " Subject: Re: Computer Engr vs.  Computer Science   "
Cosine: [[ 0.21305584]] - document " Subject: Re: Computer Engr vs. Computer Science   "
Cosine: [[ 0.17439876]] - document " Subject: Re: DC-X: Vehicle Nears Flight Test   "
Cosine: [[ 0.17035721]] - document " Subject: DoD Books   "
Cosine: [[ 0.16999556]] - document " Subject: Jemison on Star Trek   "
Cosine: [[ 0.14366592]] - document " Subject: Spagthorpe Viking   "
Cosine: [[ 0.14355586]] - document " Subject: How do they ignite the SSME?   "
Cosine: [[ 0.13725853]] - document " Subject: DC-Y trajectory simulation   "
Cosine: [[ 0.13241675]] - document " Subject: *** For sale: 1988 Husqvarna 510TE ***   "


In [34]:
cosine('high speed' ,10)

The query string is "high speed"
Cosine: [[ 0.22998404]] - document " Subject: Re: iisi clock upgrades   "
Cosine: [[ 0.19237619]] - document " Subject: Re: Speeding ticket from CHP   "
Cosine: [[ 0.17592551]] - document " Subject: Why the drive speeds differ??   "
Cosine: [[ 0.16376998]] - document " Subject: Re: Why the drive speeds differ??   "
Cosine: [[ 0.14412415]] - document " Subject: Re: Accelerating the MacPlus...;)   "
Cosine: [[ 0.13921538]] - document " Subject: Re: x86 ~= 680x0 ??  (How do they compare?)   "
Cosine: [[ 0.12772633]] - document " Subject: SI clock reports   "
Cosine: [[ 0.12215963]] - document " Subject: MACH 25 landing site bases?   "
Cosine: [[ 0.12183501]] - document " Subject: Buying a high speed v.everything modem   "
Cosine: [[ 0.1118219]] - document " Subject: Russian Email Contacts.   "
