In [1]:
__author__ = 'Ksenia Voronaya'

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
import string

In [2]:
# it is difficult to handle all documents & categories after, so download only several
corpus_data = fetch_20newsgroups(categories=['rec.motorcycles', 'rec.autos', 'rec.sport.hockey', 'soc.religion.christian'])

In [3]:
print("List of topics ({} categories):".format(len(list(corpus_data.target_names))))
print(list(corpus_data.target_names))
print("Size of corpus is {} documents".format(len(corpus_data.data)))

List of topics (4 categories):
['rec.autos', 'rec.motorcycles', 'rec.sport.hockey', 'soc.religion.christian']
Size of corpus is 2391 documents


In [4]:
# remove morphological affixes from words, leaving only the word stem
stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems

In [5]:
# download only the punkt corpus, use it if LookupError
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kvoronaya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# TF-IDF matrix for corpus 
corpus_tfidf = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
corpus_representation = corpus_tfidf.fit_transform(corpus_data.data)

feature = corpus_tfidf.get_feature_names()

In [7]:
# enter a query string
query = 'hockey in Russia, our teams'
query_tfidf = corpus_tfidf.transform([query])

print("TF-IDF for each word in query string:")
for word in query_tfidf.nonzero()[1]:
    print(str(feature[word]) + ': {}'.format(query_tfidf[0, word]))

TF-IDF for each word in query string:
team: 0.387983136288
russia: 0.815344321601
hockey: 0.42974727828


In [8]:
corpus_representation_arr = corpus_representation.toarray()

# no needed to have a full texts, will print only thematic for current text
def get_text_thematic(numb):
    thematic = corpus_data.data[numb].split("\n")[:5]
    return " ".join([x if 'Subject' in x else '' for x in thematic]) 

# to find similarity between query string and each text in corpus, 
# will print only the closest texts
def cosine_similarity_results(query_str, text_count):
    
    print('The query string is "{}"'.format(query_str))
    cosine_similarity_results = {}
    tfidf_query_str = corpus_tfidf.transform([query_str])
    tfidf_query_str = tfidf_query_str.toarray()
    
    for i in xrange(corpus_representation.shape[0]):
        cosine_dist = cosine_similarity(corpus_representation_arr[i], tfidf_query_str)
        text_thematic = get_text_thematic(i)
        cosine_similarity_results[text_thematic]=cosine_dist
        
    for key, value in sorted(cosine_similarity_results.items(), reverse=True, key=lambda x:x[1])[:text_count]:
        print('Cosine Similarity is {}, for document "{}"'.format(value, key))

In [9]:
cosine_similarity_results('world champions in hockey', 15)

The query string is "world champions in hockey"
Cosine Similarity is [[ 0.29167389]], for document "  Subject: Re: Bruins vs Canadiens:  "
Cosine Similarity is [[ 0.23939343]], for document "  Subject: Keenan signs, Plus WALSH????????  "
Cosine Similarity is [[ 0.1535178]], for document " Subject: Re: Leaf slump over   "
Cosine Similarity is [[ 0.14756857]], for document " Subject: Re: Hockey Hell   "
Cosine Similarity is [[ 0.13054062]], for document " Subject: Re: Integra GSR (really about other cars)   "
Cosine Similarity is [[ 0.12955367]], for document " Subject: NHL Team Items...   "
Cosine Similarity is [[ 0.12814987]], for document " Subject: Re: AHL Calder Cup Playoff preview   "
Cosine Similarity is [[ 0.12668325]], for document " Subject: AHL News   "
Cosine Similarity is [[ 0.12211145]], for document "Subject: Remarks by President Clinton to NCAA Division I Champion Hockey Team    "
Cosine Similarity is [[ 0.12204393]], for document " Subject: Re: Hockey and the Hispanic co

In [10]:
cosine_similarity_results('christian religion', 15)

The query string is "christian religion"
Cosine Similarity is [[ 0.51763645]], for document " Subject: Why religion and which religion?   "
Cosine Similarity is [[ 0.35333405]], for document " Subject: Legal definition of religion   "
Cosine Similarity is [[ 0.35006914]], for document " Subject: Re: Knowing God's Will   "
Cosine Similarity is [[ 0.29608097]], for document " Subject: Atheist's views on Christianity (was: Re: "Accepting Jeesus in your heart...")   "
Cosine Similarity is [[ 0.22362192]], for document " Subject: Re: Is "Christian" a dirty word?   "
Cosine Similarity is [[ 0.21061985]], for document " Subject: Re: tuff to be a Christian?   "
Cosine Similarity is [[ 0.20642596]], for document " Subject: "Accepting Jeesus in your heart..."   "
Cosine Similarity is [[ 0.19027318]], for document " Subject: Bible Unsuitable for New Christians   "
Cosine Similarity is [[ 0.18107703]], for document " Subject: Religious wars   "
Cosine Similarity is [[ 0.17950888]], for document " 

In [11]:
cosine_similarity_results('the expensive autos', 15)

The query string is "the expensive autos"
Cosine Similarity is [[ 0.21560458]], for document " Subject: Manual Shift Bigots   "
Cosine Similarity is [[ 0.19421833]], for document " Subject: Re: SHO and SC   "
Cosine Similarity is [[ 0.16555126]], for document " Subject: Re: Chrysler New Yorker LHS (was Re: Chryslers Compact LH Sedans?)   "
Cosine Similarity is [[ 0.16180251]], for document " Subject: Re: Best Radar Detector - VALENTINE-1?   "
Cosine Similarity is [[ 0.13648251]], for document " Subject: Heard of these South Bay shops?   "
Cosine Similarity is [[ 0.09857532]], for document "Subject: apology (was Re: Did US drive on the left?)    "
Cosine Similarity is [[ 0.09086229]], for document " Subject: Re: Manual Shift Bigots wanted   "
Cosine Similarity is [[ 0.08599015]], for document " Subject: Re: ARCTIC WHEELS AUTO SHOW   "
Cosine Similarity is [[ 0.08497326]], for document " Subject: Re: GEICO mechanical breakdown insurance   "
Cosine Similarity is [[ 0.08247272]], for docum

In [12]:
cosine_similarity_results('the fastest motorcycles', 10)

The query string is "the fastest motorcycles"
Cosine Similarity is [[ 0.16711329]], for document " Subject: Re: Its still cold, but...   "
Cosine Similarity is [[ 0.16645309]], for document " Subject: Misc./buying info. needed   "
Cosine Similarity is [[ 0.14652274]], for document " Subject: Misc./Buying Info. Needed   "
Cosine Similarity is [[ 0.10717457]], for document " Subject: Re: New to Motorcycles...   "
Cosine Similarity is [[ 0.10606821]], for document " Subject: Re: GGRRRrrr!! Cages double-parking motorcycles pisses me off!   "
Cosine Similarity is [[ 0.10300142]], for document " Subject: Re: edu breaths   "
Cosine Similarity is [[ 0.09577946]], for document " Subject: Re: First Bike??   "
Cosine Similarity is [[ 0.08896139]], for document " Subject: European M/C Insurance   "
Cosine Similarity is [[ 0.0778002]], for document " Subject: Re: GGRRRrrr!!  Cages double-parking motorcycles pisses me off!   "
Cosine Similarity is [[ 0.07241165]], for document " Subject: Re: CNN Cal