In [16]:
import pandas
import re
import gensim
import spacy
import numpy as np
import pyLDAvis.gensim
from nltk.corpus import stopwords
from pprint import pprint
from gensim.test.utils import datapath
from scipy.stats import entropy

In [17]:
def tokenizing(doc):
    """
    Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
    """
    return gensim.utils.simple_preprocess(str(doc), deacc=True, min_len=4, max_len=15)


def preprocessed_doc(doc):
    """
    Preprocessing a single document
    """   
    doc = re.sub(r'https?\S*\s?', '', doc)
    doc = re.sub(r'\s\S*\.edu\S*\s?', ' ', doc)
    doc = re.sub(r'\s\S*\.com\S*\s?', ' ', doc)
    doc = re.sub(r'www\.\S*\s?', '', doc)
    doc = re.sub(r'\S*@\S*\s?', '', doc)
    doc = re.sub(r"\'", "", doc)
    doc = re.sub(r'\s+', ' ', doc)  
    doc = tokenizing(doc)
    return doc
    
    
def preprocessed_data(data):
    """
    Preprocessing the entire data (list of documents)
    """
    return [preprocessed_doc(doc) for doc in data]


df = pandas.read_json('data/newsgroups.json')
data = df.content.values.tolist()
tokenized_data = preprocessed_data(data)


In [18]:
stopwords_ = ['that', 'from', 'this', 'have', 'with', 'subject', 'they', 'lines', 'organization',
              'what', 'will', 'there', 'would', 'about', 'writes', 'your', 'article', 'some',
              'which', 'were', 'more', 'people', 'like', 'dont', 'when', 'just', 'university',
              'posting', 'their', 'other', 'know', 'only', 'host', 'them', 'nntp', 'than', 'been',
              'think', 'also', 'does', 'time', 'then', 'good', 'these', 'well', 'should', 'could',
              'because', 'even', 'very', 'into', 'first', 'many', 'those', 'make', 'much',
              'most', 'system', 'such', 'distribution', 'right', 'where', 'world', 'want', 'here',
              'reply', 'used', 'being', 'said', 'over', 'anyone', 'after', 'same', 'need', 'work',
              'something', 'problem', 'please', 'really', 'computer', 'since', 'back', 'believe',
              'still', 'going', 'years', 'file', 'information', 'year', 'windows', 'help', 'mail',
              'using', 'state', 'find', 'take', 'question', 'last', 'point', 'thanks', 'space',
              'before', 'must', 'never', 'things', 'while', 'better', 'government', 'cant', 'might',
              'both', 'number', 'read', 'sure', 'another', 'case', 'without', 'program', 'down',
              'through', 'made', 'data', 'drive', 'software', 'long', 'available', 'part', 'under',
              'david', 'thing', 'doesnt', 'someone', 'look', 'power', 'thats', 'between', 'little',
              'version', 'come', 'didnt', 'however', 'each', 'public', 'around', 'anything', 'fact',
              'science', 'best', 'give', 'true', 'every', 'probably', 'again', 'name', 'john',
              'course', 'least', 'line', 'against', 'tell', 'seems', 'group', 'different',
              'systems', 'great', 'enough', 'high', 'research', 'news', 'list', 'hard', 'real',
              'says', 'second', 'jesus', 'possible', 'either', 'life', 'actually', 'game',
              'though', 'support', 'card', 'technology', 'post', 'center', 'called', 'free',
              'rather', 'nothing', 'access', 'next', 'team', 'chip', 'window', 'mean',
              'email', 'internet', 'problems', 'youre', '-PRON-', '_']]

with open('scripts/stopwords.txt', 'r') as fp:
    for word in fp:
        stopwords_.append(word.strip())

stop_words = stopwords.words('english')
stop_words.extend(stopwords_)


SyntaxError: invalid syntax (<ipython-input-18-5783f8e18e55>, line 23)

In [None]:
def remove_stopwords(doc):
    """
    Removes stopwords from a document
    """
    return [word for word in doc
            if word not in stop_words]


def make_bigrams(doc):
    """
    Make bigrams of a document
    """
    return bigram_mod[doc]


def lemmatization(doc):
    """
    Lemmatizes a document
    """
    doc = nlp(" ".join(doc))
    return [token.lemma_ for token in doc]


def processed_doc(doc):
    """
    Processing a document
    """
    doc = remove_stopwords(doc)
    doc = make_bigrams(doc)
    doc = lemmatization(doc)
    doc = remove_stopwords(doc)
    return doc


def processed_data(data):
    """
    Return lemmatized data
    """
    return [processed_doc(doc) for doc in data]


bigram = gensim.models.Phrases(tokenized_data, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
nlp = spacy.load('en', disable=['parser', 'ner'])

lemmatized_data = processed_data(tokenized_data)


In [None]:
num_of_topics = 20
lda_model = gensim.models.LdaModel.load(datapath('20newsgroups_bow'))
id2word = lda_model.id2word
corpus = [id2word.doc2bow(text) for text in lemmatized_data]

# tfidf = gensim.models.TfidfModel(corpus)
# corpus = tfidf[corpus]


In [None]:
def print_topics():
    """
    Prints top 30 words from each topic
    """
    pprint(lda_model.show_topics(num_of_topics, 30))


def get_doc_topic_distribution(index):
    """
    Get the topic distribution for a given document
    """
    doc = corpus[index]
    return lda_model.get_document_topics(doc)


def get_topic_term():
    """
    Get topic-term matrix
    """
    topic_term_matrix = lda_model.get_topics()
    print(len(topic_term_matrix), 'x', len(topic_term_matrix[0]))
    return topic_term_matrix


def get_term_topics(word_id):
    """
    Get the most relevant topics to the given word
    """
    print('Word is: ', id2word[word_id])
    relevant_topics = lda_model.get_term_topics(word_id)
    relevant_topic_ids = [topic_id[0] for topic_id in relevant_topics]
    for topic_id in relevant_topic_ids:
        print(show_topic(topic_id))
    return relevant_topics


def unseen_doc_topic_distribution(new_doc: list):
    """
    Get topic distribution of a unseen document
    """
    new_doc = [id2word.doc2bow(new_doc)]
    topics = lda_model[new_doc]
    return sorted(topics[0][0], key=lambda x: x[1], reverse=True)


def doc_topic_matrix():
    """
    Get document topic matrix
    """
    matrix = np.zeros(shape=(len(corpus), num_of_topics))
    for i in range(len(corpus)):
        topic_dist = get_doc_topic_distribution(i)
        for topic, prob in topic_dist:
            matrix[i][topic] = prob

    return matrix


def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M where M is the number of documents in the corpus
    """
    p = query[None, :].T
    q = matrix.T
    m = 0.5 * (p + q)
    # entropy calculated KL divergence
    return np.sqrt(0.5 * (entropy(p, m) + entropy(q, m)))


def get_document(index):
    """
    Index is list or num
    """
    if isinstance(index, list):
        docs = []
        for i in index:
            docs.append(data[i])
        return docs
    return data[index]


def get_similar_docs(new_doc: list):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    topics = unseen_doc_topic_distribution(new_doc)
    topic_dist = np.zeros(shape=(num_of_topics))
    for topic, prob in topics:
        topic_dist[topic] = prob
    sims = jensen_shannon(topic_dist, doc_topic_matrix())
    docs = list(sims.argsort()[:10])
    return get_document(docs)


def retrieval(doc):
    doc = preprocessed_doc(doc)
    doc = processed_doc(doc)
    return get_similar_docs(doc)


In [25]:
doc1 = '''
Don and Ron it was the an "off-night" for the Leafs and the Devils 
were outplaying Toronto. Well, I BEG to differ....

IMHO, Clark deserved to be a first star as much as Gilmour did. His
fast breaks towards the net and the good opportunites that he
created reminded me of the Clark of old. (But not to take any of the
credit away from Gilmour).

I think the Leafs are playing GREAT hockey. WHY? 
Well first look at their injury list which includes, Cullen, Ellet,
Zezel, Macoun. Of course my question is this....how will the Leafs
fare when they are once again "healthy" if they are playing this well
so far??

Second, just look at their standings, still second in defence,
moved from 11th overall to 6th over in the last month, haven't lost
at home in last 12 games, 8 game undefeated streak..etc.
(BTW, am I wrong or was this Potvin's first shut-out? I can't 
remember him having any as of yet.)

Well, as of April 3 we see that the race for first in the Norris
has truly begun and it will be a VERY CLOSE race between Chicago and
Toronto. And the best game of the season will probably be their last
against each other. (is anyone lucky enough to have tickets to
see this one?)

'''

In [26]:
similar_doc1 = retrieval(doc1)
print(similar_doc1[0])

From: gballent@hudson.UVic.CA (Greg  Ballentine)
Subject: Re: plus minus stat
Nntp-Posting-Host: hudson.uvic.ca
Reply-To: gballent@hudson.UVic.CA
Organization: University of Victoria, Victoria, BC, Canada
Lines: 38


In article 20009@ramsey.cs.laurentian.ca, maynard@ramsey.cs.laurentian.ca (Roger Maynard) writes:
>In <1993Apr15.160450.27799@sol.UVic.CA> gballent@hudson.UVic.CA (Greg  Ballentine) writes:

>>Gainey is the best defensive forward ever.  I stand by that assessment.
>>He was a very good player who belongs in the hall of fame.  Did you
>>ever watch him play? He never made a technical error.
>
>I watched him over his entire career.  I have NEVER seen a player, and that
>includes Russell Courtnall and Davie Keon, screw up as many breakaways as
>Bob Gainey.  And I will never forget the time Denis Potvin caught Gainey
>with his head down.  You have been sold a bill of goods on Bob Gainey.
>
>Gainey was a plugger.  And when the press runs out of things to say about 
>the stars on 

In [27]:
print(similar_doc1[1])

From: ayim@leibniz.uwaterloo.ca (Alfred Yim)
Subject: And... THEY'RE OFF!!!!!
Keywords: Leafs Chicago
Organization: University of Waterloo
Lines: 39

Well, I gotta tell ya,

last night's Leafs game vs the Devils was a nail-bitter LET ME TELL YOU!
It was a well played game by BOTH teams (I thought) but according to the
Don and Ron it was the an "off-night" for the Leafs and the Devils 
were outplaying Toronto. Well, I BEG to differ....

IMHO, Clark deserved to be a first star as much as Gilmour did. His
fast breaks towards the net and the good opportunites that he
created reminded me of the Clark of old. (But not to take any of the
credit away from Gilmour).

I think the Leafs are playing GREAT hockey. WHY? 
Well first look at their injury list which includes, Cullen, Ellet,
Zezel, Macoun. Of course my question is this....how will the Leafs
fare when they are once again "healthy" if they are playing this well
so far??

Second, just look at their standings, still second in defence,
moved