In [1]:
import pandas
import re
import gensim
import spacy
import numpy as np
import pyLDAvis.gensim
from nltk.corpus import stopwords
from pprint import pprint
from gensim.test.utils import datapath
from scipy.stats import entropy

In [3]:
def tokenizing(doc):
    """
    Split one document into lowercase, accent-stripped tokens.

    The input is coerced to ``str`` first. Tokens shorter than 4 or longer
    than 15 characters are discarded by ``simple_preprocess``.
    """
    text = str(doc)
    tokens = gensim.utils.simple_preprocess(text, deacc=True, min_len=4, max_len=15)
    return tokens


def preprocessed_doc(doc):
    """
    Clean one raw document and return its token list.

    The substitutions run in the order listed: URLs, ``.edu`` / ``.com``
    host-like words, ``www.`` addresses, e-mail-like words and apostrophes
    are removed, then whitespace runs are collapsed to a single space.
    The cleaned text is finally passed to ``tokenizing``.
    """
    # (pattern, replacement) pairs; order matters because later patterns
    # operate on the output of earlier ones.
    substitutions = (
        (r'https?\S*\s?', ''),
        (r'\s\S*\.edu\S*\s?', ' '),
        (r'\s\S*\.com\S*\s?', ' '),
        (r'www\.\S*\s?', ''),
        (r'\S*@\S*\s?', ''),
        (r"\'", ""),
        (r'\s+', ' '),
    )
    cleaned = doc
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return tokenizing(cleaned)
    
    
def preprocessed_data(data):
    """
    Apply ``preprocessed_doc`` to every document in ``data`` and return
    the resulting token lists in the same order.
    """
    cleaned_docs = []
    for raw_doc in data:
        cleaned_docs.append(preprocessed_doc(raw_doc))
    return cleaned_docs


# Read the raw posts, keep the ``content`` column as a plain list of
# documents, then clean and tokenize each one.
df = pandas.read_json('data/newsgroups.json')
data = df['content'].values.tolist()
tokenized_data = preprocessed_data(data)


In [6]:
# Extra stopwords on top of NLTK's English list. This list is extended
# further down with the entries read from scripts/stopwords.txt and then
# merged into ``stop_words``.
stopwords_ = ['that', 'from', 'this', 'have', 'with', 'subject', 'they', 'lines', 'organization',
              'what', 'will', 'there', 'would', 'about', 'writes', 'your', 'article', 'some',
              'which', 'were', 'more', 'people', 'like', 'dont', 'when', 'just', 'university',
              'posting', 'their', 'other', 'know', 'only', 'host', 'them', 'nntp', 'than', 'been',
              'think', 'also', 'does', 'time', 'then', 'good', 'these', 'well', 'should', 'could',
              'because', 'even', 'very', 'into', 'first', 'many', 'those', 'make', 'much',
              'most', 'system', 'such', 'distribution', 'right', 'where', 'world', 'want', 'here',
              'reply', 'used', 'being', 'said', 'over', 'anyone', 'after', 'same', 'need', 'work',
              'something', 'problem', 'please', 'really', 'computer', 'since', 'back', 'believe',
              'still', 'going', 'years', 'file', 'information', 'year', 'windows', 'help', 'mail',
              'using', 'state', 'find', 'take', 'question', 'last', 'point', 'thanks', 'space',
              'before', 'must', 'never', 'things', 'while', 'better', 'government', 'cant', 'might',
              'both', 'number', 'read', 'sure', 'another', 'case', 'without', 'program', 'down',
              'through', 'made', 'data', 'drive', 'software', 'long', 'available', 'part', 'under',
              'david', 'thing', 'doesnt', 'someone', 'look', 'power', 'thats', 'between', 'little',
              'version', 'come', 'didnt', 'however', 'each', 'public', 'around', 'anything', 'fact',
              'science', 'best', 'give', 'true', 'every', 'probably', 'again', 'name', 'john',
              'course', 'least', 'line', 'against', 'tell', 'seems', 'group', 'different',
              'systems', 'great', 'enough', 'high', 'research', 'news', 'list', 'hard', 'real',
              'says', 'second', 'jesus', 'possible', 'either', 'life', 'actually', 'game',
              'though', 'support', 'card', 'technology', 'post', 'center', 'called', 'free',
              'rather', 'nothing', 'access', 'next', 'team', 'chip', 'window', 'mean',
              'email', 'internet', 'problems', 'youre']

# Append one stopword per line of the external file (surrounding
# whitespace, including the newline, is stripped).
with open('scripts/stopwords.txt', 'r') as stopword_file:
    stopwords_.extend(line.strip() for line in stopword_file)

# Final stopword list: NLTK's English stopwords followed by the custom ones.
stop_words = stopwords.words('english') + stopwords_


In [7]:
def remove_stopwords(doc):
    """
    Remove stopwords from a tokenized document.

    :param doc: iterable of string tokens.
    :return: list of the tokens of ``doc`` that are not in the module-level
        ``stop_words`` collection, in their original order.
    """
    # ``stop_words`` is a list, so ``word not in stop_words`` scans it for
    # every token. Build a set once per call for constant-time lookups;
    # the filtered result is identical.
    blocked = set(stop_words)
    return [word for word in doc if word not in blocked]


def make_bigrams(doc):
    """
    Run a tokenized document through the module-level ``bigram_mod``
    phrase model and return the result.
    """
    phrased = bigram_mod[doc]
    return phrased


def lemmatization(doc):
    """
    Return the lemma of every token spaCy finds in the document.

    The token list is joined with single spaces and re-parsed by the
    module-level ``nlp`` pipeline, so the output follows spaCy's own
    tokenization of that joined text.
    """
    joined_text = " ".join(doc)
    lemmas = []
    for token in nlp(joined_text):
        lemmas.append(token.lemma_)
    return lemmas


def processed_doc(doc):
    """
    Full processing of one tokenized document: drop stopwords, apply the
    bigram model, lemmatize, then drop stopwords again (lemmatization can
    produce new stopwords).
    """
    without_stops = remove_stopwords(doc)
    with_bigrams = make_bigrams(without_stops)
    lemmas = lemmatization(with_bigrams)
    return remove_stopwords(lemmas)


def processed_data(data):
    """
    Apply ``processed_doc`` to each tokenized document in ``data`` and
    return the processed documents as a list, order preserved.
    """
    return list(map(processed_doc, data))


# spaCy pipeline used by ``lemmatization``; the parser and NER components
# are disabled.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Phrase model trained on the tokenized corpus and wrapped in a Phraser;
# ``make_bigrams`` applies it through ``bigram_mod``.
bigram = gensim.models.Phrases(tokenized_data, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Needs ``nlp`` and ``bigram_mod`` to exist, so it must stay last.
lemmatized_data = processed_data(tokenized_data)


In [9]:
# Width of the document-topic vectors built further down.
# NOTE(review): presumably this must equal the loaded model's topic count -- confirm.
num_of_topics = 20
lda_model = gensim.models.LdaModel.load(datapath('20newsgroups_tfidf'))

# The dictionary is rebuilt from the processed corpus rather than loaded.
# NOTE(review): its token ids presumably have to match the ones the saved
# model was trained with -- verify.
id2word = gensim.corpora.Dictionary(lemmatized_data)

# Bag-of-words form of every document (the original wrote
# ``corpus = corpus = [...]``, a duplicated assignment).
bow_corpus = [id2word.doc2bow(text) for text in lemmatized_data]

# ``corpus`` is the TF-IDF weighted view of the bag-of-words corpus; the
# functions below index into it.
tfidf = gensim.models.TfidfModel(bow_corpus)
corpus = tfidf[bow_corpus]


In [10]:
def print_topics():
    """
    Pretty-print ``num_of_topics`` topics of the LDA model with 30 words each.
    """
    topics = lda_model.show_topics(num_topics=num_of_topics, num_words=30)
    pprint(topics)


def get_doc_topic_distribution(index):
    """
    Return the LDA topic distribution of the corpus document at position
    ``index`` of the module-level ``corpus``.
    """
    return lda_model.get_document_topics(corpus[index])


def get_topic_term():
    """
    Fetch the topic-term matrix from the LDA model, print its dimensions
    as ``<rows> x <columns>`` and return the matrix.
    """
    topic_term_matrix = lda_model.get_topics()
    n_rows = len(topic_term_matrix)
    n_cols = len(topic_term_matrix[0])
    print(n_rows, 'x', n_cols)
    return topic_term_matrix


def get_term_topics(word_id):
    """
    Get the topics most relevant to a given word and print each of them.

    :param word_id: id of the word in ``id2word``.
    :return: the value of ``lda_model.get_term_topics(word_id)``; each
        entry is indexed as ``entry[0]`` to obtain the topic id.
    """
    print('Word is: ', id2word[word_id])
    relevant_topics = lda_model.get_term_topics(word_id)
    for entry in relevant_topics:
        # The original called a bare ``show_topic`` that is not defined in
        # this notebook; the topic listing lives on the model object.
        print(lda_model.show_topic(entry[0]))
    return relevant_topics


def unseen_doc_topic_distribution(new_doc: list):
    """
    Get the topic distribution of an unseen, already processed document.

    :param new_doc: list of tokens (the output of ``processed_doc``).
    :return: list of ``(topic_id, probability)`` pairs sorted by
        decreasing probability.
    """
    bow = id2word.doc2bow(new_doc)
    # Every corpus document is TF-IDF weighted before topic inference
    # (``corpus = tfidf[...]``), so weight the query the same way; otherwise
    # query and corpus distributions come from differently scaled inputs.
    weighted = tfidf[bow]
    # The original used ``lda_model[[bow]][0][0]``, which only yields the
    # topic list when the model was created with ``per_word_topics=True``.
    # ``get_document_topics`` returns the (topic_id, probability) list directly.
    topics = lda_model.get_document_topics(weighted)
    return sorted(topics, key=lambda pair: pair[1], reverse=True)


def doc_topic_matrix():
    """
    Build the dense document-topic matrix for the whole corpus.

    Row ``i`` holds the topic probabilities reported for corpus document
    ``i``; topics that are not reported for a document stay at 0.
    """
    n_docs = len(corpus)
    matrix = np.zeros(shape=(n_docs, num_of_topics))
    for row in range(n_docs):
        for topic_id, weight in get_doc_topic_distribution(row):
            matrix[row][topic_id] = weight
    return matrix


def jensen_shannon(query, matrix):
    """
    Jensen-Shannon distance between a query distribution and every row of
    a matrix of distributions.

    :param query: 1-D array of length K (topic weights of one document).
    :param matrix: 2-D array of shape (M, K), one distribution per row.
    :return: 1-D array of length M; entry ``i`` is the Jensen-Shannon
        distance (square root of the divergence, natural log) between
        ``query`` and ``matrix[i]``.

    Inputs need not sum to 1: each distribution is normalised first. A row
    (or query) whose weights sum to 0 produces NaN for that entry.
    """
    p = np.asarray(query, dtype=float)[:, None]   # shape (K, 1)
    q = np.asarray(matrix, dtype=float).T         # shape (K, M)

    # Normalise BEFORE building the mixture. ``entropy`` normalises its
    # arguments independently, so a mixture built from un-normalised
    # inputs is not the midpoint of the two normalised distributions.
    p = p / p.sum(axis=0, keepdims=True)
    q = q / q.sum(axis=0, keepdims=True)
    m = 0.5 * (p + q)

    # entropy(a, b) with two arguments computes the KL divergence KL(a || b).
    divergence = 0.5 * (entropy(p, m) + entropy(q, m))
    # Rounding can leave a tiny negative value where the true divergence
    # is 0; clamp so the square root does not turn it into NaN.
    return np.sqrt(np.clip(divergence, 0.0, None))


def get_document(index):
    """
    Look up raw documents in the module-level ``data`` list.

    ``index`` may be a single position, in which case that document is
    returned, or a ``list`` of positions, in which case a list of the
    corresponding documents is returned.
    """
    if not isinstance(index, list):
        return data[index]
    return [data[i] for i in index]


def get_similar_docs(new_doc: list, k: int = 10):
    """
    Return the ``k`` corpus documents closest to ``new_doc``.

    Closeness is the Jensen-Shannon distance between the topic
    distribution of ``new_doc`` and that of every corpus document.

    :param new_doc: list of tokens (the output of ``processed_doc``).
    :param k: number of documents to return; defaults to 10, the value
        that was previously hard-coded.
    :return: list of raw documents ordered from smallest to largest distance.
    :raises ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    # Dense topic vector for the query; topics not reported stay at 0.
    topic_dist = np.zeros(shape=(num_of_topics))
    for topic, prob in unseen_doc_topic_distribution(new_doc):
        topic_dist[topic] = prob

    # The document-topic matrix is rebuilt on every call, which runs
    # inference over the whole corpus.
    sims = jensen_shannon(topic_dist, doc_topic_matrix())
    nearest = sims.argsort()[:k].tolist()
    return get_document(nearest)


def retrieval(doc):
    """
    Retrieve the corpus documents most similar to a raw text document.

    The text goes through the same cleaning (``preprocessed_doc``) and
    processing (``processed_doc``) steps as the corpus before the
    similarity search in ``get_similar_docs``.
    """
    tokens = preprocessed_doc(doc)
    processed = processed_doc(tokens)
    return get_similar_docs(processed)


In [11]:
# Query text passed to ``retrieval`` in the next cell.
# NOTE(review): as saved, this literal holds only newlines, so the query
# yields no tokens -- presumably the text was cleared before sharing; confirm.
doc1 = '''


'''

In [12]:
# Run the full retrieval pipeline on the query and show the closest
# document (first entry of the returned list).
similar_doc1 = retrieval(doc1)
print(similar_doc1[0])

Subject: Cubs mailing list
From: andrew@dark.side.of.the.moon.uoknor.edu (Chihuahua Charlie)
Distribution: usa
Organization: OU - Academic User Services
Nntp-Posting-Host: loopback.uoknor.edu
News-Software: VAX/VMS VNEWS 1.41    Lines: 14
Lines: 14


	Is there anyone out there running a Chicago National
	League Ballclub list?  If so, please send me information
	on it to...
			andrew@aardvark.ucs.uoknor.edu

	Thanks!

|\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
|O|  _    |  Chihuahua Charlie              |  OU is not responsible   |O|
|O| | |   |  Academic User Services         |  for anything anywhere,  |O|
|O| ||||  |  The University of Oklahoma     |  except for that one     |O|
|O|  |_|  |  andrew@aardvark.ucs.uoknor.edu |  incident where 200...   |O|
|O|____________________________________________________________________|O|

