In [1]:
import pandas
import re
import gensim
import spacy
import numpy as np
import pyLDAvis.gensim
from nltk.corpus import stopwords
from pprint import pprint
from gensim.test.utils import datapath
from scipy.stats import entropy

In [2]:
# Boilerplate sentences that preprocessed_doc() deletes verbatim from every document.
remove_lines = [
    "It's not an attachment -- it's stored online. To open this item, just clickthe link above.",
    "I've shared an item with you:",
]


def tokenizing(doc):
    """
        Tokenise one document with gensim's simple_preprocess.

        The input is coerced to str first. Accent stripping is enabled (deacc=True)
        and only tokens whose length is between 4 and 15 characters are kept.
        Returns the resulting list of tokens.
    """
    text = str(doc)
    tokens = gensim.utils.simple_preprocess(text, deacc=True, min_len=4, max_len=15)
    return tokens


def preprocessed_doc(doc):
    """
        Clean a single raw document and return its tokens.

        Steps, in order: delete every sentence listed in remove_lines, strip
        http/https URLs, strip whitespace-preceded words containing .ac.in, .edu
        or .com, strip www. addresses, delete every character that is neither an
        ASCII letter nor whitespace, collapse whitespace runs to a single space,
        and finally tokenise the remaining text with tokenizing().
    """
    for boilerplate in remove_lines:
        doc = doc.replace(boilerplate, '')

    # (pattern, replacement) pairs; they are applied top to bottom and the order matters.
    substitutions = (
        (r'https?\S*\s?', ''),
        (r'\s\S*\.ac\.in\S*\s?', ' '),
        (r'\s\S*\.edu\S*\s?', ' '),
        (r'\s\S*\.com\S*\s?', ' '),
        (r'www\.\S*\s?', ''),
        (r'[^a-zA-Z\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        doc = re.sub(pattern, replacement, doc)

    return tokenizing(doc)
    
    
def preprocessed_data(data):
    """
        Run preprocessed_doc over every document in data.

        Returns a list with one preprocessed result per input document, in input order.
    """
    cleaned = []
    for raw_doc in data:
        cleaned.append(preprocessed_doc(raw_doc))
    return cleaned


# Read the e-mail dump, pull the `content` column out as a plain Python list
# and run the cleaning/tokenising pass over every entry.
df = pandas.read_json('data/processed_iitdh_broadcast-3.json')
data = df['content'].values.tolist()
tokenized_data = preprocessed_data(data)


In [3]:
# Corpus-specific stopwords; they are appended to NLTK's English list right below.
# Every entry must be followed by a comma: Python silently concatenates adjacent
# string literals, which previously merged e.g. 'regretted' 'registrar' 'new' into
# the single useless entry 'regrettedregistrarnew'.
# NOTE(review): anything derived from the old list (dictionary, trained model)
# may need to be regenerated now that these words are actually filtered.
stopwords_ = ['dharwad', 'regard', 'iit', 'please', 'student',
              'institute', 'write', 'thank', 'form', 'technology',
              'would', 'year', 'time', 'follow', 'email', 'room', 'fill', 'date',
              'also', 'indian', 'engineering', 'give', 'get', 'day', 'work', 'india',
              'detail', 'mail', 'interested', 'use', 'request', 'guy', 'may',
              'not', 'link', 'like', 'take', 'make', 'still', 'since', 'keep',
              'secretary', 'come', 'one', 'professor', 'today', 'good',
              'find', 'tomorrow', 'first', 'send', 'system', 'start',
              'information', 'member', 'prof', 'part', 'regretted', 'registrar',
              'new', 'venue', 'attend', 'kindly', 'aug', '-PRON-',
              'image', 'provide', 'well', 'visit', 'do', 'inconvenience',
              'assistant_professor', 'want', 'contact', 'go', 'meet',
              'invite', 'name', 'need', 'opportunity', 'attach',
              'everyone', 'google', 'receive', 'conduct', 'great', 'note', 'affair',
              'available', 'number', 'august', 'many', 'college', 'share',
              'help', 'programme', 'walmi_campus', 'belur_industrial',
              'know', 'hold', 'participation', 'march', 'group', 'walmi',
              'rule', 'see', 'learn', 'hello_everyone', 'indore', 'campus',
              'challenge', 'phd', 'present', 'people', 'saturday', 'open', 'hope',
              'april', 'jan', 'online', 'issue', 'require', 'play', 'thing',
              'back', 'faculty', 'dean', 'summer', 'join', 'next', 'inform',
              'base', 'topic', 'hello', 'session', 'idea', 'model', 'participant',
              'post', 'last', 'change', 'round', 'karnataka_india', 'title', 'list',
              'two', 'bring', 'access', 'app', 'include', 'click', 'sincerely', 'learn',
              'view', 'hour', 'close', 'question', 'create', 'third', 'front', 'arpit', 'agrawal', 'btech',
              'high', 'week', 'gentle_reminder', 'september', 'service', 'area', 'morning',
              'timing', 'inter', 'from', 'subject', 'text', 'fwd', 'forwarded', 'message', 'dear', 'f',
              'iitdh', 'iitg', 'deputation', 'three', 'become', 'karnataka', 'select', 'wbw_prasanna',
              'unsubscribe', 'pron', 'hey', 'cse', 'january', 'february', 'march', 'april', 'june', 'july',
              'august', 'september', 'october', 'november', 'december', 'larsen', 'tourbo', 'tourbo_chair',
              'bombay', 'lot', 'sit', 'try', 'ive_invite', 'till', 'every', 'never', 'near', 'welfare', 'techf',
              'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'secy', 'could',
              'student', 'department', 'first', 'second', 'four', 'feel', 'reminder', 'gentle',  'sure',
              'sign', 'photo', 'welcome', 'assistant', 'already', 'sorry', 'small', 'attachment', 'ever']

# Full stopword list: NLTK's English stopwords followed by the corpus-specific additions.
stop_words = [*stopwords.words('english'), *stopwords_]


In [4]:
def remove_stopwords(doc):
    """
        Return the tokens of doc that are not listed in stop_words, in their original order.
    """
    kept = []
    for token in doc:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def make_bigrams(doc):
    """
        Pass a tokenised document through the module-level phrase model (bigram_mod)
        and return what the model produces for it.
    """
    phrased = bigram_mod[doc]
    return phrased


def lemmatization(doc):
    """
        Lemmatise a tokenised document.

        The tokens are joined with single spaces, run through the nlp pipeline,
        and the lemma_ of every resulting token is returned as a list.
    """
    text = " ".join(doc)
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return lemmas


def processed_doc(doc):
    """
        Run the full token-level pipeline on one tokenised document.

        Stopwords are removed, bigrams are formed, the tokens are lemmatised and
        stopwords are removed a second time (on the lemmatised tokens).
    """
    stages = (remove_stopwords, make_bigrams, lemmatization, remove_stopwords)
    for stage in stages:
        doc = stage(doc)
    return doc


def processed_data(data):
    """
        Apply processed_doc to every tokenised document in data and return the results as a list.
    """
    return list(map(processed_doc, data))


# Fit a gensim Phrases model on the tokenised corpus (min_count=5, threshold=100)
# and wrap it in a Phraser; make_bigrams() indexes into bigram_mod.
bigram = gensim.models.Phrases(tokenized_data, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
# spaCy pipeline used by lemmatization(); the parser and NER components are disabled.
# NOTE(review): the 'en' shortcut name is presumably only valid on older spaCy
# releases -- confirm against the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Stopword removal, bigram formation and lemmatisation for every tokenised document.
lemmatized_data = processed_data(tokenized_data)


In [37]:
# Number of topics; used to size the document-topic matrix and when printing topics.
num_of_topics = 10

# Token <-> id dictionary and bag-of-words corpus built from the lemmatised documents.
# `corpus` is read by get_doc_topic_distribution() and doc_topic_matrix() but was not
# defined anywhere, so those functions raised NameError on a fresh kernel.
# NOTE(review): both must match the vocabulary the saved model was trained with -- confirm.
id2word = gensim.corpora.Dictionary(lemmatized_data)
corpus = [id2word.doc2bow(tokens) for tokens in lemmatized_data]

# FIXME: datapath() needs the file name of the saved model as its argument; calling it
# with no argument fails. The name is not recoverable from this notebook.
lda_model = gensim.models.ldamodel.LdaModel.load(datapath())

In [45]:
def print_topics():
    """
        Pretty-print num_of_topics topics of lda_model, each with its top 30 words.
    """
    topics = lda_model.show_topics(num_topics=num_of_topics, num_words=30)
    pprint(topics)


def get_doc_topic_distribution(index):
    """
        Return lda_model's topic distribution for the corpus document at position index.
    """
    return lda_model.get_document_topics(corpus[index])


def get_topic_term():
    """
        Fetch the topic-term matrix from lda_model, print its dimensions
        as "<rows> x <columns>" and return the matrix.
    """
    matrix = lda_model.get_topics()
    n_rows = len(matrix)
    n_cols = len(matrix[0])
    print(n_rows, 'x', n_cols)
    return matrix


def get_term_topics(word_id):
    """
        Get the most relevant topics to the given word

        Prints the word itself, then the terms of every relevant topic, and
        returns the value produced by lda_model.get_term_topics(word_id).

        word_id: id of the word, used as a key into id2word
    """
    print('Word is: ', id2word[word_id])
    relevant_topics = lda_model.get_term_topics(word_id)
    for entry in relevant_topics:
        # The first element of each entry is used as the topic id. show_topic is a
        # method of the model; no module-level function of that name exists here.
        print(lda_model.show_topic(entry[0]))
    return relevant_topics


def unseen_doc_topic_distribution(new_doc: list):
    """
        Infer the topic distribution of a document that is not part of the corpus.

        new_doc: list of tokens; it is converted to bag-of-words with id2word.
        Returns the model's entries for the document sorted by their second
        element (the probability), highest first.
    """
    bow_batch = [id2word.doc2bow(new_doc)]
    ranked = list(lda_model[bow_batch][0])
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


def doc_topic_matrix():
    """
        Build the dense document-topic matrix for the whole corpus.

        Returns an array of shape (len(corpus), num_of_topics). Row i holds the
        probabilities reported by get_doc_topic_distribution(i); topics that are
        not reported for a document stay at 0.
    """
    n_docs = len(corpus)
    matrix = np.zeros(shape=(n_docs, num_of_topics))
    for row in range(n_docs):
        for topic, prob in get_doc_topic_distribution(row):
            matrix[row, topic] = prob

    return matrix


def jensen_shannon(query, matrix):
    """
        Jensen-Shannon distance between one topic distribution and every row of a
        document-topic matrix.

        query: 1-D array, the topic distribution of the query document
        matrix: 2-D array with one row of topic probabilities per corpus document
        Returns an array of length M, where M is the number of rows of matrix.
    """
    query_col = np.transpose(query[np.newaxis, :])
    docs_t = np.transpose(matrix)
    mixture = 0.5 * (query_col + docs_t)
    # scipy's entropy(a, b) with two arguments is the KL divergence of a from b
    kl_query = entropy(query_col, mixture)
    kl_docs = entropy(docs_t, mixture)
    return np.sqrt(0.5 * (kl_query + kl_docs))


def get_document(index):
    """
        Look up raw documents in data.

        index: either a single position or a list of positions.
        Returns a list of documents for a list argument, otherwise the single document.
    """
    if not isinstance(index, list):
        return data[index]
    return [data[i] for i in index]


def get_similar_docs(new_doc: list, k: int = 5):
    """
        Return the k corpus documents most similar to a new, already tokenised document.

        The topic distribution of new_doc is compared with the topic distribution of
        every corpus document using jensen_shannon(), and the documents with the
        smallest distances are returned, closest first.

        new_doc: list of tokens of the query document
        k: number of documents to return (default 5, the previously hard-coded value)
    """
    topic_dist = np.zeros(shape=(num_of_topics))
    for topic, prob in unseen_doc_topic_distribution(new_doc):
        topic_dist[topic] = prob
    sims = jensen_shannon(topic_dist, doc_topic_matrix())
    # argsort is ascending, so the first k positions are the k smallest distances
    docs = list(sims.argsort()[:k])
    return get_document(docs)


def retrieval(doc):
    """
        End-to-end lookup for a raw document: clean and tokenise it with
        preprocessed_doc, run it through processed_doc, and return the result
        of get_similar_docs for the processed tokens.
    """
    return get_similar_docs(processed_doc(preprocessed_doc(doc)))


In [47]:
doc = '''Dear Sir/Madam,

 

Atal Bihari Vajpayee-Indian Institute of Information Technology and Management Gwalior (ABV-IIITM Gwalior) is seeking admission for a PhD programme in the field of Engineering Technology (CS & IT, EC), Management and Applied Sciences (Mathematics & Physics).

Please find the attached programme brochure with specific programme details. We would like to request you to kindly circulate the brochure in your Institute so that the information may reach to the prospective research scholars.  

Sincerely yours,

Pankaj Gupta

Joint Registrar (Academics)

 

About the Institute:

Atal Bihari Vajpayee-Indian Institute of Information Technology and Management Gwalior (ABV-IIITM Gwalior), is an apex Information Technology (IT) and Management Institute, established by the Government of India. ABV-IIITM Gwalior has been declared an Institute of National importance. The Institute strives to become a world-class Institution which endeavors to carve young minds through teaching and research and develop them as tomorrow's leaders. The Institute's mandate is to create Information Technology enabled Management solutions for nation building. The Institute offers various programmes at UG/PG and Doctoral level
'''

In [48]:
# Look up the corpus documents closest to the sample message and print the first one returned.
similar_docs = retrieval(doc)
print(similar_docs[0])

Subject: Arduino workshop
Text: Hi everyone,
Those who are going to stay for Arduino workshop as Mentees, put your name
and roll number in the google form.

The names in the list will be finalized for group making and will be used
for the certificates.


https://goo.gl/forms/pLBCU9fWX6ZNJmMU2

