In [8]:
import pandas
import re
import gensim
import spacy
import numpy as np
import pyLDAvis.gensim
from nltk.corpus import stopwords
from pprint import pprint
from gensim.test.utils import datapath
from scipy.stats import entropy

In [9]:
remove_lines = ["It's not an attachment -- it's stored online. To open this item, just clickthe link above.",
                "I've shared an item with you:"]


def tokenizing(doc):
    """
    Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
    """
    return gensim.utils.simple_preprocess(str(doc), deacc=True, min_len=4, max_len=15)


def preprocessed_doc(doc):
    """
    Preprocessing a single document
    """
    for line in remove_lines:
        doc = doc.replace(line, '')
    
    doc = re.sub(r'https?\S*\s?', '', doc)
    doc = re.sub(r'\s\S*\.ac\.in\S*\s?', ' ', doc)
    doc = re.sub(r'\s\S*\.edu\S*\s?', ' ', doc)
    doc = re.sub(r'\s\S*\.com\S*\s?', ' ', doc)
    doc = re.sub(r'www\.\S*\s?', '', doc)
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    doc = re.sub(r'\s+', ' ', doc)    
    doc = tokenizing(doc)
    return doc
    
    
def preprocessed_data(data):
    """
    Preprocessing the entire data (list of documents)
    """
    return [preprocessed_doc(doc) for doc in data]


df = pandas.read_json('data/processed_iitdh_broadcast-3.json')
data = df.content.values.tolist()
tokenized_data = preprocessed_data(data)


In [10]:
stopwords_ = ['dharwad', 'regard', 'iit', 'please', 'student',
              'institute', 'write', 'thank', 'form', 'technology',
              'would', 'year', 'time', 'follow', 'email', 'room', 'fill', 'date',
              'also', 'indian', 'engineering', 'give', 'get', 'day', 'work', 'india',
              'detail', 'mail', 'interested', 'use', 'request', 'guy', 'may',
              'not', 'link', 'like', 'take', 'make', 'still', 'since', 'keep',
              'secretary', 'come', 'one', 'professor', 'today', 'good',
              'find', 'tomorrow', 'first', 'send', 'system', 'start',
              'information', 'member', 'prof', 'part', 'regretted' 'registrar'
              'new', 'venue', 'attend', 'kindly', 'aug', '-PRON-',
              'image', 'provide', 'well', 'visit', 'do', 'inconvenience'
              'assistant_professor', 'want', 'contact', 'go', 'meet',
              'invite', 'name', 'need', 'opportunity', 'attach',
              'everyone', 'google', 'receive', 'conduct', 'great', 'note', 'affair',
              'available', 'number', 'august', 'many', 'college', 'share',
              'help', 'programme', 'walmi_campus', 'belur_industrial',
              'know', 'hold', 'participation', 'march', 'group', 'walmi'
              'rule', 'see', 'learn', 'hello_everyone', 'indore', 'campus'
              'challenge', 'phd', 'present', 'people', 'saturday', 'open', 'hope',
              'april', 'jan', 'online', 'issue', 'require', 'play', 'thing',
              'back', 'faculty', 'dean', 'summer', 'join', 'next', 'inform',
              'base', 'topic', 'hello', 'session', 'idea', 'model', 'participant',
              'post', 'last', 'change', 'round', 'karnataka_india', 'title', 'list',
              'two', 'bring', 'access', 'app', 'include', 'click', 'sincerely', 'learn'
              'view', 'hour', 'close', 'question', 'create', 'third', 'front', 'arpit', 'agrawal', 'btech',
              'high', 'week', 'gentle_reminder', 'september', 'service', 'area', 'morning',
              'timing', 'inter', 'from', 'subject', 'text', 'fwd', 'forwarded', 'message', 'dear', 'f',
              'iitdh', 'iitg', 'deputation', 'three', 'become', 'karnataka', 'select', 'wbw_prasanna',
              'unsubscribe', 'pron', 'hey', 'cse', 'january', 'february', 'march', 'april', 'june', 'july',
              'august', 'september', 'october', 'november', 'december', 'larsen', 'tourbo', 'tourbo_chair',
              'bombay', 'lot', 'sit', 'try', 'ive_invite', 'till', 'every', 'never', 'near', 'welfare', 'techf',
              'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'secy', 'could',
              'student', 'department', 'first', 'second', 'four', 'feel', 'reminder', 'gentle',  'sure',
              'sign', 'photo', 'welcome', 'assistant', 'already', 'sorry', 'small', 'attachment', 'ever']

stop_words = stopwords.words('english')
stop_words.extend(stopwords_)


In [16]:
bigram = gensim.models.Phrases(tokenized_data, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
nlp = spacy.load('en', disable=['parser', 'ner'])


def remove_stopwords(doc):
    """
    Removes stopwords from a document
    """
    return [word for word in doc
            if word not in stop_words]


def make_bigrams(doc):
    """
    Make bigrams of a document
    """
    return bigram_mod[doc]


def lemmatization(doc):
    """
    Lemmatizes a document
    """
    doc = nlp(" ".join(doc))
    return [token.lemma_ for token in doc]


def processed_doc(doc):
    """
    Processing a document
    """
    doc = remove_stopwords(doc)
    doc = make_bigrams(doc)
    doc = lemmatization(doc)
    doc = remove_stopwords(doc)
    return doc


def processed_data(data):
    """
    Return lemmatized data
    """
    return [processed_doc(doc) for doc in data]

lemmatized_data = processed_data(tokenized_data)

In [17]:
num_of_topics = 10
lda_model = gensim.models.LdaModel.load(datapath('iitdh_bow_10_0.01_0.61'))
id2word = gensim.corpora.Dictionary(lemmatized_data)
corpus = corpus = [id2word.doc2bow(text) for text in lemmatized_data]

tfidf = gensim.models.TfidfModel(corpus)
corpus = tfidf[corpus]


In [18]:
def print_topics():
    """
    Prints top 30 words from each topic
    """
    pprint(lda_model.show_topics(num_of_topics, 30))


def get_doc_topic_distribution(index):
    """
    Get the topic distribution for a given document
    """
    doc = corpus[index]
    return lda_model.get_document_topics(doc)


def get_topic_term():
    """
    Get topic-term matrix
    """
    topic_term_matrix = lda_model.get_topics()
    print(len(topic_term_matrix), 'x', len(topic_term_matrix[0]))
    return topic_term_matrix


def get_term_topics(word_id):
    """
    Get the most relevant topics to the given word
    """
    print('Word is: ', id2word[word_id])
    relevant_topics = lda_model.get_term_topics(word_id)
    relevant_topic_ids = [topic_id[0] for topic_id in relevant_topics]
    for topic_id in relevant_topic_ids:
        print(show_topic(topic_id))
    return relevant_topics


def unseen_doc_topic_distribution(new_doc: list):
    """
        Get topic distribution of a unseen document
    """
    new_doc = [id2word.doc2bow(new_doc)]
    topics = lda_model[new_doc]
    return sorted(topics[0][0], key=lambda x: x[1], reverse=True)


def doc_topic_matrix():
    """
    Get document topic matrix
    """
    matrix = np.zeros(shape=(len(corpus), num_of_topics))
    for i in range(len(corpus)):
        topic_dist = get_doc_topic_distribution(i)
        for topic, prob in topic_dist:
            matrix[i][topic] = prob

    return matrix


def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M where M is the number of documents in the corpus
    """
    p = query[None, :].T
    q = matrix.T
    m = 0.5 * (p + q)
    # entropy calculated KL divergence
    return np.sqrt(0.5 * (entropy(p, m) + entropy(q, m)))


def get_document(index):
    """
    Index is list or num
    """
    if isinstance(index, list):
        docs = []
        for i in index:
            docs.append(data[i])
        return docs
    return data[index]


def get_similar_docs(new_doc: list):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    topics = unseen_doc_topic_distribution(new_doc)
    topic_dist = np.zeros(shape=(num_of_topics))
    for topic, prob in topics:
        topic_dist[topic] = prob
    sims = jensen_shannon(topic_dist, doc_topic_matrix())
    docs = list(sims.argsort()[:10])
    return get_document(docs)


def retrieval(doc):
    doc = preprocessed_doc(doc)
    doc = processed_doc(doc)
    return get_similar_docs(doc)


In [19]:
doc = '''Dear Sir/Madam,
Invitation for Diwali 2019 celebration

Namaskar,

On this auspicious occasion of the festival of lights, let us all 
come together to celebrate this rich blend of cultures. We plan to make this 
festive weekend of Diwali joyful, fun and memorable. We have a whole lot of events 
lined up for all of you and hopefully all of us can have a great time. 
The agenda for the weekend i.e., this Saturday( 26-10-19) and Sunday(27-10-19) is as follows-

Regards,
Diwali Organising team,
'''

In [20]:
similar_docs = retrieval(doc)
print(similar_docs[0])

Subject: DevHack WhatsApp Group
Text: [image: unnamed.png]
👏👏Stop those mosquitoes that are buzzing,
and catch some sleep tonight
as* DevHack *is coming!

Click on this link <https://chat.whatsapp.com/HGy0O8FJb9m8BZoxQSiLeb> to
join the *DevHack WhatsApp group *for quick communication during the
hackathon.


😪😪Buenas Noche

Sonu Sourav
DevHack Team



In [40]:
print(similar_docs[6])

Subject: Re: Registration for the Co-Op program
Text: Some answers to general queries regarding the co-op program:
The co-op program is for the fourth year students since they are having a
BTP this semester.
Note that it is for the coming semester (August to November 2019). For next
semester, the form has to be filled again.
For students who has been allocated a btp and who are  opting for the Co-Op
program, the corresponding project will be allocated to another student at
the discretion of the faculty members.
Try to fill the form as soon as possible. Send the scanned pdf of the
filled form through mail to CDC Support.

Thanking you
Yours sincerely
Riya Toteja
CDC Support


On Thu, Jul 25, 2019 at 9:53 AM Career Development Cell Support <
cdc.support@iitdh.ac.in> wrote:

> Hello Everyone
>
> This is to inform all the students who are currently on an internship. If
> you wish to convert your   internship to a Co-Op project for the next
> semester,  discuss with the company and also ide