# Initialization

Import Dependencies

In [1]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import nltk
import math
import operator
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
import string
import json
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
np.random.seed(2018)
nltk.download('wordnet')
import random

Mounted at /content/drive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Dataset and Dataframes

In [40]:
#json file has complete data with column attributes
dataset = pd.read_json('/content/drive/My Drive/IR Assignment/Dataset/News_Category_Dataset_v2.json', lines = True)

#total news articles from corpus
num_news = 200000

headlines = dataset[:num_news]['headline'].to_numpy()
short_desc = dataset[:num_news]['short_description'].to_numpy()
categories = dataset[:num_news]['category'].to_numpy()
links = dataset[:num_news]['link'].to_numpy()

#combines headlines and short description as the basis of our document
comb_news = headlines + '. ' + short_desc

print(headlines[0])
print(short_desc[0])
print(comb_news[0])

#total characters that should be present in each document
min_len_file = 60

print(dataset.shape)

new_comb_news = []
new_headlines = []
new_short_desc = []
new_links = []

#discards documents having length less that minimum specified len above
for ind in range(len(comb_news)):

  if len(comb_news[ind]) > min_len_file:
    new_comb_news.append(comb_news[ind])
    new_headlines.append(headlines[ind])
    new_short_desc.append(short_desc[ind])
    new_links.append(links[ind])

comb_news = new_comb_news
headlines = new_headlines
short_desc = new_short_desc
links = new_links

#Renaming document to 'review'
review = comb_news

#collects the categories of news present
(unique, counts) = np.unique(categories, return_counts=True)
frequencies = np.asarray((unique, counts)).T

#total docs actually present in corpus
print(len(comb_news))

There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV
She left her husband. He killed their children. Just another day in America.
There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV. She left her husband. He killed their children. Just another day in America.
(200853, 6)
188042


# Preprocessing data

Stemming and tokenization

In [3]:
'''
Preprocessing function - Porter stemmer
'''
def preprocess_docs(data):
    processed_data=[]
    stemmer = PorterStemmer()
    for d in data:
        tempf=[]
        temp = d
        temp = ''.join(c for c in temp if c not in string.punctuation)
        temp = nltk.word_tokenize(temp)
        temp = [w.lower() for w in temp]
        temp = [t for t in temp if t not in nltk.corpus.stopwords.words('english')]
        for word in temp:
          tempf.append(stemmer.stem(word))
        processed_data.append(tempf)
    return processed_data

In [4]:
#running over entire data
review = preprocess_docs(review)
headlines = preprocess_docs(headlines)
short_desc = preprocess_docs(short_desc)
print(review[2])
print(headlines[2])
print(short_desc[2])

['hugh', 'grant', 'marri', 'first', 'time', 'age', '57', 'actor', 'longtim', 'girlfriend', 'anna', 'eberstein', 'tie', 'knot', 'civil', 'ceremoni']
['hugh', 'grant', 'marri', 'first', 'time', 'age', '57']
['actor', 'longtim', 'girlfriend', 'anna', 'eberstein', 'tie', 'knot', 'civil', 'ceremoni']


In [None]:
for ind in range(len(review)):
  for term in review[ind]:

    if term == "'":
      print(review[ind])
      break

# LDA - Topic Modelling

gensim Bag of Words

In [5]:
"""
Topic modeling is a type of statistical modeling for discovering the abstract “topics” 
that occur in a collection of documents. 
Latent Dirichlet Allocation (LDA) is an example of topic model and is used to classify 
text in a document to a particular topic. It builds a topic per document model and words 
per topic model, modeled as Dirichlet distributions.
"""

# Create a dictionary from ‘review’ list containing the number of times a word appears in the training set
dictionary = gensim.corpora.Dictionary(review)

"""
Filter out tokens that appear in
less than 15 documents (absolute number) or
more than 0.5 documents (fraction of total corpus size, not absolute number).
after the above two steps, keep only the first 100000 most frequent tokens.
"""
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

"""
For each document we create a dictionary reporting how many
words and how many times those words appear. 
Saving this to ‘bow_corpus’, then check our selected document earlier.
"""

bow_corpus = [dictionary.doc2bow(doc) for doc in review]
print(bow_corpus[121])

# Preview Bag Of Words for our sample preprocessed document
bow_doc_121 = bow_corpus[121]
for i in range(len(bow_doc_121)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_121[i][0], dictionary[bow_doc_121[i][0]], bow_doc_121[i][1]))

[(52, 1), (103, 1), (270, 1), (346, 2), (389, 2), (518, 1), (555, 1), (720, 1), (828, 1), (839, 1), (897, 1), (969, 1), (1106, 1), (1107, 1), (1108, 2), (1109, 1), (1110, 1), (1111, 1)]
Word 52 ("new") appears 1 time.
Word 103 ("like") appears 1 time.
Word 270 ("thing") appears 1 time.
Word 346 ("elect") appears 2 time.
Word 389 ("charg") appears 2 time.
Word 518 ("prove") appears 1 time.
Word 555 ("basic") appears 1 time.
Word 720 ("welcom") appears 1 time.
Word 828 ("sure") appears 1 time.
Word 839 ("seem") appears 1 time.
Word 897 ("compani") appears 1 time.
Word 969 ("rule") appears 1 time.
Word 1106 ("ad") appears 1 time.
Word 1107 ("commiss") appears 1 time.
Word 1108 ("facebook") appears 2 time.
Word 1109 ("kind") appears 1 time.
Word 1110 ("privat") appears 1 time.
Word 1111 ("regul") appears 1 time.


Running LDA using Bag of Words

In [6]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=30, workers=3)

# For each topic, we will explore the words occuring in that topic and its relative weight
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.020*"chang" + 0.010*"way" + 0.009*"us" + 0.009*"make" + 0.008*"social" + 0.008*"work" + 0.008*"need" + 0.007*"mani" + 0.007*"peopl" + 0.007*"world"
Topic: 1 
Words: 0.045*"trump" + 0.028*"presid" + 0.018*"donald" + 0.017*"obama" + 0.015*"’" + 0.014*"campaign" + 0.014*"republican" + 0.013*"clinton" + 0.012*"elect" + 0.011*"say"
Topic: 2 
Words: 0.036*"hous" + 0.029*"news" + 0.026*"white" + 0.015*"video" + 0.011*"tweet" + 0.011*"interview" + 0.010*"show" + 0.010*"comment" + 0.010*"week" + 0.010*"michel"
Topic: 3 
Words: 0.046*"name" + 0.011*"2011" + 0.011*"cheat" + 0.010*"affair" + 0.009*"harri" + 0.008*"salad" + 0.008*"bear" + 0.008*"wood" + 0.008*"closet" + 0.008*"milk"
Topic: 4 
Words: 0.017*"day" + 0.016*"live" + 0.015*"gay" + 0.013*"black" + 0.013*"peopl" + 0.011*"commun" + 0.011*"celebr" + 0.010*"medit" + 0.010*"right" + 0.009*"world"
Topic: 5 
Words: 0.019*"get" + 0.015*"thing" + 0.015*"know" + 0.014*"like" + 0.014*"dont" + 0.014*"make" + 0.012*"one" + 0.012*"wa

Constructing LDA topic-score for each news

In [7]:
"""
example of an entry to doc_lda

doc_lda =
{
  doc_num_0:
  {
    topic_id_0:score,
    topic_id_1:score,
    topic_id_5:score
  }
}
"""

doc_lda = {}

for doc_num in range(len(review)):
  curr_bow = bow_corpus[doc_num]
  doc_lda.update({doc_num:{}})

  for index, score in sorted(lda_model[curr_bow], key=lambda tup: -1*tup[1]):
    doc_lda[doc_num].update({index:score})

print(doc_lda[121])



{15: 0.3783571, 10: 0.2275307, 5: 0.18011914, 1: 0.12827581, 4: 0.051626347}


# Master Dictionary - Index Construction

Constructing Master Dictionary

In [8]:
"""
example of an entry to master dictionary
master_dict = 
{
  'dog':
  {
    doc_number:
    {
      'tf':10, 
      'df':15, 
      'tf-idf':0.6
    }
  }
}
"""

master_dict = {}

orig_review = review.copy()

#construct with headlines first
review = headlines

tf_factor = 2

# updating tf using headlines with weighted tf
for doc_num in range(len(review)):
  curr_review = review[doc_num]

  for curr_term in curr_review:

    if curr_term in master_dict:

      if doc_num in master_dict[curr_term]:
        master_dict[curr_term][doc_num]['tf'] += 1*tf_factor
      else:
        #put the doc_id for that term in master if doc_id not present
        master_dict[curr_term].update({doc_num:{'tf':1*tf_factor, 'df':0, 'tf-idf':0.1}})

    # put the term in master
    else:
      master_dict.update({curr_term:{doc_num:{'tf':1*tf_factor, 'df':0, 'tf-idf':0.1}}})


# updating tf using short description next
review = short_desc
for doc_num in range(len(review)):
  curr_review = review[doc_num]

  for curr_term in curr_review:

    if curr_term in master_dict:

      if doc_num in master_dict[curr_term]:
        master_dict[curr_term][doc_num]['tf'] += 1
      else:
        #put the doc_id for that term in master if doc_id not present
        master_dict[curr_term].update({doc_num:{'tf':1, 'df':0, 'tf-idf':0.1}})

    # put the term in master
    else:
      master_dict.update({curr_term:{doc_num:{'tf':1, 'df':0, 'tf-idf':0.1}}})

# restoring combined news
review = orig_review

#master_dict
print(master_dict['trump'])
 

{4: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 6: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 11: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 13: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 14: {'tf': 4, 'df': 0, 'tf-idf': 0.1}, 15: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 16: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 19: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 23: {'tf': 3, 'df': 0, 'tf-idf': 0.1}, 37: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 38: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 57: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 60: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 65: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 73: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 76: {'tf': 3, 'df': 0, 'tf-idf': 0.1}, 85: {'tf': 3, 'df': 0, 'tf-idf': 0.1}, 86: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 90: {'tf': 3, 'df': 0, 'tf-idf': 0.1}, 91: {'tf': 3, 'df': 0, 'tf-idf': 0.1}, 94: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 95: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 97: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 99: {'tf': 3, 'df': 0, 'tf-idf': 0.1}, 100: {'tf': 2, 'df': 0, 'tf-idf': 0.1}, 101: {'tf': 2, 'df': 0, '

updating df and tf-idf values

In [9]:

total_reviews = len(review)

for curr_term in master_dict:

  for doc_num in master_dict[curr_term]:

    # len(master_dict[curr_term]) stores total doc numbers that term has or total docs in which that term present
    master_dict[curr_term][doc_num]['df'] = len(master_dict[curr_term])

    # calculates tf-idf using formula
    master_dict[curr_term][doc_num]['tf-idf'] = (master_dict[curr_term][doc_num]['tf'])*(math.log10(total_reviews/master_dict[curr_term][doc_num]['df']))

print(master_dict['trump'])

#total unique words in corpus
print(len(master_dict))


{4: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 6: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 11: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 13: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 14: {'tf': 4, 'df': 14628, 'tf-idf': 4.436279639987616}, 15: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 16: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 19: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 23: {'tf': 3, 'df': 14628, 'tf-idf': 3.3272097299907117}, 37: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 38: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 57: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 60: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 65: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 73: {'tf': 2, 'df': 14628, 'tf-idf': 2.218139819993808}, 76: {'tf': 3, 'df': 14628, 'tf-idf': 3.3272097299907117}, 85: {'tf': 3, 'df': 14628, 'tf-idf': 3.3272097299907117}, 86: {'tf': 2, 'df': 14628, 't

L2 Norm Normalisation of tf-idf

In [10]:
norm_factor = {}

for curr_term in master_dict:

  for doc_num in master_dict[curr_term]:

    if doc_num in norm_factor:
      norm_factor[doc_num] += master_dict[curr_term][doc_num]['tf-idf']*master_dict[curr_term][doc_num]['tf-idf']
    else:
      norm_factor.update({doc_num: 0.2})
      norm_factor[doc_num] = master_dict[curr_term][doc_num]['tf-idf']*master_dict[curr_term][doc_num]['tf-idf']

# square root the norm factors

for doc_num in norm_factor:
  norm_factor[doc_num] = math.sqrt(norm_factor[doc_num])

# normalise tf-idf of master dictionary

for curr_term in master_dict:

  for doc_num in master_dict[curr_term]:

    master_dict[curr_term][doc_num]['tf-idf'] = master_dict[curr_term][doc_num]['tf-idf']/norm_factor[doc_num]



Save my files and load

In [41]:
import pickle

#save objects
def store_data():

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/comb_news.pickle","wb")
  pickle.dump(comb_news, pickle_out)
  pickle_out.close()

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/review.pickle","wb")
  pickle.dump(review, pickle_out)
  pickle_out.close()

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/master_dict.pickle","wb")
  pickle.dump(master_dict, pickle_out)
  pickle_out.close()

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/total_reviews.pickle","wb")
  pickle.dump(total_reviews, pickle_out)
  pickle_out.close()

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/dictionary.pickle","wb")
  pickle.dump(dictionary, pickle_out)
  pickle_out.close()

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/lda_model.pickle","wb")
  pickle.dump(lda_model, pickle_out)
  pickle_out.close()

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/doc_lda.pickle","wb")
  pickle.dump(doc_lda, pickle_out)
  pickle_out.close()

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/links.pickle","wb")
  pickle.dump(links, pickle_out)
  pickle_out.close()

store_data()


#load objects
def load_data():

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/comb_news.pickle","rb")
  temp_comb_news = pickle.load(pickle_in)
  pickle_in.close()

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/review.pickle","rb")
  temp_review = pickle.load(pickle_in)
  pickle_in.close()

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/master_dict.pickle","rb")
  temp_master_dict = pickle.load(pickle_in)
  pickle_in.close()

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/total_reviews.pickle","rb")
  temp_total_reviews = pickle.load(pickle_in)
  pickle_in.close()

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/dictionary.pickle","rb")
  temp_dictionary = pickle.load(pickle_in)
  pickle_in.close()

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/lda_model.pickle","rb")
  temp_lda_model = pickle.load(pickle_in)
  pickle_in.close()

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/doc_lda.pickle","rb")
  temp_doc_lda = pickle.load(pickle_in)
  pickle_in.close()

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/links.pickle","rb")
  temp_links = pickle.load(pickle_in)
  pickle_in.close()

  return temp_comb_news,temp_review,temp_master_dict,temp_total_reviews,temp_dictionary,temp_lda_model,temp_doc_lda,temp_links

comb_news,review,master_dict,total_reviews,dictionary,lda_model,doc_lda,links = load_data()



In [45]:
#dummy cell
print(1)

1


# Extracting results from query

Function to get results

In [43]:
def getResults(myquery):

  sample_query = [myquery]

  #preprocessing query before evaulation
  sample_query = preprocess_docs(sample_query)
  query = sample_query[0]
  print(query)

  query_dict = {}

  for query_term in query:

  # query term should be present in my master dictionary
    if query_term in master_dict:

      if query_term in query_dict:
        query_dict[query_term]['tf'] += 1
        query_dict[query_term]['df'] = len(master_dict[query_term])
        query_dict[query_term]['tf-idf'] = (1+math.log10(query_dict[query_term]['tf']))*(math.log10(total_reviews/query_dict[query_term]['df']))
      else:
        query_dict.update({query_term:{'tf':1, 'df':0, 'tf-idf':0.1}})
        query_dict[query_term]['df'] = len(master_dict[query_term])
        query_dict[query_term]['tf-idf'] = (1+math.log10(query_dict[query_term]['tf']))*(math.log10(total_reviews/query_dict[query_term]['df']))


  #normalise tf-idf for query
  q_norm_factor = 0;

  for term in query_dict:
    q_norm_factor += query_dict[term]['tf-idf']*query_dict[term]['tf-idf']

  q_norm_factor = math.sqrt(q_norm_factor)

  for term in query_dict:
    query_dict[term]['tf-idf'] = query_dict[term]['tf-idf']/q_norm_factor

  #topic modelling of query
  bow_vector = dictionary.doc2bow(query)
  query_lda = {}

  for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    query_lda.update({index:score})
  
  #calculating cosine similarity


  result = {} #to store all the documents as result and their cosine similarity score

  for query_term in query_dict:

    for doc_num in master_dict[query_term]:

      if doc_num in result:
        result[doc_num] += query_dict[query_term]['tf-idf']*master_dict[query_term][doc_num]['tf-idf']
      else:
        result.update({doc_num: 0.1})
        result[doc_num] = query_dict[query_term]['tf-idf']*master_dict[query_term][doc_num]['tf-idf']

  #calculating LDA score for each doc in results
  result_LDA = {}

  for doc_num in result:
    result_LDA.update({doc_num:0})

    for topic_id in query_lda:

      if topic_id in doc_lda[doc_num]:
        #dot product of lda scores
        result_LDA[doc_num] += query_lda[topic_id]*doc_lda[doc_num][topic_id]

  #extracting results with LDA
  for doc_num in result:
    result[doc_num] += result_LDA[doc_num]/3

  sorted_result_lda = sorted(result.items(), key=operator.itemgetter(1),reverse=True)
  
  for ind in range(10):
    print(comb_news[sorted_result_lda[ind][0]])

  result_s = []
  for ind in range(10):
    cont = []
    cont.append(comb_news[sorted_result_lda[ind][0]])
    cont.append(links[sorted_result_lda[ind][0]])
    result_s.append(cont)
  
  return result_s

# Flask Application

In [12]:
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading https://files.pythonhosted.org/packages/af/6c/f54cb686ad1129e27d125d182f90f52b32f284e6c8df58c1bae54fa1adbc/flask_ngrok-0.0.25-py3-none-any.whl
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
from flask_ngrok import run_with_ngrok
from flask import render_template,request
from flask import Flask

app = Flask(__name__)
run_with_ngrok(app)   #starts ngrok when the app is run

@app.route("/")
def give_query():
    return render_template("index.html")

@app.route('/results',methods=["POST"])
def get_result():
	query = request.form.get('query')
	if not query:
		return "Query not entered"
	result_s = getResults(query)

	return render_template("result.html",q=query,res=result_s) 
  
def home():
    return "<h1>Running Flask on Google Colab!</h1>"

app.run()