# **DATA PREPARATION**

In [0]:
import os
paragraph_path = '/content/drive/My Drive/Information_Retrieval/BBC Business News/News Articles/business'
summary_path = '/content/drive/My Drive/Information_Retrieval/BBC Business News/Summaries/business'


def read_text_files(root_path):
    """Read every ``.txt`` file found under ``root_path`` into a dict.

    The directory tree is walked recursively. Each file is keyed by the part
    of its file name before the first ``.`` (``'101.txt'`` -> ``'101'``), so
    two files sharing that prefix in different sub-directories overwrite each
    other (last one visited wins).

    Args:
        root_path: directory to scan. A path that does not exist simply
            yields an empty dict, because ``os.walk`` produces nothing for it.

    Returns:
        dict mapping file id (str) to the full text content of the file.
    """
    texts = dict()
    for dir_path, _dir_names, file_names in os.walk(root_path):
        for file_name in file_names:
            # Match on the extension only, so 'notes.txt.bak' is not picked up.
            if not file_name.endswith('.txt'):
                continue
            file_id = file_name.split('.')[0]
            # The context manager closes the handle; with hundreds of files,
            # relying on garbage collection leaks descriptors.
            with open(os.path.join(dir_path, file_name), 'r') as handle:
                texts[file_id] = handle.read()
    return texts


# Articles and their reference summaries share file ids, which is how the
# evaluation code pairs them up later.
paragraph_texts_dict = read_text_files(paragraph_path)
summaries_dict = read_text_files(summary_path)

In [15]:
# Sanity check: number of articles vs. number of reference summaries.
tuple(len(corpus) for corpus in (paragraph_texts_dict, summaries_dict))

(510, 510)

In [0]:
import pickle

# Cache the article texts on Drive so later sessions can skip re-reading the raw files.
with open(
    '/content/drive/My Drive/Information_Retrieval/paragraph_texts_dict.pickle',
    'wb',
) as pickle_handle:
    pickle.dump(paragraph_texts_dict, pickle_handle)

In [0]:
import pickle

# Cache the reference summaries on Drive next to the article texts.
with open(
    '/content/drive/My Drive/Information_Retrieval/summaries_dict.pickle',
    'wb',
) as pickle_handle:
    pickle.dump(summaries_dict, pickle_handle)

In [0]:
import pickle

# Reload both corpora from their pickle caches on Drive.
# NOTE: unpickling runs code embedded in the file; only load pickles you wrote yourself.
with open(
    '/content/drive/My Drive/Information_Retrieval/summaries_dict.pickle',
    'rb',
) as summaries_handle:
    summaries_dict = pickle.load(summaries_handle)

with open(
    '/content/drive/My Drive/Information_Retrieval/paragraph_texts_dict.pickle',
    'rb',
) as paragraphs_handle:
    paragraph_texts_dict = pickle.load(paragraphs_handle)

In [4]:
# Show one article next to its reference summary as a spot check.
(
    paragraph_texts_dict['101'],
    summaries_dict['101'],
)

('Australia rates at four year high\n\nAustralia is raising its benchmark interest rate to its highest level in four years despite signs of a slowdown in the country\'s economy.\n\nThe Reserve Bank of Australia lifted interest rates 0.25% to 5.5%, their first upwards move in more than a year. However, shortly after the Bank made its decision, new figures showed a fall in economic growth in the last quarter. The Bank said it had acted to curb inflation but the move was criticised by some analysts.\n\nThe rate hike was the first since December 2003 and had been well-flagged in advance. However, opposition parties and some analysts said the move was ill-timed given data showing the Australian economy grew just 0.1% between October and December and 1.5% on an annual basis.\n\nThe figures, representing a decline from the 0.2% growth in GDP seen between July and September, were below market expectations. Consumer spending remains strong, however, and the Bank is concerned about growing infla

In [6]:
pip install sumy

Collecting sumy
[?25l  Downloading https://files.pythonhosted.org/packages/61/20/8abf92617ec80a2ebaec8dc1646a790fc9656a4a4377ddb9f0cc90bc9326/sumy-0.8.1-py2.py3-none-any.whl (83kB)
[K     |████                            | 10kB 21.2MB/s eta 0:00:01[K     |███████▉                        | 20kB 2.2MB/s eta 0:00:01[K     |███████████▊                    | 30kB 3.2MB/s eta 0:00:01[K     |███████████████▋                | 40kB 2.1MB/s eta 0:00:01[K     |███████████████████▌            | 51kB 2.6MB/s eta 0:00:01[K     |███████████████████████▍        | 61kB 3.1MB/s eta 0:00:01[K     |███████████████████████████▍    | 71kB 3.6MB/s eta 0:00:01[K     |███████████████████████████████▎| 81kB 4.0MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 3.4MB/s 
Collecting breadability>=0.1.20
  Downloading https://files.pythonhosted.org/packages/ad/2d/bb6c9b381e6b6a432aa2ffa8f4afdb2204f1ff97cfcc0766a5b7683fec43/breadability-0.1.20.tar.gz
Collecting pycountry>=18.2.23
[?25

In [0]:
import sumy
from sumy.evaluation.rouge import rouge_1,rouge_2
from sumy.nlp.tokenizers import Tokenizer

In [10]:
from textblob import TextBlob
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


# **TextRank**

In [21]:
import gensim
# Load the pre-trained word vectors through `gensim.models.KeyedVectors`:
# `WordEmbeddingsKeyedVectors` was dropped in gensim 4, whereas `KeyedVectors.load`
# is available in both the 3.x and 4.x series.
# NOTE(review): the file is unpickled under the hood; only load a model file you trust.
saved_model = gensim.models.KeyedVectors.load('/content/drive/My Drive/Information_Retrieval/word2vec-google-news-300')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
from gensim import utils
import numpy as np

#Get vector representation for each of the sentences
def getWord2VecVector(sentence_list):
    """Embed each sentence as the sum of the vectors of its known words.

    Each sentence is tokenised with ``utils.simple_preprocess`` and every
    token is looked up in the module-level ``saved_model``. Tokens the model
    does not know are skipped, so a sentence without any known token is
    represented by an all-zero vector.

    Args:
        sentence_list: iterable of sentence strings.

    Returns:
        list with one ``(1, dim)`` numpy array per sentence, in input order.
        ``dim`` is ``saved_model.vector_size`` when the model exposes it and
        300 otherwise.
    """
    # Ask the model for its dimensionality instead of assuming 300, so a
    # differently sized embedding does not silently produce zero vectors.
    vector_size = getattr(saved_model, 'vector_size', 300)

    sentence_vectors = []
    for sen in sentence_list:
        vector = np.zeros((1, vector_size))
        for word in utils.simple_preprocess(sen):
            try:
                vector += saved_model.get_vector(word)
            except KeyError:
                # Out-of-vocabulary token: it contributes nothing to the sum.
                # Only KeyError is swallowed; any other failure (shape
                # mismatch, interrupt, ...) must surface instead of being hidden.
                continue
        sentence_vectors.append(vector)
    return sentence_vectors


In [0]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

LANGUAGE = "english"

def performTexRank(sen_count):
    """Summarise every article with a TextRank-style ranking and score it with ROUGE.

    For each ``(file_id, paragraph)`` in the module-level ``paragraph_texts_dict``:

    1. split the article into sentences with TextBlob,
    2. embed the sentences with ``getWord2VecVector``,
    3. build a sentence graph weighted by pairwise cosine similarity and run
       ``networkx.pagerank`` on it,
    4. keep the ``sen_count`` best-scored sentences (all of them when the
       article is shorter) and compare them with the sentences of
       ``summaries_dict[file_id]`` using sumy's ``rouge_1`` and ``rouge_2``.

    Args:
        sen_count: maximum number of sentences kept per generated summary.

    Returns:
        Tuple ``(mean rouge_1, mean rouge_2)`` over all articles.

    Raises:
        ZeroDivisionError: if ``paragraph_texts_dict`` is empty.
        KeyError: if an article has no entry in ``summaries_dict``.

    NOTE(review): the recorded runs show both scores rising towards 1 as
    ``sen_count`` grows, which suggests sumy's ROUGE is recall-oriented and
    rewards longer summaries — confirm before comparing across lengths.
    """
    # A single tokenizer serves every Sentence object; building a new one per
    # sentence is loop-invariant work.
    tokenizer = Tokenizer(LANGUAGE)

    sum_r1 = 0
    sum_r2 = 0
    count = 0
    for file_id, paragraph in paragraph_texts_dict.items():
        sentence_list = [item.raw for item in TextBlob(paragraph).sentences]
        sentence_vectors = getWord2VecVector(sentence_list)

        # Similarity matrix between sentences. One cosine_similarity call on
        # the stacked vectors replaces n*n single-pair calls; the diagonal is
        # zeroed so a sentence is not linked to itself.
        n_sentences = len(sentence_list)
        adjacency_matrix = np.zeros((n_sentences, n_sentences))
        if n_sentences > 1:
            adjacency_matrix = cosine_similarity(np.vstack(sentence_vectors))
            np.fill_diagonal(adjacency_matrix, 0.0)

        # Perform PageRank on the similarity graph.
        nx_graph = nx.from_numpy_array(adjacency_matrix)
        scores = nx.pagerank(nx_graph)

        # Rank sentence indices by score only. Sorting (score, Sentence)
        # tuples would fall back to comparing Sentence objects on a score tie,
        # which sumy's Sentence does not appear to support.
        ranked_indices = sorted(range(n_sentences), key=lambda idx: scores[idx], reverse=True)
        top_sentences = [
            sumy.models.dom.Sentence(sentence_list[idx], tokenizer)
            for idx in ranked_indices[:max(sen_count, 0)]
        ]
        summary_sentences = [
            sumy.models.dom.Sentence(item.raw, tokenizer)
            for item in TextBlob(summaries_dict[file_id]).sentences
        ]

        sum_r1 += rouge_1(top_sentences, summary_sentences)
        sum_r2 += rouge_2(top_sentences, summary_sentences)
        count += 1
    return sum_r1/count, sum_r2/count

In [27]:
# Corpus-average ROUGE scores when keeping at most 10 ranked sentences per article.
avg_r1, avg_r2 = performTexRank(sen_count=10)
print(
    "ROUGUE 1 SCORE: {} ROGUE 2 SCORE: {} ".format(avg_r1, avg_r2)
)

ROUGUE 1 SCORE: 0.9086004306758974 ROGUE 2 SCORE: 0.8368608878227614 


In [28]:
# Corpus-average ROUGE scores when keeping at most 15 ranked sentences per article.
avg_r1, avg_r2 = performTexRank(sen_count=15)
print(
    "ROUGUE 1 SCORE: {} ROGUE 2 SCORE: {} ".format(avg_r1, avg_r2)
)

ROUGUE 1 SCORE: 0.9740241512136719 ROGUE 2 SCORE: 0.9227259447807842 


In [29]:
# Corpus-average ROUGE scores when keeping at most 20 ranked sentences per article.
avg_r1, avg_r2 = performTexRank(sen_count=20)
print(
    "ROUGUE 1 SCORE: {} ROGUE 2 SCORE: {} ".format(avg_r1, avg_r2)
)

ROUGUE 1 SCORE: 0.9908023495796223 ROGUE 2 SCORE: 0.9458851520747782 


In [30]:
# Corpus-average ROUGE scores when keeping at most 25 ranked sentences per article.
avg_r1, avg_r2 = performTexRank(sen_count=25)
print(
    "ROUGUE 1 SCORE: {} ROGUE 2 SCORE: {} ".format(avg_r1, avg_r2)
)

ROUGUE 1 SCORE: 0.9967045623704421 ROGUE 2 SCORE: 0.9538487267640796 


# **LexRank**

In [0]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer  as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

def performLexRank(sen_count):
    """Summarise every article with sumy's LexRank summarizer and score it with ROUGE.

    For each ``(file_id, paragraph)`` in the module-level
    ``paragraph_texts_dict`` the article is parsed as plain text, the
    summarizer is asked for ``sen_count`` sentences, and the result is
    compared with the sentences of ``summaries_dict[file_id]`` using sumy's
    ``rouge_1`` and ``rouge_2``.

    Args:
        sen_count: sentence count passed to the summarizer for each article.

    Returns:
        Tuple ``(mean rouge_1, mean rouge_2)`` over all articles.

    Raises:
        ZeroDivisionError: if ``paragraph_texts_dict`` is empty.
        KeyError: if an article has no entry in ``summaries_dict``.
    """
    # Tokenizer, stemmer and summarizer depend only on LANGUAGE, so build them
    # once instead of once per document (and once per summary sentence).
    tokenizer = Tokenizer(LANGUAGE)
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    count = 0
    sum_r1 = 0
    sum_r2 = 0
    for file_id, paragraph in paragraph_texts_dict.items():
        parser = PlaintextParser.from_string(paragraph, tokenizer)
        top_sentences = list(summarizer(parser.document, sen_count))

        # The reference summary is split with TextBlob and wrapped in sumy
        # Sentence objects so both sides of the ROUGE call have the same type.
        summary_sentences = [
            sumy.models.dom.Sentence(item.raw, tokenizer)
            for item in TextBlob(summaries_dict[file_id]).sentences
        ]

        sum_r1 += rouge_1(top_sentences, summary_sentences)
        sum_r2 += rouge_2(top_sentences, summary_sentences)
        count += 1
    return sum_r1/count, sum_r2/count

In [42]:
# Corpus-average ROUGE scores when requesting 10 sentences per LexRank summary.
avg_r1, avg_r2 = performLexRank(sen_count=10)
print(
    "ROUGUE 1 SCORE: {} ROGUE 2 SCORE: {} ".format(avg_r1, avg_r2)
)

ROUGUE 1 SCORE: 0.8061285038189773 ROGUE 2 SCORE: 0.7018857387432886 


In [43]:
# Corpus-average ROUGE scores when requesting 15 sentences per LexRank summary.
avg_r1, avg_r2 = performLexRank(sen_count=15)
print(
    "ROUGUE 1 SCORE: {} ROGUE 2 SCORE: {} ".format(avg_r1, avg_r2)
)

ROUGUE 1 SCORE: 0.9334000426274172 ROGUE 2 SCORE: 0.8671580805844252 


In [44]:
# Corpus-average ROUGE scores when requesting 20 sentences per LexRank summary.
avg_r1, avg_r2 = performLexRank(sen_count=20)
print(
    "ROUGUE 1 SCORE: {} ROGUE 2 SCORE: {} ".format(avg_r1, avg_r2)
)

ROUGUE 1 SCORE: 0.976818623043103 ROGUE 2 SCORE: 0.9264571193849704 


In [45]:
# Corpus-average ROUGE scores when requesting 25 sentences per LexRank summary.
avg_r1, avg_r2 = performLexRank(sen_count=25)
print(
    "ROUGUE 1 SCORE: {} ROGUE 2 SCORE: {} ".format(avg_r1, avg_r2)
)

ROUGUE 1 SCORE: 0.9920993056726062 ROGUE 2 SCORE: 0.9472817430152631 


# **SUMMARY**

In [46]:
import pandas as pd
# One row per (method, summary length) pair; the scores are the averages
# printed by the evaluation cells, transcribed to four decimals.
data = [
    [method, sentence_count, rouge_1_score, rouge_2_score]
    for method, sentence_count, rouge_1_score, rouge_2_score in (
        ('TexRank', 10, 0.9086, 0.8368),
        ('TexRank', 15, 0.9740, 0.9227),
        ('TexRank', 20, 0.9908, 0.9458),
        ('TexRank', 25, 0.9967, 0.9538),
        ('LexRank', 10, 0.8061, 0.7018),
        ('LexRank', 15, 0.9334, 0.8671),
        ('LexRank', 20, 0.9768, 0.9264),
        ('LexRank', 25, 0.9920, 0.9472),
    )
]

pd.DataFrame(data, columns=['METHOD','SENTENCE COUNT','ROUGUE 1','ROUGUE 2'])

Unnamed: 0,METHOD,SENTENCE COUNT,ROUGUE 1,ROUGUE 2
0,TexRank,10,0.9086,0.8368
1,TexRank,15,0.974,0.9227
2,TexRank,20,0.9908,0.9458
3,TexRank,25,0.9967,0.9538
4,LexRank,10,0.8061,0.7018
5,LexRank,15,0.9334,0.8671
6,LexRank,20,0.9768,0.9264
7,LexRank,25,0.992,0.9472
