In [4]:
import numpy as np 
import pandas as pd 
import os

from nltk.corpus import stopwords
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

In [5]:
classes = os.listdir('BBC News Summary/BBC News Summary/News Articles/')
art_dir = 'BBC News Summary/BBC News Summary/News Articles/'
sum_dir = 'BBC News Summary/BBC News Summary/Summaries/'

In [6]:
articles = []
summaries = []
file_arr = []
for cla in classes:
    files = os.listdir(art_dir + cla)
    for file in files:
        article_file_path = art_dir + cla + '/' + file
        summary_file_path = sum_dir + cla + '/' + file
        try:
            with open (article_file_path,'r') as f:
                articles.append('.'.join([line.rstrip() for line in f.readlines()]))
            with open (summary_file_path,'r') as f:
                summaries.append('.'.join([line.rstrip() for line in f.readlines()]))
            file_arr.append(cla + '/' + file)
        except:
            pass
        
df = pd.DataFrame({'File_path':file_arr,'Articles': articles,'Reference Summaries':summaries})
df.head()

Unnamed: 0,File_path,Articles,Reference Summaries
0,business/001.txt,Ad sales boost Time Warner profit..Quarterly p...,TimeWarner said fourth quarter sales rose 2% t...
1,business/002.txt,Dollar gains on Greenspan speech..The dollar h...,The dollar has hit its highest level against t...
2,business/003.txt,Yukos unit buyer faces loan claim..The owners ...,Yukos' owner Menatep Group says it will ask Ro...
3,business/004.txt,High fuel prices hit BA's profits..British Air...,"Rod Eddington, BA's chief executive, said the ..."
4,business/005.txt,Pernod takeover talk lifts Domecq..Shares in U...,Pernod has reduced the debt it took on to fund...


In [7]:
def summarize(text):     
    SENTENCES_COUNT = 3
    language = 'english'
#LEXRANK stop word removal, tf-to-idf conversions, score calculations, and matrix generation for cosine similarity models
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    summary = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary.append(str(sentence))

    return " ".join(summary)

In [8]:
df["Sumy Summary"] = df["Articles"].apply(summarize)
df.head()

Unnamed: 0,File_path,Articles,Reference Summaries,Sumy Summary
0,business/001.txt,Ad sales boost Time Warner profit..Quarterly p...,TimeWarner said fourth quarter sales rose 2% t...,Ad sales boost Time Warner profit..Quarterly p...
1,business/002.txt,Dollar gains on Greenspan speech..The dollar h...,The dollar has hit its highest level against t...,Dollar gains on Greenspan speech..The dollar h...
2,business/003.txt,Yukos unit buyer faces loan claim..The owners ...,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim..The owners ...
3,business/004.txt,High fuel prices hit BA's profits..British Air...,"Rod Eddington, BA's chief executive, said the ...",BA's profits were still better than market exp...
4,business/005.txt,Pernod takeover talk lifts Domecq..Shares in U...,Pernod has reduced the debt it took on to fund...,Pernod takeover talk lifts Domecq..Shares in U...


In [9]:
df['Articles']

0       Ad sales boost Time Warner profit..Quarterly p...
1       Dollar gains on Greenspan speech..The dollar h...
2       Yukos unit buyer faces loan claim..The owners ...
3       High fuel prices hit BA's profits..British Air...
4       Pernod takeover talk lifts Domecq..Shares in U...
                              ...                        
2220    BT program to beat dialler scams..BT is introd...
2221    Spam e-mails tempt net shoppers..Computer user...
2222    Be careful how you code..A new European direct...
2223    US cyber security chief resigns..The man makin...
2224    Losing yourself in online gaming..Online role ...
Name: Articles, Length: 2225, dtype: object

In [10]:
alldocs = df
business = df.iloc[:510,:]
entertainment = df.iloc[510:896,:]
politics = df.iloc[896:1313,:]
sport = df.iloc[1313:1824,:]
tech = df.iloc[1824:]

In [20]:
len(business['Articles'][6])

1668

In [21]:
len(business['Sumy Summary'][6])

547

In [13]:
from rouge import Rouge

In [18]:
class rougetester:
    def __init__(self,category):
        rouge = Rouge()
        self.scores = rouge.get_scores(category["Reference Summaries"], category["Sumy Summary"])
        df_result = pd.DataFrame(self.scores)
        
    def calculate(self):
        rouge_1 = { 'r': 0, 'p': 0, 'f': 0 }
        rouge_2 = { 'r': 0, 'p': 0, 'f': 0 }
        rouge_l = { 'r': 0, 'p': 0, 'f': 0 }
        total_len = len(self.scores)
        results = []

        for score in self.scores:
            rouge_1['r'] += score['rouge-1']['r']
            rouge_1['p'] += score['rouge-1']['p']
            rouge_1['f'] += score['rouge-1']['f']
            rouge_2['r'] += score['rouge-2']['r']
            rouge_2['p'] += score['rouge-2']['p']
            rouge_2['f'] += score['rouge-2']['f']
            rouge_l['r'] += score['rouge-l']['r']
            rouge_l['p'] += score['rouge-l']['p']
            rouge_l['f'] += score['rouge-l']['f']

        for total_score in [rouge_1, rouge_2, rouge_l]:
            for key in total_score:
                total_score[key] = total_score[key] / total_len
            results.append(total_score)

        df_avg_result = pd.DataFrame({ 'rouge-1': results[0], 'rouge-2': results[1], 'rouge-l': results[2] })
        print(df_avg_result.head())
        print("---------------------------------")



In [19]:
rgb = rougetester(business)
rge = rougetester(entertainment)
rgp = rougetester(politics)
rgs = rougetester(sport)
rgt = rougetester(tech)
rgo = rougetester(alldocs)
print("BUSINESS")
rgb.calculate()
print("ENTERTAINMENT")
rge.calculate()
print("POLITICS")
rgp.calculate()
print("SPORT")
rgs.calculate()
print("TECH")
rgt.calculate()
print("OVERALL")
rgo.calculate()

BUSINESS
    rouge-1   rouge-2   rouge-l
r  0.696415  0.593478  0.686719
p  0.541075  0.431850  0.533879
f  0.593722  0.482794  0.585664
---------------------------------
ENTERTAINMENT
    rouge-1   rouge-2   rouge-l
r  0.687092  0.581070  0.677291
p  0.534250  0.428697  0.526647
f  0.581348  0.472662  0.573030
---------------------------------
POLITICS
    rouge-1   rouge-2   rouge-l
r  0.733731  0.623941  0.722014
p  0.472477  0.362401  0.465062
f  0.558062  0.440733  0.549269
---------------------------------
SPORT
    rouge-1   rouge-2   rouge-l
r  0.707900  0.600565  0.698872
p  0.503543  0.399796  0.496987
f  0.563653  0.453316  0.556339
---------------------------------
TECH
    rouge-1   rouge-2   rouge-l
r  0.759541  0.644571  0.747873
p  0.431689  0.318370  0.424964
f  0.530401  0.406350  0.522215
---------------------------------
OVERALL
    rouge-1   rouge-2   rouge-l
r  0.715806  0.607871  0.705511
p  0.498701  0.390474  0.491625
f  0.566574  0.452606  0.558482
-----------