# Importing

In [1]:
import pandas as pd
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sklearn.model_selection import train_test_split

In [2]:
from main import replace_dots, preprocess_text, summarizer, calc_rouge, avg_rouge, generate_ensemble_summary

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Reading Data from CSV

In [3]:
# Load the BBC news articles together with their reference summaries.
# Source: https://www.kaggle.com/datasets/pariza/bbc-news-summary/data
data = pd.read_csv("./data/bbc_data.csv") #source: https://www.kaggle.com/datasets/pariza/bbc-news-summary/data

In [4]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,label,text,summary
0,business,UK economy facing 'major risks'..The UK manufa...,"""Despite some positive news for the export sec..."
1,business,Aids and climate top Davos agenda..Climate cha...,"At the same time, about 100,000 people are exp..."
2,business,Asian quake hits European shares..Shares in Eu...,The unfolding scale of the disaster in south A...
3,business,India power shares jump on debut..Shares in In...,"Shares in India's largest power producer, Nati..."
4,business,Lacroix label bought by US firm..Luxury goods ...,LVMH said the French designer's haute couture ...


In [5]:
# (rows, columns) of the loaded frame.
data.shape

(2225, 3)

# Data Preprocessing

In [6]:
# Look at one article (row label 1) before the dot clean-up is applied.
data.loc[1, 'text']

'Aids and climate top Davos agenda..Climate change and the fight against Aids are leading the list of concerns for the first day of the World Economic Forum in the Swiss resort of Davos...Some 2,000 business and political leaders from around the globe will listen to UK Prime Minister Tony Blair\'s opening speech on Wednesday. Mr Blair will focus on Africa\'s development plans and global warming. Earlier in the day came an update on efforts to have 3 million people on anti-Aids drugs by the end of 2005. The World Health Organisation (WHO) said 700,000 people in poor countries were on life-extending drugs - up from 440,000 six months earlier but amounting to only 12% of the 5.8 million who needed them. A $2bn "funding gap" still stood in the way of hitting the 2005 target, the WHO said...The themes to be stressed by Mr Blair - whose attendance was announced at the last minute - are those he wants to dominate the UK\'s chairmanship of the G8 group of industrialised states. Other issues to

In [7]:
# replace_dots replaces sequences of dots with a single dot followed by a space.
# It also ensures that there is a space after a dot that is preceded by a digit.
data['text'] = data['text'].apply(replace_dots)

In [8]:
# Re-display the same article (row label 1) to check its current contents.
data.loc[1, 'text']

'Aids and climate top Davos agenda. Climate change and the fight against Aids are leading the list of concerns for the first day of the World Economic Forum in the Swiss resort of Davos. Some 2,000 business and political leaders from around the globe will listen to UK Prime Minister Tony Blair\'s opening speech on Wednesday.  Mr Blair will focus on Africa\'s development plans and global warming.  Earlier in the day came an update on efforts to have 3 million people on anti-Aids drugs by the end of 2005.  The World Health Organisation (WHO) said 700,000 people in poor countries were on life-extending drugs - up from 440,000 six months earlier but amounting to only 12% of the 5.8 million who needed them.  A $2bn "funding gap" still stood in the way of hitting the 2005 target, the WHO said. The themes to be stressed by Mr Blair - whose attendance was announced at the last minute - are those he wants to dominate the UK\'s chairmanship of the G8 group of industrialised states.  Other issues

In [9]:
# Run the reference summaries through the same replace_dots helper.
data['summary'] = data['summary'].apply(replace_dots)

In [10]:
# Run the preprocess_text helper (imported from main) over the whole frame.
# NOTE(review): `data` is rebound to its own transform, so re-running this cell
# applies the preprocessing again — confirm preprocess_text is idempotent.
data = preprocess_text(data)

In [11]:
# Preview the frame after preprocessing.
data.head()

Unnamed: 0,label,text,summary
0,business,UK economy facing 'major risks'. The UK manufa...,"""Despite some positive news for the export sec..."
1,business,Aids and climate top Davos agenda. Climate cha...,"At the same time, about 100,000 people are exp..."
2,business,Asian quake hits European shares. Shares in Eu...,The unfolding scale of the disaster in south A...
3,business,India power shares jump on debut. Shares in In...,"Shares in India's largest power producer, Nati..."
4,business,Lacroix label bought by US firm. Luxury goods ...,LVMH said the French designer's haute couture ...


# Train-Test Split

In [12]:
# Hold out 20% of the articles, stratified on the category label so both
# splits keep the same label proportions; the seed is fixed for reproducibility.
y = data['label']
X = data.drop(columns=['label'])
train, test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [13]:
# From here on `data` refers to the training split only.
# reset_index returns a new frame, so rebinding the names avoids mutating the
# split results in place and gives both frames a fresh 0..n-1 index.
data = train.reset_index(drop=True)
test = test.reset_index(drop=True)
# NOTE(review): y_train / y_test are not re-indexed here, so they no longer
# line up by index label with `data` / `test`.

# Base Extractive Summarization Techniques

## 1) LSA Summarizer

In [14]:
# Summarise every article with summarizer's default algorithm (LSA, per the
# section heading). The 3 is presumably the sentence count — confirm in main.
data['lsa_summary'] = data['text'].apply(summarizer, args=(3,))

In [15]:
# Preview the frame with the new lsa_summary column.
data.head()

Unnamed: 0,text,summary,lsa_summary
0,Fiat chief takes steering wheel. The chief exe...,"Mr Marchionne, who only joined the company las...","As part of a major restructuring, Fiat is to i..."
1,Johnson uncertain about Euro bid. Jade Johnson...,"""It's the stress,"" said Johnson. Jade Johnson ...",Jade Johnson is undecided about whether to con...
2,Cage film's third week at US top. Nicolas Cage...,"National Treasure, which sees Cage's character...",The Polar Express entered in third place while...
3,McConnell details Scots wave toll. At least th...,At least three people from Scotland died in th...,Mr McConnell went on to promise that Scotland ...
4,Ministers lose slopping out case. The Scottish...,Executive ministers raised an appeal arguing t...,"Napier said that the practice, where prisoners..."


### ROUGE Scores

In [16]:
# Score the LSA summaries against the reference summaries, then tag the three
# result columns with the method name so they stay unique when joined later.
results_lsa = calc_rouge(data, 'summary', 'lsa_summary')
results_lsa.columns = [
    'rouge-1_lsa',
    'rouge-2_lsa',
    'rouge-l_lsa',
]

#### Average

In [17]:
# Average the per-article LSA scores and report them.
avg_r1, avg_r2, avg_rL = avg_rouge(
    results_lsa,
    'rouge-1_lsa',
    'rouge-2_lsa',
    'rouge-l_lsa',
)
print('LSA')
print('Average rouge-1 score: ', avg_r1)
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

LSA
Average rouge-1 score:  0.6374854982722293
Average rouge-2 score:  0.49569814976884197
Average rouge-l score:  0.6216293821422859


In [18]:
# Keep the LSA ROUGE-L average: avg_rL is overwritten by the next method,
# and this value is used later to weight the ensemble.
avg_rogue_lsa = avg_rL

## 2) LexRank Summarizer

In [19]:
# Same summarizer helper, this time with sumy's LexRankSummarizer passed in.
# The 3 is presumably the sentence count — confirm in main.
data['lexrank_summary'] = data['text'].apply(summarizer, args=(3, LexRankSummarizer))

In [20]:
# Preview the frame with the new lexrank_summary column.
data.head()

Unnamed: 0,text,summary,lsa_summary,lexrank_summary
0,Fiat chief takes steering wheel. The chief exe...,"Mr Marchionne, who only joined the company las...","As part of a major restructuring, Fiat is to i...",Sergio Marchionne has replaced Herbert Demel a...
1,Johnson uncertain about Euro bid. Jade Johnson...,"""It's the stress,"" said Johnson. Jade Johnson ...",Jade Johnson is undecided about whether to con...,"Johnson uncertain about Euro bid. ""I don't wan..."
2,Cage film's third week at US top. Nicolas Cage...,"National Treasure, which sees Cage's character...",The Polar Express entered in third place while...,Cage film's third week at US top. Nicolas Cage...
3,McConnell details Scots wave toll. At least th...,At least three people from Scotland died in th...,Mr McConnell went on to promise that Scotland ...,He formally expressed Scotland's sympathy for ...
4,Ministers lose slopping out case. The Scottish...,Executive ministers raised an appeal arguing t...,"Napier said that the practice, where prisoners...","Ministers lose slopping out case. On Thursday,..."


### ROUGE Scores

In [21]:
# Score the LexRank summaries against the reference summaries and tag the
# three result columns with the method name.
results_lexrank = calc_rouge(data, 'summary', 'lexrank_summary')
results_lexrank.columns = [
    'rouge-1_lexrank',
    'rouge-2_lexrank',
    'rouge-l_lexrank',
]

#### Average

In [22]:
# Average the per-article LexRank scores and report them.
avg_r1, avg_r2, avg_rL = avg_rouge(
    results_lexrank,
    'rouge-1_lexrank',
    'rouge-2_lexrank',
    'rouge-l_lexrank',
)
print("LexRank")
print('Average rouge-1 score: ', avg_r1)
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

LexRank
Average rouge-1 score:  0.7945737920338466
Average rouge-2 score:  0.6919515595826836
Average rouge-l score:  0.7830033923614866


In [23]:
# Keep the LexRank ROUGE-L average: avg_rL is overwritten by the next method,
# and this value is used later to weight the ensemble.
avg_rogue_lexrank = avg_rL

## 3) Luhn Summarizer

In [24]:
# Same summarizer helper, this time with sumy's LuhnSummarizer passed in.
# The 3 is presumably the sentence count — confirm in main.
data['luhn_summary'] = data['text'].apply(summarizer, args=(3, LuhnSummarizer))

In [25]:
# Preview the frame with the new luhn_summary column.
data.head()

Unnamed: 0,text,summary,lsa_summary,lexrank_summary,luhn_summary
0,Fiat chief takes steering wheel. The chief exe...,"Mr Marchionne, who only joined the company las...","As part of a major restructuring, Fiat is to i...",Sergio Marchionne has replaced Herbert Demel a...,Sergio Marchionne has replaced Herbert Demel a...
1,Johnson uncertain about Euro bid. Jade Johnson...,"""It's the stress,"" said Johnson. Jade Johnson ...",Jade Johnson is undecided about whether to con...,"Johnson uncertain about Euro bid. ""I don't wan...","""I will have to see how I am jumping in the ne..."
2,Cage film's third week at US top. Nicolas Cage...,"National Treasure, which sees Cage's character...",The Polar Express entered in third place while...,Cage film's third week at US top. Nicolas Cage...,Cage film's third week at US top. Nicolas Cage...
3,McConnell details Scots wave toll. At least th...,At least three people from Scotland died in th...,Mr McConnell went on to promise that Scotland ...,He formally expressed Scotland's sympathy for ...,At least three people from Scotland died in th...
4,Ministers lose slopping out case. The Scottish...,Executive ministers raised an appeal arguing t...,"Napier said that the practice, where prisoners...","Ministers lose slopping out case. On Thursday,...","On Thursday, the Court of Session threw out a ..."


### ROUGE Scores

In [26]:
# Score the Luhn summaries against the reference summaries and tag the three
# result columns with the method name.
results_luhn = calc_rouge(data, 'summary', 'luhn_summary')
results_luhn.columns = [
    'rouge-1_luhn',
    'rouge-2_luhn',
    'rouge-l_luhn',
]

#### Average

In [27]:
# Average the per-article Luhn scores and report them.
avg_r1, avg_r2, avg_rL = avg_rouge(results_luhn, 'rouge-1_luhn', 'rouge-2_luhn', 'rouge-l_luhn')
print("LUHN")
print('Average rouge-1 score: ', avg_r1)
print('Average rouge-2 score: ', avg_r2)
# Label fixed: the metric is ROUGE ("rogue" was a typo); spacing now matches the lines above.
print('Average rouge-l score: ', avg_rL)

LUHN
Average rouge-1 score:  0.7979187836783583
Average rouge-2 score:  0.7079597331155932
Average rogue-l score: 0.7876259116572806


In [28]:
# Keep the Luhn ROUGE-L average: avg_rL is overwritten by the next method,
# and this value is used later to weight the ensemble.
avg_rogue_luhn = avg_rL

## 4) SumBasic Summarizer

In [29]:
# Same summarizer helper, this time with sumy's SumBasicSummarizer passed in.
# The 3 is presumably the sentence count — confirm in main.
data['sum_basic_summary'] = data['text'].apply(summarizer, args=(3, SumBasicSummarizer))

In [30]:
# Preview the frame with the new sum_basic_summary column.
data.head()

Unnamed: 0,text,summary,lsa_summary,lexrank_summary,luhn_summary,sum_basic_summary
0,Fiat chief takes steering wheel. The chief exe...,"Mr Marchionne, who only joined the company las...","As part of a major restructuring, Fiat is to i...",Sergio Marchionne has replaced Herbert Demel a...,Sergio Marchionne has replaced Herbert Demel a...,Sergio Marchionne has replaced Herbert Demel a...
1,Johnson uncertain about Euro bid. Jade Johnson...,"""It's the stress,"" said Johnson. Jade Johnson ...",Jade Johnson is undecided about whether to con...,"Johnson uncertain about Euro bid. ""I don't wan...","""I will have to see how I am jumping in the ne...",And the Commonwealth and European silver medal...
2,Cage film's third week at US top. Nicolas Cage...,"National Treasure, which sees Cage's character...",The Polar Express entered in third place while...,Cage film's third week at US top. Nicolas Cage...,Cage film's third week at US top. Nicolas Cage...,Nicolas Cage movie National Treasure has toppe...
3,McConnell details Scots wave toll. At least th...,At least three people from Scotland died in th...,Mr McConnell went on to promise that Scotland ...,He formally expressed Scotland's sympathy for ...,At least three people from Scotland died in th...,McConnell details Scots wave toll. The Scottis...
4,Ministers lose slopping out case. The Scottish...,Executive ministers raised an appeal arguing t...,"Napier said that the practice, where prisoners...","Ministers lose slopping out case. On Thursday,...","On Thursday, the Court of Session threw out a ...","Ministers lose slopping out case. On Thursday,..."


### ROUGE Scores

In [31]:
# Score the SumBasic summaries against the reference summaries and tag the
# three result columns with the method name.
results_sum_basic = calc_rouge(data, 'summary', 'sum_basic_summary')
results_sum_basic.columns = [
    'rouge-1_sum_basic',
    'rouge-2_sum_basic',
    'rouge-l_sum_basic',
]

#### Average

In [32]:
# Average the per-article SumBasic scores and report them.
avg_r1, avg_r2, avg_rL = avg_rouge(results_sum_basic, 'rouge-1_sum_basic', 'rouge-2_sum_basic', 'rouge-l_sum_basic')
print('SumBasic')
print('Average rouge-1 score: ', avg_r1)
# Labels fixed: the metric is ROUGE ("rogue" was a typo); spacing now matches the line above.
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

SumBasic
Average rouge-1 score:  0.6773055612940571
Average rogue-2 score: 0.49114968798922354
Average rogue-l score: 0.6579374370653842


In [33]:
# Keep the SumBasic ROUGE-L average: avg_rL is reassigned again further down,
# and this value is used later to weight the ensemble.
avg_rogue_sb = avg_rL

# Joining Tables

In [34]:
# Put the four per-method score tables side by side, then attach them to the
# training frame column-wise (rows are matched on the index).
results = pd.concat(
    [results_lsa, results_lexrank, results_luhn, results_sum_basic],
    axis="columns",
)
data_con = pd.concat([data, results], axis="columns")

In [35]:
# Preview the joined frame: texts, summaries and per-method ROUGE columns.
data_con.head()

Unnamed: 0,text,summary,lsa_summary,lexrank_summary,luhn_summary,sum_basic_summary,rouge-1_lsa,rouge-2_lsa,rouge-l_lsa,rouge-1_lexrank,rouge-2_lexrank,rouge-l_lexrank,rouge-1_luhn,rouge-2_luhn,rouge-l_luhn,rouge-1_sum_basic,rouge-2_sum_basic,rouge-l_sum_basic
0,Fiat chief takes steering wheel. The chief exe...,"Mr Marchionne, who only joined the company las...","As part of a major restructuring, Fiat is to i...",Sergio Marchionne has replaced Herbert Demel a...,Sergio Marchionne has replaced Herbert Demel a...,Sergio Marchionne has replaced Herbert Demel a...,"{'r': 0.5454545454545454, 'p': 0.2970297029702...","{'r': 0.3582089552238806, 'p': 0.1655172413793...","{'r': 0.5454545454545454, 'p': 0.2970297029702...","{'r': 0.6981132075471698, 'p': 0.3663366336633...","{'r': 0.625, 'p': 0.27586206896551724, 'f': 0....","{'r': 0.6981132075471698, 'p': 0.3663366336633...","{'r': 0.5098039215686274, 'p': 0.2574257425742...","{'r': 0.3181818181818182, 'p': 0.1448275862068...","{'r': 0.5098039215686274, 'p': 0.2574257425742...","{'r': 0.7142857142857143, 'p': 0.2970297029702...","{'r': 0.5471698113207547, 'p': 0.2, 'f': 0.292...","{'r': 0.7142857142857143, 'p': 0.2970297029702..."
1,Johnson uncertain about Euro bid. Jade Johnson...,"""It's the stress,"" said Johnson. Jade Johnson ...",Jade Johnson is undecided about whether to con...,"Johnson uncertain about Euro bid. ""I don't wan...","""I will have to see how I am jumping in the ne...",And the Commonwealth and European silver medal...,"{'r': 0.7111111111111111, 'p': 0.6274509803921...","{'r': 0.68, 'p': 0.5396825396825397, 'f': 0.60...","{'r': 0.7111111111111111, 'p': 0.6274509803921...","{'r': 0.5, 'p': 0.3333333333333333, 'f': 0.399...","{'r': 0.39473684210526316, 'p': 0.238095238095...","{'r': 0.47058823529411764, 'p': 0.313725490196...","{'r': 0.2553191489361702, 'p': 0.2352941176470...","{'r': 0.034482758620689655, 'p': 0.03174603174...","{'r': 0.2127659574468085, 'p': 0.1960784313725...","{'r': 0.41379310344827586, 'p': 0.235294117647...","{'r': 0.21212121212121213, 'p': 0.111111111111...","{'r': 0.41379310344827586, 'p': 0.235294117647..."
2,Cage film's third week at US top. Nicolas Cage...,"National Treasure, which sees Cage's character...",The Polar Express entered in third place while...,Cage film's third week at US top. Nicolas Cage...,Cage film's third week at US top. Nicolas Cage...,Nicolas Cage movie National Treasure has toppe...,"{'r': 0.7833333333333333, 'p': 0.6025641025641...","{'r': 0.6935483870967742, 'p': 0.4831460674157...","{'r': 0.7833333333333333, 'p': 0.6025641025641...","{'r': 0.9302325581395349, 'p': 0.5128205128205...","{'r': 0.86, 'p': 0.48314606741573035, 'f': 0.6...","{'r': 0.9302325581395349, 'p': 0.5128205128205...","{'r': 0.9302325581395349, 'p': 0.5128205128205...","{'r': 0.86, 'p': 0.48314606741573035, 'f': 0.6...","{'r': 0.9302325581395349, 'p': 0.5128205128205...","{'r': 0.8113207547169812, 'p': 0.5512820512820...","{'r': 0.7166666666666667, 'p': 0.4831460674157...","{'r': 0.7924528301886793, 'p': 0.5384615384615..."
3,McConnell details Scots wave toll. At least th...,At least three people from Scotland died in th...,Mr McConnell went on to promise that Scotland ...,He formally expressed Scotland's sympathy for ...,At least three people from Scotland died in th...,McConnell details Scots wave toll. The Scottis...,"{'r': 0.74, 'p': 0.37373737373737376, 'f': 0.4...","{'r': 0.6842105263157895, 'p': 0.2708333333333...","{'r': 0.7, 'p': 0.35353535353535354, 'f': 0.46...","{'r': 0.828125, 'p': 0.5353535353535354, 'f': ...","{'r': 0.7974683544303798, 'p': 0.4375, 'f': 0....","{'r': 0.828125, 'p': 0.5353535353535354, 'f': ...","{'r': 1.0, 'p': 0.5656565656565656, 'f': 0.722...","{'r': 0.9861111111111112, 'p': 0.4930555555555...","{'r': 1.0, 'p': 0.5656565656565656, 'f': 0.722...","{'r': 0.6857142857142857, 'p': 0.2424242424242...","{'r': 0.631578947368421, 'p': 0.16666666666666...","{'r': 0.6571428571428571, 'p': 0.2323232323232..."
4,Ministers lose slopping out case. The Scottish...,Executive ministers raised an appeal arguing t...,"Napier said that the practice, where prisoners...","Ministers lose slopping out case. On Thursday,...","On Thursday, the Court of Session threw out a ...","Ministers lose slopping out case. On Thursday,...","{'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223}","{'r': 0.971830985915493, 'p': 0.37912087912087...","{'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223}","{'r': 0.9375, 'p': 0.22727272727272727, 'f': 0...","{'r': 0.8888888888888888, 'p': 0.1758241758241...","{'r': 0.9375, 'p': 0.22727272727272727, 'f': 0...","{'r': 1.0, 'p': 0.49242424242424243, 'f': 0.65...","{'r': 0.975609756097561, 'p': 0.43956043956043...","{'r': 1.0, 'p': 0.49242424242424243, 'f': 0.65...","{'r': 0.7666666666666667, 'p': 0.1742424242424...","{'r': 0.6666666666666666, 'p': 0.1318681318681...","{'r': 0.7333333333333333, 'p': 0.1666666666666..."


In [36]:
# List the column names of the joined frame.
data_con.columns

Index(['text', 'summary', 'lsa_summary', 'lexrank_summary', 'luhn_summary',
       'sum_basic_summary', 'rouge-1_lsa', 'rouge-2_lsa', 'rouge-l_lsa',
       'rouge-1_lexrank', 'rouge-2_lexrank', 'rouge-l_lexrank', 'rouge-1_luhn',
       'rouge-2_luhn', 'rouge-l_luhn', 'rouge-1_sum_basic',
       'rouge-2_sum_basic', 'rouge-l_sum_basic'],
      dtype='object')

# Proposed Ensemble Extractive Approach using Weighted Voting

In [37]:
# Turn each method's average ROUGE-L on the training split into a voting
# weight by dividing by the total, so the four weights sum to 1.
total_weight = avg_rogue_lsa + avg_rogue_lexrank + avg_rogue_luhn + avg_rogue_sb
weight_lsa, weight_lexrank, weight_luhn, weight_sb = (
    avg / total_weight
    for avg in (avg_rogue_lsa, avg_rogue_lexrank, avg_rogue_luhn, avg_rogue_sb)
)

# Build the ensemble summary row by row on a copy, leaving data_con untouched.
df_for_ensemble = data_con.copy()
df_for_ensemble['ensemble_summary'] = df_for_ensemble.apply(
    generate_ensemble_summary,
    axis=1,
    args=(weight_lsa, weight_lexrank, weight_luhn, weight_sb),
)






In [38]:
# Preview the frame with the new ensemble_summary column.
df_for_ensemble.head()

Unnamed: 0,text,summary,lsa_summary,lexrank_summary,luhn_summary,sum_basic_summary,rouge-1_lsa,rouge-2_lsa,rouge-l_lsa,rouge-1_lexrank,rouge-2_lexrank,rouge-l_lexrank,rouge-1_luhn,rouge-2_luhn,rouge-l_luhn,rouge-1_sum_basic,rouge-2_sum_basic,rouge-l_sum_basic,ensemble_summary
0,Fiat chief takes steering wheel. The chief exe...,"Mr Marchionne, who only joined the company las...","As part of a major restructuring, Fiat is to i...",Sergio Marchionne has replaced Herbert Demel a...,Sergio Marchionne has replaced Herbert Demel a...,Sergio Marchionne has replaced Herbert Demel a...,"{'r': 0.5454545454545454, 'p': 0.2970297029702...","{'r': 0.3582089552238806, 'p': 0.1655172413793...","{'r': 0.5454545454545454, 'p': 0.2970297029702...","{'r': 0.6981132075471698, 'p': 0.3663366336633...","{'r': 0.625, 'p': 0.27586206896551724, 'f': 0....","{'r': 0.6981132075471698, 'p': 0.3663366336633...","{'r': 0.5098039215686274, 'p': 0.2574257425742...","{'r': 0.3181818181818182, 'p': 0.1448275862068...","{'r': 0.5098039215686274, 'p': 0.2574257425742...","{'r': 0.7142857142857143, 'p': 0.2970297029702...","{'r': 0.5471698113207547, 'p': 0.2, 'f': 0.292...","{'r': 0.7142857142857143, 'p': 0.2970297029702...",Sergio Marchionne has replaced Herbert Demel a...
1,Johnson uncertain about Euro bid. Jade Johnson...,"""It's the stress,"" said Johnson. Jade Johnson ...",Jade Johnson is undecided about whether to con...,"Johnson uncertain about Euro bid. ""I don't wan...","""I will have to see how I am jumping in the ne...",And the Commonwealth and European silver medal...,"{'r': 0.7111111111111111, 'p': 0.6274509803921...","{'r': 0.68, 'p': 0.5396825396825397, 'f': 0.60...","{'r': 0.7111111111111111, 'p': 0.6274509803921...","{'r': 0.5, 'p': 0.3333333333333333, 'f': 0.399...","{'r': 0.39473684210526316, 'p': 0.238095238095...","{'r': 0.47058823529411764, 'p': 0.313725490196...","{'r': 0.2553191489361702, 'p': 0.2352941176470...","{'r': 0.034482758620689655, 'p': 0.03174603174...","{'r': 0.2127659574468085, 'p': 0.1960784313725...","{'r': 0.41379310344827586, 'p': 0.235294117647...","{'r': 0.21212121212121213, 'p': 0.111111111111...","{'r': 0.41379310344827586, 'p': 0.235294117647...","""But if I'm doing this kind of thing, then I w..."
2,Cage film's third week at US top. Nicolas Cage...,"National Treasure, which sees Cage's character...",The Polar Express entered in third place while...,Cage film's third week at US top. Nicolas Cage...,Cage film's third week at US top. Nicolas Cage...,Nicolas Cage movie National Treasure has toppe...,"{'r': 0.7833333333333333, 'p': 0.6025641025641...","{'r': 0.6935483870967742, 'p': 0.4831460674157...","{'r': 0.7833333333333333, 'p': 0.6025641025641...","{'r': 0.9302325581395349, 'p': 0.5128205128205...","{'r': 0.86, 'p': 0.48314606741573035, 'f': 0.6...","{'r': 0.9302325581395349, 'p': 0.5128205128205...","{'r': 0.9302325581395349, 'p': 0.5128205128205...","{'r': 0.86, 'p': 0.48314606741573035, 'f': 0.6...","{'r': 0.9302325581395349, 'p': 0.5128205128205...","{'r': 0.8113207547169812, 'p': 0.5512820512820...","{'r': 0.7166666666666667, 'p': 0.4831460674157...","{'r': 0.7924528301886793, 'p': 0.5384615384615...",Nicolas Cage movie National Treasure has toppe...
3,McConnell details Scots wave toll. At least th...,At least three people from Scotland died in th...,Mr McConnell went on to promise that Scotland ...,He formally expressed Scotland's sympathy for ...,At least three people from Scotland died in th...,McConnell details Scots wave toll. The Scottis...,"{'r': 0.74, 'p': 0.37373737373737376, 'f': 0.4...","{'r': 0.6842105263157895, 'p': 0.2708333333333...","{'r': 0.7, 'p': 0.35353535353535354, 'f': 0.46...","{'r': 0.828125, 'p': 0.5353535353535354, 'f': ...","{'r': 0.7974683544303798, 'p': 0.4375, 'f': 0....","{'r': 0.828125, 'p': 0.5353535353535354, 'f': ...","{'r': 1.0, 'p': 0.5656565656565656, 'f': 0.722...","{'r': 0.9861111111111112, 'p': 0.4930555555555...","{'r': 1.0, 'p': 0.5656565656565656, 'f': 0.722...","{'r': 0.6857142857142857, 'p': 0.2424242424242...","{'r': 0.631578947368421, 'p': 0.16666666666666...","{'r': 0.6571428571428571, 'p': 0.2323232323232...",Mr McConnell went on to signal that the execut...
4,Ministers lose slopping out case. The Scottish...,Executive ministers raised an appeal arguing t...,"Napier said that the practice, where prisoners...","Ministers lose slopping out case. On Thursday,...","On Thursday, the Court of Session threw out a ...","Ministers lose slopping out case. On Thursday,...","{'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223}","{'r': 0.971830985915493, 'p': 0.37912087912087...","{'r': 1.0, 'p': 0.5, 'f': 0.6666666622222223}","{'r': 0.9375, 'p': 0.22727272727272727, 'f': 0...","{'r': 0.8888888888888888, 'p': 0.1758241758241...","{'r': 0.9375, 'p': 0.22727272727272727, 'f': 0...","{'r': 1.0, 'p': 0.49242424242424243, 'f': 0.65...","{'r': 0.975609756097561, 'p': 0.43956043956043...","{'r': 1.0, 'p': 0.49242424242424243, 'f': 0.65...","{'r': 0.7666666666666667, 'p': 0.1742424242424...","{'r': 0.6666666666666666, 'p': 0.1318681318681...","{'r': 0.7333333333333333, 'p': 0.1666666666666...","On Thursday, the Court of Session threw out a ..."


### ROUGE Scores

In [39]:
# Score the ensemble summaries against the reference summaries and tag the
# three result columns with the method name.
results_ensemble = calc_rouge(df_for_ensemble, 'summary', 'ensemble_summary')
results_ensemble.columns = ['rouge-1_ensemble', 'rouge-2_ensemble', 'rouge-l_ensemble']

#### Average

In [40]:
# Average the per-article ensemble scores on the training split and report them.
avg_r1, avg_r2, avg_rL = avg_rouge(
    results_ensemble,
    'rouge-1_ensemble',
    'rouge-2_ensemble',
    'rouge-l_ensemble',
)
print("Ensemble:")
print('Average rouge-1 score: ', avg_r1)
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

Ensemble:
Average rouge-1 score:  0.8086883062709452
Average rouge-2 score:  0.7154968750151037
Average rouge-l score:  0.7981451504751497


# Testing Data

## 1) LSA

In [41]:
# LSA on the held-out test split: summarise, score against the reference
# summaries, then report the averages.
test['lsa_summary'] = test['text'].apply(lambda x: summarizer(x, 3))
results_lsa_test = calc_rouge(test, 'summary', 'lsa_summary')
results_lsa_test.columns = ['rouge-1_lsa', 'rouge-2_lsa', 'rouge-l_lsa']
# Average ROUGE scores for LSA (original note says these are recall averages —
# confirm in main.avg_rouge).
avg_r1, avg_r2, avg_rL = avg_rouge(results_lsa_test, 'rouge-1_lsa', 'rouge-2_lsa', 'rouge-l_lsa')
print('LSA Test')
print('Average rouge-1 score: ', avg_r1)
# Labels fixed: the metric is ROUGE ("rogue" was a typo); spacing now matches the line above.
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

LSA Test
Average rouge-1 score:  0.6196727464861287
Average rogue-2 score: 0.4748775367367442
Average rogue-l score: 0.602950916939744


## 2) LexRank

In [42]:
# LexRank on the held-out test split: summarise, score against the reference
# summaries, then report the averages.
test['lexrank_summary'] = test['text'].apply(lambda x: summarizer(x, 3, LexRankSummarizer))
results_lexrank_test = calc_rouge(test, 'summary', 'lexrank_summary')
results_lexrank_test.columns = ['rouge-1_lexrank', 'rouge-2_lexrank', 'rouge-l_lexrank']
# Average ROUGE scores for LexRank (original note says these are recall
# averages — confirm in main.avg_rouge).
avg_r1, avg_r2, avg_rL = avg_rouge(results_lexrank_test, 'rouge-1_lexrank', 'rouge-2_lexrank', 'rouge-l_lexrank')
print('LexRank Test')
print('Average rouge-1 score: ', avg_r1)
# Labels fixed: the metric is ROUGE ("rogue" was a typo); spacing now matches the line above.
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

LexRank Test
Average rouge-1 score:  0.7962527418490459
Average rogue-2 score: 0.6982959765943381
Average rogue-l score: 0.7858444588604405


## 3) Luhn

In [43]:
# Luhn on the held-out test split: summarise, score against the reference
# summaries, then report the averages.
test['luhn_summary'] = test['text'].apply(lambda x: summarizer(x, 3, LuhnSummarizer))
results_luhn_test = calc_rouge(test, 'summary', 'luhn_summary')
results_luhn_test.columns = ['rouge-1_luhn', 'rouge-2_luhn', 'rouge-l_luhn']
# Average ROUGE scores for Luhn (original note says these are recall averages —
# confirm in main.avg_rouge).
avg_r1, avg_r2, avg_rL = avg_rouge(results_luhn_test, 'rouge-1_luhn', 'rouge-2_luhn', 'rouge-l_luhn')
print('Luhn Test')
print('Average rouge-1 score: ', avg_r1)
# Labels fixed: the metric is ROUGE ("rogue" was a typo); spacing now matches the line above.
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

Luhn Test
Average rouge-1 score:  0.7848367380546
Average rogue-2 score: 0.6924121368032535
Average rogue-l score: 0.773225020999985


## 4) SumBasic

In [44]:
# SumBasic on the held-out test split: summarise, score against the reference
# summaries, then report the averages.
test['sum_basic_summary'] = test['text'].apply(lambda x: summarizer(x, 3, SumBasicSummarizer))
results_sum_basic_test = calc_rouge(test, 'summary', 'sum_basic_summary')
results_sum_basic_test.columns = ['rouge-1_sum_basic', 'rouge-2_sum_basic', 'rouge-l_sum_basic']
# Average ROUGE scores for SumBasic (original note says these are recall
# averages — confirm in main.avg_rouge).
avg_r1, avg_r2, avg_rL = avg_rouge(results_sum_basic_test, 'rouge-1_sum_basic', 'rouge-2_sum_basic', 'rouge-l_sum_basic')
print('SumBasic Test')
print('Average rouge-1 score: ', avg_r1)
# Labels fixed: the metric is ROUGE ("rogue" was a typo); spacing now matches the line above.
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

SumBasic Test
Average rouge-1 score:  0.6768681473382804
Average rogue-2 score: 0.4955983469338142
Average rogue-l score: 0.6573773618972109


## Ensemble Extractive Approach

In [45]:
# Attach the four per-method test score tables to the test frame column-wise.
results = pd.concat(
    [results_lsa_test, results_lexrank_test, results_luhn_test, results_sum_basic_test],
    axis="columns",
)
test_con = pd.concat([test, results], axis="columns")

# Build the ensemble summaries, reusing the weights computed on the training
# split, then score them against the reference summaries.
test_con['ensemble_summary'] = test_con.apply(
    generate_ensemble_summary,
    axis=1,
    args=(weight_lsa, weight_lexrank, weight_luhn, weight_sb),
)
results_ensemble_test = calc_rouge(test_con, 'summary', 'ensemble_summary')
results_ensemble_test.columns = [
    'rouge-1_ensemble',
    'rouge-2_ensemble',
    'rouge-l_ensemble',
]

avg_r1, avg_r2, avg_rL = avg_rouge(
    results_ensemble_test,
    'rouge-1_ensemble',
    'rouge-2_ensemble',
    'rouge-l_ensemble',
)
print("Ensemble:")
print('Average rouge-1 score: ', avg_r1)
print('Average rouge-2 score: ', avg_r2)
print('Average rouge-l score: ', avg_rL)

Ensemble:
Average rouge-1 score:  0.7972945358818847
Average rouge-2 score:  0.7041159782629272
Average rouge-l score:  0.7862432180889567
