In [1]:
# Data
import json
# Data Analysis
import numpy as np
import pandas as pd
# Extractive Summarizer
from models.extractive_summarizer import ExtractiveSummarizer
# Encoders for Extractive Summarizer
import models.Encoders.Word2Vec as Word2Vecencoder
import models.Encoders.TFIDF as TFIDFencoder
# Rankers for Extractive Summarizer
import models.Rankers.LogisticRegression as LogisticRegressionranker
import models.Rankers.LinearRegression as LinearRegressionranker
# Decoders for Extractive Summarizer
# NOTE(review): the three imports below all load the same module
# (models.Decoders.PureScore), so the names PureRougeSearch and
# GuidedRougeSearch are just additional aliases of the PureScore module.
# Every experiment that passes decoder=GuidedRougeSearch or
# decoder=PureRougeSearch therefore runs with that module. Presumably the last
# two lines were meant to import their own decoder modules — TODO confirm
# against the models/Decoders package and fix the module paths.
import models.Decoders.PureScore as PureScore
import models.Decoders.PureScore as PureRougeSearch
import models.Decoders.PureScore as GuidedRougeSearch
# Baselines
import models.ModelLeadN as LeadN
import models.ModelRandom as Random
# Evaluator
from evaluation.rouge_evaluator import RougeEvaluator

In [2]:
# Data
def load_json(path):
    """Parse and return the contents of the JSON file at `path`.

    The file is opened with an explicit UTF-8 encoding so that decoding does
    not depend on the platform's locale default.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Locations of the dataset splits, relative to the notebook's working directory.
test_PATH = 'data/test.json'
train_greedy_sent_PATH = 'data/train.greedy_sent.json'
train_PATH = 'data/train.json'
validation_PATH = 'data/validation.json'
very_small_validation_PATH = 'data/very_small_validation.json'

# One parsed object per split; names are kept as-is because later cells use them.
test_data = load_json(test_PATH)
train_greedy_sent_data = load_json(train_greedy_sent_PATH)
train_data = load_json(train_PATH)
validation_data = load_json(validation_PATH)
very_small_validation_data = load_json(very_small_validation_PATH)

In [7]:
# Table A.1: Random
# Baseline summarizer from models.ModelRandom. The evaluation result is also
# bound to `a` so the comparison table later in the notebook can use it.
summarizer = Random.ExtractiveSummarizer()
pred_data = summarizer.run_extractive_summarization(
    train_greedy_sent_data,
    test_data,
)
# Evaluate the predictions against test_data with a fresh RougeEvaluator.
results = summarizer.eval(RougeEvaluator(), test_data, pred_data)
a = results

Validating data shape: 100%|██████████| 10000/10000 [00:00<00:00, 3707835.93it/s]
Running extractive summarizer: 100%|█████████▉| 999/1000 [00:00<00:00, 46875.00it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2741375.16it/s]


In [8]:
# Table A.2: LeadN
# Baseline summarizer from models.ModelLeadN. The evaluation result is also
# bound to `b` so the comparison table later in the notebook can use it.
summarizer = LeadN.ExtractiveSummarizer()
pred_data = summarizer.run_extractive_summarization(
    train_greedy_sent_data,
    test_data,
)
# Evaluate the predictions against test_data with a fresh RougeEvaluator.
results = summarizer.eval(RougeEvaluator(), test_data, pred_data)
b = results

Validating data shape: 100%|██████████| 10000/10000 [00:00<00:00, 4006403.67it/s]
Running extractive summarizer: 100%|█████████▉| 999/1000 [00:00<00:00, 54794.16it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2644580.08it/s]


In [None]:
# Table A.3: TFIDFencoder + LogisticRegressionranker + GuidedRougeSearch
# Averages the evaluation result over `num_experiments` independent runs.
# Each run contributes score / num_experiments, so the accumulated value is the
# mean across runs.
num_experiments = 1  # NOTE(review): original read `1#0`, presumably 10 for the full experiment — confirm
results = None
for i in range(num_experiments):
    # A new summarizer is built for every run so the runs do not share state.
    summarizer = ExtractiveSummarizer(
        encoder=TFIDFencoder,
        ranker=LogisticRegressionranker,
        decoder=GuidedRougeSearch,
    )
    pred_data = summarizer.run_extractive_summarization(train_greedy_sent_data, test_data)
    # Evaluate exactly once per run. The first run initialises the accumulator
    # explicitly instead of relying on `None += ...` raising inside a bare
    # `except:`, which would also hide real failures (and Ctrl-C) and silently
    # repeat the evaluation.
    run_scores = summarizer.eval(RougeEvaluator(), test_data, pred_data) / num_experiments
    results = run_scores if results is None else results + run_scores
results

In [None]:
# Table C.1: TFIDFencoder + LogisticRegressionranker + PureScore
summarizer = ExtractiveSummarizer(
    encoder=TFIDFencoder,
    ranker=LogisticRegressionranker,
    decoder=PureScore,
)
pred_data = summarizer.run_extractive_summarization(
    train_greedy_sent_data,
    test_data,
)
# Evaluate the predictions against test_data; the last line displays the result.
results = summarizer.eval(RougeEvaluator(), test_data, pred_data)
results

In [None]:
# Table C.2: TFIDFencoder + LinearRegression + GuidedRougeSearch
summarizer = ExtractiveSummarizer(
    encoder=TFIDFencoder,
    ranker=LinearRegressionranker,
    decoder=GuidedRougeSearch,
)
pred_data = summarizer.run_extractive_summarization(
    train_greedy_sent_data,
    test_data,
)
# Evaluate the predictions against test_data; the last line displays the result.
results = summarizer.eval(RougeEvaluator(), test_data, pred_data)
results

In [None]:
# Table C.3: Word2Vecencoder + LogisticRegressionranker + GuidedRougeSearch
summarizer = ExtractiveSummarizer(
    encoder=Word2Vecencoder,
    ranker=LogisticRegressionranker,
    decoder=GuidedRougeSearch,
)
pred_data = summarizer.run_extractive_summarization(
    train_greedy_sent_data,
    test_data,
)
# Evaluate the predictions against test_data; the last line displays the result.
results = summarizer.eval(RougeEvaluator(), test_data, pred_data)
results

In [None]:
# Table C.4: Word2Vecencoder + LinearRegression + GuidedRougeSearch
summarizer = ExtractiveSummarizer(
    encoder=Word2Vecencoder,
    ranker=LinearRegressionranker,
    decoder=GuidedRougeSearch,
)
pred_data = summarizer.run_extractive_summarization(
    train_greedy_sent_data,
    test_data,
)
# Evaluate the predictions against test_data; the last line displays the result.
results = summarizer.eval(RougeEvaluator(), test_data, pred_data)
results

In [9]:
# Table C.5: Word2Vecencoder + LinearRegression + PureScore
# The evaluation result is also bound to `c` so the comparison table later in
# the notebook can use it.
summarizer = ExtractiveSummarizer(
    encoder=Word2Vecencoder,
    ranker=LinearRegressionranker,
    decoder=PureScore,
)
pred_data = summarizer.run_extractive_summarization(
    train_greedy_sent_data,
    test_data,
)
# Evaluate the predictions against test_data with a fresh RougeEvaluator.
results = summarizer.eval(RougeEvaluator(), test_data, pred_data)
c = results

Validating data shape: 100%|██████████| 10000/10000 [00:00<00:00, 3745248.68it/s]


Constructing Word Vectors


Running X_2_VectorList: 100%|██████████| 10000/10000 [00:15<00:00, 625.04it/s]


Training Completed


Running extractive summarizer: 100%|█████████▉| 999/1000 [00:02<00:00, 360.24it/s]
100%|██████████| 1000/1000 [00:00<00:00, 183381.60it/s]


In [11]:
# Stack the stored results (a, b, c) into one table, labelling each with its
# model name as the outer index level, and round to three decimals for display.
pd.concat(
    [a, b, c],
    keys=['Random', 'LeadN', 'Word2Vecencoder + LinearRegression'],
).round(3)

Unnamed: 0,Unnamed: 1,Recall,Precision,F1-score
Random,rouge-1,0.294,0.264,0.278
Random,rouge-2,0.077,0.068,0.072
Random,rouge-4,0.023,0.021,0.022
Random,rouge-l,0.243,0.221,0.231
LeadN,rouge-1,0.454,0.321,0.376
LeadN,rouge-2,0.185,0.13,0.153
LeadN,rouge-4,0.067,0.048,0.056
LeadN,rouge-l,0.38,0.269,0.315
Word2Vecencoder + LinearRegression,rouge-1,0.426,0.274,0.334
Word2Vecencoder + LinearRegression,rouge-2,0.152,0.099,0.12


In [None]:
# Table C.6: TFIDFencoder + LinearRegression + PureRougeSearch
summarizer = ExtractiveSummarizer(
    encoder=TFIDFencoder,
    ranker=LinearRegressionranker,
    decoder=PureRougeSearch,
)
pred_data = summarizer.run_extractive_summarization(
    train_greedy_sent_data,
    test_data,
)
# Evaluate the predictions against test_data; the last line displays the result.
results = summarizer.eval(RougeEvaluator(), test_data, pred_data)
results