# Packages and libraries

In [None]:
#install the following packages

!pip install sentencepiece
!pip install transformers
!pip install rouge

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 19.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 11.1 MB/s eta 0:00:01[K     |▉                               | 30 kB 8.8 MB/s eta 0:00:01[K     |█                               | 40 kB 7.7 MB/s eta 0:00:01[K     |█▍                              | 51 kB 5.8 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.8 MB/s eta 0:00:01[K     |██                              | 71 kB 5.6 MB/s eta 0:00:01[K     |██▏                             | 81 kB 6.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 6.1 MB/s eta 0:00:01[K     |██▊                             | 102 kB 5.5 MB/s eta 0:00:01[K     |███                             | 112 kB 5.5 MB/s eta 0:00:01[K     |███▎                            | 122 kB 5.5 MB/s eta 0:00:01[K     |███▌        

In [None]:
import ast
import collections
import gzip
import os
import re
import string
from shutil import copyfile

import networkx
import nltk
import numpy as np
import pandas as pd
import sentencepiece
import torch
from google.colab import drive
from nltk.corpus import stopwords
from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Data

In [None]:
# Mount Google Drive, where the data_summarization.csv file was previously saved,
# and copy the file into the current working directory.
# The source path is built from the mount point, so the copy no longer depends
# on the working directory being the parent of the mount point.

DRIVE_MOUNT_POINT = '/content/gdrive'

drive.mount(DRIVE_MOUNT_POINT)
copyfile(os.path.join(DRIVE_MOUNT_POINT, 'My Drive/Progetto_TMeS/data_summarization.csv'), 'data_summarization.csv')

Mounted at /content/gdrive


'data_summarization.csv'

In [None]:
# Data can't be simply read with read_csv: without converters, clean_sentences
# and norm_sentences come back as one very long string per row instead of a
# list of strings.
# ast.literal_eval parses each cell back into a list. Unlike stripping the
# brackets and splitting on ", ", it does not cut sentences at their own commas
# and does not remove apostrophes, so position k in clean_sentences stays the
# same sentence as position k in norm_sentences.
# NOTE(review): assumes every cell of both columns is the text of a valid
# Python list literal (e.g. written by pandas from a column of lists) — confirm.

data = pd.read_csv("data_summarization.csv", converters={"clean_sentences": ast.literal_eval, "norm_sentences": ast.literal_eval})

In [None]:
# Sanity check after loading: print the (rows, columns) shape, then display the first row
print(data.shape)
data.head(1)

(2500, 6)


Unnamed: 0.1,Unnamed: 0,abstract,topic,preprocessed,clean_sentences,norm_sentences
0,0,a process for controlling a multiple stage dra...,b,"with reference to the drawings , a wire drawin...","[with reference to the drawings , a wire drawi...",[reference drawings wire drawing system machin...


# Extractive summarization - Graph based

In [None]:
# Drop unwanted columns.
# `data` is reassigned from itself, so on a second run of this cell the columns
# are already gone; errors='ignore' keeps the re-run from raising KeyError.
data = data.drop(columns=['topic', 'Unnamed: 0'], errors='ignore')

print(data.shape)
data.head(1)

(2500, 4)


Unnamed: 0,abstract,preprocessed,clean_sentences,norm_sentences
0,a process for controlling a multiple stage dra...,"with reference to the drawings , a wire drawin...","[with reference to the drawings , a wire drawi...",[reference drawings wire drawing system machin...


In [None]:
# Calculate similarity between phrases in 'norm_sentences' using tfidf vectorizer, matrices and graphs
# Obtain scores with pagerank algorithm and sort them in a new column

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)

def phrase_similarity(sentence):
  """Rank the sentences of one document with PageRank over TF-IDF similarity.

  Args:
    sentence: iterable of sentence strings belonging to a single document
      (one cell of the 'norm_sentences' column).

  Returns:
    A list of (score, index) tuples sorted by descending score, where index
    is the position of the sentence inside `sentence`.
  """
  # The shared vectorizer is re-fitted on every call, so the TF-IDF weights
  # are computed from this document's sentences only.
  dt_matrix = tv.fit_transform(sentence).toarray()

  # Dot product of every pair of sentence vectors -> square similarity matrix.
  # No rounding is applied here: np.round returns a new array, so calling it
  # without keeping the result would have no effect on the scores.
  similarity_matrix = np.matmul(dt_matrix, dt_matrix.T)

  # Create a graph based on the similarity matrix
  similarity_graph = networkx.from_numpy_array(similarity_matrix)

  # Score with pagerank and ranking
  scores = networkx.pagerank(similarity_graph)
  ranked_sentences = sorted(((score, index) for index, score
                                             in scores.items()),
                            reverse=True)

  return ranked_sentences

In [None]:
# Rank the sentences of every document; each cell of 'sim_scores' holds that document's ranking
data['sim_scores'] = data['norm_sentences'].map(phrase_similarity)

In [None]:
# Display the first row to check the 'sim_scores' column
data.head(1)

Unnamed: 0,abstract,preprocessed,clean_sentences,norm_sentences,sim_scores
0,a process for controlling a multiple stage dra...,"with reference to the drawings , a wire drawin...","[with reference to the drawings , a wire drawi...",[reference drawings wire drawing system machin...,"[(0.010969829645980863, 165), (0.0108471889984..."


In [None]:
# Estimate the average number of sentences per 'abstract' (counted as the number of '.' characters)
# to get an idea of how many sentences the extractive summary should contain

def Average(lst):
    """Return the arithmetic mean of the numbers in `lst`."""
    return sum(lst) / len(lst)

# One '.' count per abstract, in row order
abstact_len = [data['abstract'][row].count('.') for row in range(len(data))]

average = Average(abstact_len)
average

3.5384

In [None]:
# Match the indices of the best-ranked sentences in 'sim_scores' with 'clean_sentences'
# and join those sentences to obtain the final extractive summary.
# The whole column is built first and assigned once. Writing cell by cell with
# data['extractive_summary'][i] = ... is chained assignment: pandas warns about
# it (SettingWithCopyWarning) and does not guarantee that `data` is modified.

N_SUMMARY_SENTENCES = 4  # sentences per summary; abstracts contain about 3.5 on average

data['extractive_summary'] = [
  ' '.join(str(sentences[index]) for _, index in ranking[:N_SUMMARY_SENTENCES])
  for ranking, sentences in zip(data['sim_scores'], data['clean_sentences'])
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
# Show the ground-truth abstract next to the generated extractive summary for one document
example_row = 100
print('Original abstract: ' + data['abstract'][example_row])
print('Extractive summary: ' + data['extractive_summary'][example_row])

Original abstract: improved cleanability and contamination prevention are provided in a wet milling apparatus for the production of pharmaceutical grade milled products . the advantages are provided by a milling agitator that is characterized by a smooth , seamless agitating surface , without crevices or seams which might accumulate contamination and which might prevent removal of contamination during cleaning . the use of polymeric milling media reduces wear on the agitator and permits the agitator to be constructed with permanent , smooth welded joints . seamless joints are also provided on the interior of the milling chamber and sanitary , threadless fasteners are provided for the media separation screen and other milling chamber fittings .
Extractive summary: the cooling passage 50 is formed by an inner cylindrical wall 61 and an outer cylindrical wall 62 . both pivotably connected to a pivot member 108 at one of their ends . the drive shaft 11 mates with a small diameter portion o

# Abstractive summarization - Pegasus

In [None]:
# Keep only the first 200 documents, because abstractive summarization is very
# time-consuming and computationally expensive

data2 = data.iloc[:200].copy()

In [None]:
# Display the first row of the 200-document subset
data2.head(1)

Unnamed: 0,abstract,preprocessed,clean_sentences,norm_sentences,sim_scores,extractive_summary
0,a process for controlling a multiple stage dra...,"with reference to the drawings , a wire drawin...","[with reference to the drawings , a wire drawi...",[reference drawings wire drawing system machin...,"[(0.010969829645980863, 165), (0.0108471889984...",13 is preset by the product speed controller 9...


In [None]:
# Clean the dataset from unwanted columns.
# `data2` is reassigned from itself, so on a second run of this cell the columns
# are already gone; errors='ignore' keeps the re-run from raising KeyError.

data2 = data2.drop(columns=['clean_sentences', 'norm_sentences', 'sim_scores'], errors='ignore')

print(data2.shape)
data2.head(1)

(200, 3)


Unnamed: 0,abstract,preprocessed,extractive_summary
0,a process for controlling a multiple stage dra...,"with reference to the drawings , a wire drawin...",13 is preset by the product speed controller 9...


In [None]:
# Load Pegasus, a pretrained model from Google for abstractive summarization,
# using the checkpoint trained on the Big Patent dataset

model_name = 'google/pegasus-big_patent'

# Run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [None]:
# Encode -> generate -> decode helper used to run the model on the dataset

def pegasus_summarization(doc):
  """Return the Pegasus-generated summary of `doc`.

  Args:
    doc: the document text to summarize.

  Returns:
    The first decoded sequence produced by the model, with special tokens removed.
  """
  # Tokenize with truncation enabled and move the tensors to the model's device
  encoded = tokenizer(doc, truncation=True, padding='longest', return_tensors="pt")
  encoded = encoded.to(device)

  generated_ids = model.generate(**encoded)

  decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  return decoded[0]

In [None]:
# Generate an abstractive summary for every document in the subset

data2['abstractive_summary'] = data2['preprocessed'].map(pegasus_summarization)

In [None]:
# Display the first rows, including the 'abstractive_summary' column
data2.head()

Unnamed: 0,abstract,preprocessed,extractive_summary,abstractive_summary
0,a process for controlling a multiple stage dra...,"with reference to the drawings , a wire drawin...",13 is preset by the product speed controller 9...,A wire drawing system having a plurality of st...
1,"a plastic material , controlled at a desired t...",preferred embodiments according to the present...,48 and a similar adjusting member on the other...,"In an extruding machine, a temperature measuri..."
2,a box formed from a single piece of cardboard ...,reference will firstly be made to fig1 which s...,to facilitate insertion into the finished box ...,"A spread-out piece of punched, crease- lined a..."
3,"a machine for accurately cutting tiles , block...","fig1 illustrates a block , stone , tile and wo...",the silt contained within the effluent water n...,"A block, stone, tile and wood cutting apparatu..."
4,the present invention provides a rechargeable ...,an illustrative embodiment of the present inve...,a cylindrical protrusion 19 a protrudes from t...,"A housing of an impact driver includes a body,..."


In [None]:
# Compare the original abstract with both generated summaries for one document
comparison_row = 5
print('Original abstract: ' + data2['abstract'][comparison_row])
print('Extractive summary: ' + data2['extractive_summary'][comparison_row])
print('Abstractive summary: ' + data2['abstractive_summary'][comparison_row])

Original abstract: an aspect of the present invention provides an apparatus for converting images of vehicle surroundings that includes , at least one camera configured to start , upon receiving a synchronizing signal , photographing the surroundings of a vehicle and outputting image data representative of the photographs , an output memory configured to store image data to be displayed on a display installed in the vehicle , a pattern memory configured to store destination addresses of the output memory , and an image converter configured to generate the synchronizing signal , obtain the image data from the camera , and transfer part or the whole of the image data to the output memory according to the destination addresses stored in the pattern memory .
Extractive summary: and the display 5 displays the display data 40 . the output memory 4 has a bank for receiving image data and another bank for outputting display data to the display 5 . as mentioned above  the output memory 4 provid

# Summaries evaluation - Rouge

In [None]:
# Calculate rouge scores to evaluate the two methods
# Consider f-measure of rouge-1 and rouge-l
# The averages for extractive and abstractive summarization are computed from these columns afterwards
#
# Scores are collected in lists and assigned as whole columns. Writing cell by
# cell with data2[col][i] = ... is chained assignment: pandas warns about it
# (SettingWithCopyWarning) and does not guarantee that `data2` is modified.

rouge = Rouge()

rouge_columns = {
  'rouge-1_extr': [],
  'rouge-1_abst': [],
  'rouge-l_extr': [],
  'rouge-l_abst': [],
}

for reference, extractive, abstractive in zip(data2['abstract'],
                                              data2['extractive_summary'],
                                              data2['abstractive_summary']):
  # get_scores(hypothesis, reference) returns a list; its first item holds the scores for this pair
  extr_scores = rouge.get_scores(extractive, reference)[0]
  abst_scores = rouge.get_scores(abstractive, reference)[0]

  rouge_columns['rouge-1_extr'].append(extr_scores['rouge-1']['f'])
  rouge_columns['rouge-1_abst'].append(abst_scores['rouge-1']['f'])

  rouge_columns['rouge-l_extr'].append(extr_scores['rouge-l']['f'])
  rouge_columns['rouge-l_abst'].append(abst_scores['rouge-l']['f'])

for column_name, values in rouge_columns.items():
  data2[column_name] = values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Display the first rows, including the four rouge score columns
data2.head()

Unnamed: 0,abstract,preprocessed,extractive_summary,abstractive_summary,rouge-1_extr,rouge-1_abst,rouge-l_extr,rouge-l_abst
0,a process for controlling a multiple stage dra...,"with reference to the drawings , a wire drawin...",13 is preset by the product speed controller 9...,A wire drawing system having a plurality of st...,0.121212,0.195652,0.121212,0.152174
1,"a plastic material , controlled at a desired t...",preferred embodiments according to the present...,48 and a similar adjusting member on the other...,"In an extruding machine, a temperature measuri...",0.173077,0.38,0.153846,0.32
2,a box formed from a single piece of cardboard ...,reference will firstly be made to fig1 which s...,to facilitate insertion into the finished box ...,"A spread-out piece of punched, crease- lined a...",0.261905,0.215385,0.142857,0.215385
3,"a machine for accurately cutting tiles , block...","fig1 illustrates a block , stone , tile and wo...",the silt contained within the effluent water n...,"A block, stone, tile and wood cutting apparatu...",0.308642,0.206349,0.222222,0.174603
4,the present invention provides a rechargeable ...,an illustrative embodiment of the present inve...,a cylindrical protrusion 19 a protrudes from t...,"A housing of an impact driver includes a body,...",0.142857,0.409091,0.119048,0.340909


In [None]:
# Results: mean F-measure per method and metric over the evaluated documents
mean_rouge1_extr = data2['rouge-1_extr'].mean()
mean_rougel_extr = data2['rouge-l_extr'].mean()
mean_rouge1_abst = data2['rouge-1_abst'].mean()
mean_rougel_abst = data2['rouge-l_abst'].mean()

print('Average rouge-1 score (F-measure) for extrative summaries: ', mean_rouge1_extr)
print('Average rouge-l score (F-measure) for extrative summaries: ', mean_rougel_extr)

print('Average rouge-1 score (F-measure) for abstractive summaries: ', mean_rouge1_abst)
print('Average rouge-l score (F-measure) for abstractive summaries: ', mean_rougel_abst)

Average rouge-1 score (F-measure) for extrative summaries:  0.24141477316865811
Average rouge-1 score (F-measure) for abstractive summaries:  0.34001048763325337
Average rouge-l score (F-measure) for extrative summaries:  0.19807100887571008
Average rouge-l score (F-measure) for abstractive summaries:  0.2937433060929056
