# Data Collection

In [None]:
!wget https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip

--2024-11-12 15:02:40--  https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19451729 (19M) [application/zip]
Saving to: ‘scisummnet_release1.1__20190413.zip’


2024-11-12 15:02:42 (13.9 MB/s) - ‘scisummnet_release1.1__20190413.zip’ saved [19451729/19451729]



In [None]:
!unzip scisummnet_release1.1__20190413.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   creating: __MACOSX/scisummnet_release1.1__20190413/top1000_complete/C90-3030/summary/
  inflating: __MACOSX/scisummnet_release1.1__20190413/top1000_complete/C90-3030/summary/._C90-3030.gold.txt  
  inflating: __MACOSX/scisummnet_release1.1__20190413/top1000_complete/C90-3030/._summary  
  inflating: __MACOSX/scisummnet_release1.1__20190413/top1000_complete/._C90-3030  
   creating: scisummnet_release1.1__20190413/top1000_complete/P04-1036/
  inflating: scisummnet_release1.1__20190413/top1000_complete/P04-1036/citing_sentences_annotated.json  
   creating: __MACOSX/scisummnet_release1.1__20190413/top1000_complete/P04-1036/
  inflating: __MACOSX/scisummnet_release1.1__20190413/top1000_complete/P04-1036/._citing_sentences_annotated.json  
   creating: scisummnet_release1.1__20190413/top1000_complete/P04-1036/Documents_xml/
  inflating: scisummnet_release1.1__20190413/top1000_complete/P04-1036/Documents_xml/P04-1036.xml  


# Data Cleaning and Pre-processing

In [None]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources used by the stop-word, tokenizer and lemmatizer helpers below.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
  nltk.download(nltk_resource)
def remove_web_links(text):
  """Strip web links, ``Figure N:`` labels and ``@handle`` mentions from ``text``.

  The patterns are applied one after another, in the order listed, each
  replacing its matches with the empty string.

  Literal dots are escaped (``\\.``) so that, for example, ``www\\.`` only
  matches an actual "www." prefix; an unescaped ``.`` matches any character
  and would also eat ordinary words that merely contain "www".

  Args:
    text: The string to clean.

  Returns:
    ``text`` with every match of the patterns removed.
  """
  patterns = (
      r'http://www\.\w+\.org/',                  # bare http://www.<name>.org/
      r'http://www\.([\w\S]+)\.org/\w+\W\w+',    # http .org link followed by a short path
      r'https://www\.\w+\.org/',                 # bare https://www.<name>.org/
      r'https://www\.([\w\S]+)\.org/\w+\W\w+',   # https .org link followed by a short path
      r'https://\w+\.\w+/\d+\.\d+/\w\d+\W\w+',   # https://<host>.<tld>/<n>.<n>/<id> style links
      r'Figure\s\d:',                            # single-digit figure labels, e.g. "Figure 3:"
      r'\Wwww\.\w+\W\w+\W',                      # www.<name>.<tld> including the characters around it
      r'@[A-Za-z0-9]+',                          # @handles
      r'www\.\w+',                               # any remaining www.<name> fragment
  )
  for pattern in patterns:
    text = re.sub(pattern, '', text)
  return text

def remove_emojis(text):
  """Delete every run of characters that falls inside the code-point ranges below.

  Note that some of the ranges are very wide (for example
  ``\\U000024C2-\\U0001F251`` and ``\\U00010000-\\U0010ffff``), so characters
  other than emoji that lie inside them are removed as well.

  Args:
    text: The string to clean.

  Returns:
    ``text`` without the matched characters.
  """
  # Adjacent literals are concatenated into one character-class body.
  emoji_ranges = (
      u"\U0001F600-\U0001F64F"
      u"\U0001F300-\U0001F5FF"
      u"\U0001F680-\U0001F6FF"
      u"\U0001F1E0-\U0001F1FF"
      u"\U00002500-\U00002BEF"
      u"\U00002702-\U000027B0"
      u"\U00002702-\U000027B0"
      u"\U000024C2-\U0001F251"
      u"\U0001f926-\U0001f937"
      u"\U00010000-\U0010ffff"
      u"\u2640-\u2642"
      u"\u2600-\u2B55"
      u"\u200d"
      u"\u23cf"
      u"\u23e9"
      u"\u231a"
      u"\ufe0f"
      u"\u3030"
  )
  emoji_matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
  return emoji_matcher.sub('', text)

def remove_spaces(text):
  """Replace line breaks in ``text`` with a single space.

  Each run of newline / carriage-return characters becomes one space.
  Deleting the line break outright would glue the last word of one line to
  the first word of the next ("foo\\nbar" -> "foobar").

  Args:
    text: The string to clean.

  Returns:
    ``text`` with every run of line breaks collapsed to one space.
  """
  return re.sub(r'[\r\n]+', ' ', text)

def remove_stopwords(text):
  """Drop English stop words from ``text``.

  The text is split with ``word_tokenize`` and the surviving tokens are
  re-joined with single spaces, so the original spacing is not preserved.

  Tokens are compared in lower case: ``clean_and_preprocess_data`` calls this
  step before lower-casing, and the NLTK English list holds lower-case entries,
  so a case-sensitive test would let "The", "And", ... through.

  Args:
    text: The string to filter.

  Returns:
    The remaining tokens joined by ``" "``, with their original casing.
  """
  stop_words = set(stopwords.words('english'))
  kept_tokens = [token for token in word_tokenize(text) if token.lower() not in stop_words]
  return " ".join(kept_tokens)

def lemmatize_text(text):
  """Lemmatize every token of ``text`` with NLTK's ``WordNetLemmatizer``.

  The text is split into sentences, each sentence into word tokens, and the
  lemmatized tokens are joined back with single spaces.

  Args:
    text: The string to lemmatize.

  Returns:
    One space-separated string of lemmatized tokens.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = [
      lemmatizer.lemmatize(token)
      for sentence in sent_tokenize(text)
      for token in word_tokenize(sentence)
  ]
  return ' '.join(lemmas)
def lowercase_text(text):
  """Return ``text`` converted to lower case."""
  lowered = text.lower()
  return lowered

def remove_punctuations(text):
  """Delete punctuation characters from ``text``.

  Removes every character in ``string.punctuation`` plus the typographic
  apostrophe (’) and the ellipsis character (…).

  Args:
    text: The string to clean.

  Returns:
    ``text`` without those characters.
  """
  # A translation table whose third argument lists the characters to delete.
  deletion_table = str.maketrans('', '', string.punctuation + '’' + '…')
  return text.translate(deletion_table)

def remove_numbers(text):
  """Remove a leading list number ("12. ") and then every digit from ``text``.

  The leading-number pattern is applied with ``re.sub``; ``str.replace`` would
  treat the pattern as literal text and never match it.

  Args:
    text: The string to clean, or ``None``.

  Returns:
    The cleaned string, or ``None`` when ``text`` is ``None``.
  """
  if text is None:
    # Nothing to clean; return it untouched instead of failing in re.sub.
    return text
  text = re.sub(r'^\d+\.\s+', '', text)
  return re.sub("[0-9]", '', text)

def clean_and_preprocess_data(text, lowercase=True, clean_stopwords=True, clean_punctuations=True, clean_links=True,
                              clean_emojis=True, clean_spaces=True, clean_numbers=True,  lemmatize=True):
  """Run the enabled cleaning steps over ``text`` and return the result.

  Link removal runs first. The stop-word step re-joins tokens with spaces and
  the punctuation step deletes ``:``, ``/`` and ``.``; once either has run, the
  URL patterns in ``remove_web_links`` can no longer match. The other steps
  keep their relative order: stop words, punctuation, emojis, line breaks,
  numbers, lemmatization, lower-casing.

  Args:
    text: The string to clean.
    lowercase: Lower-case the final result.
    clean_stopwords: Apply ``remove_stopwords``.
    clean_punctuations: Apply ``remove_punctuations``.
    clean_links: Apply ``remove_web_links``.
    clean_emojis: Apply ``remove_emojis``.
    clean_spaces: Apply ``remove_spaces``.
    clean_numbers: Apply ``remove_numbers``.
    lemmatize: Apply ``lemmatize_text``.

  Returns:
    The cleaned text.
  """
  if clean_links:
    text = remove_web_links(text)
  if clean_stopwords:
    text = remove_stopwords(text)
  if clean_punctuations:
    text = remove_punctuations(text)
  if clean_emojis:
    text = remove_emojis(text)
  if clean_spaces:
    text = remove_spaces(text)
  if clean_numbers:
    text = remove_numbers(text)
  if lemmatize:
    text = lemmatize_text(text)
  if lowercase:
    return lowercase_text(text)
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Only the link, emoji and line-break steps stay enabled (their flags keep the
# default True); every other cleaning step is switched off for these columns.
SUMMARY_CLEANING_OPTIONS = dict(lemmatize=False, clean_numbers=False, clean_stopwords=False,
                                clean_punctuations=False, lowercase=False)

text_df = pd.read_csv("top100.csv")
for text_column in ("abstract", "full_text", "conclusion"):
  text_df[text_column] = text_df[text_column].apply(
      lambda x: clean_and_preprocess_data(x, **SUMMARY_CLEANING_OPTIONS))
print(text_df.head())
# index=False: do not write the row index as an extra column. Without it every
# save/load round trip adds another "Unnamed: N" column to the file.
text_df.to_csv("top100_cleaned.csv", index=False)

   Unnamed: 0                                           abstract  \
0           0  Word lattice decoding has proven useful in spo...   
1           1  We formulate the problem of nonprojective depe...   
2           2  We propose a cascaded linear model for joint C...   
3           3  In this paper, we propose a novel string-todep...   
4           4  We present a novel transition system for depen...   

                                           full_text  \
0  Word lattice decoding has proven useful in spo...   
1  We formulate the problem of nonprojective depe...   
2  We propose a cascaded linear model for joint C...   
3  In this paper, we propose a novel string-todep...   
4  We present a novel transition system for depen...   

                                          conclusion  
0  We have achieved substantial gains in translat...  
1  We presented new dependency parsers based on c...  
2  We proposed a cascaded linear model for Chines...  
3  In this paper, we propose a nov

In [None]:
# Reload the cleaned file from disk and show it as the cell output, to check
# what was actually written.
clean_df=pd.read_csv("top100_cleaned.csv")
clean_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,abstract,full_text,conclusion
0,0,0,Word lattice decoding has proven useful in spo...,Word lattice decoding has proven useful in spo...,We have achieved substantial gains in translat...
1,1,1,We formulate the problem of nonprojective depe...,We formulate the problem of nonprojective depe...,We presented new dependency parsers based on c...
2,2,2,We propose a cascaded linear model for joint C...,We propose a cascaded linear model for joint C...,We proposed a cascaded linear model for Chines...
3,3,3,"In this paper, we propose a novel string-todep...","In this paper, we propose a novel string-todep...","In this paper, we propose a novel string-todep..."
4,4,4,We present a novel transition system for depen...,We present a novel transition system for depen...,We have presented a novel transition system fo...
...,...,...,...,...,...
95,95,95,We present an algorithm for anaphora res- olut...,We present an algorithm for anaphora res- olut...,Quantitative evaluation shows the anaphora res...
96,96,96,"I:n this paper, we describe a new corpus-based...","I:n this paper, we describe a new corpus-based...",Prel)ositioual phrase attachment disambiguatio...
97,97,97,computation of preferthe admissible argument v...,computation of preferthe admissible argument v...,We have presented an application of topic mode...
98,98,98,"If we take an existing supervised NLP system, ...","If we take an existing supervised NLP system, ...",Word features can be learned in advance in an ...


# Model Development

In [None]:
!pip install sentencepiece
!pip install transformers
!pip install tensorflow-gpu # For CPMTokenizer
!pip install bert-extractive-summarizer

Collecting tensorflow-gpu
  Downloading tensorflow-gpu-2.12.0.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl.metadata (15 kB)
Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl (25 kB)
Installing collected packages: bert-extractive-summarizer
Successfully installed bert-extractive-s

In [None]:
import pandas as pd
import numpy as np

# Data preprocessing
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Text Summarization
from transformers import *
from summarizer import Summarizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
GroupViT models are not usable since `tensorflow_probability` can't be loaded. It seems you have `tensorflow_probability` installed with the wrong tensorflow version.Please try to reinstall it following the instructions here: https://github.co

In [None]:
class TextSummarizer:
  """Feature-extraction and summarization helpers bound to one pandas DataFrame.

  The DataFrame passed to the constructor is stored as ``self.data``. The
  public methods add their results to it as new columns (the frame is
  modified in place) and return it.
  """

  # Number of leading *characters* kept when the abstractive model fails on the
  # full input and the call is retried on a shortened text.
  _FALLBACK_MAX_CHARS = 1024

  def __init__(self, data):
    """Store ``data`` (a pandas DataFrame) for the other methods to work on."""
    self.data = data

  # Helper functions

  # Get average word length in a document
  def avg_word(self, data):
    """Return the mean length of the whitespace-separated words in ``data``.

    A tiny constant is added to the word count so an empty string yields 0.0
    instead of raising ``ZeroDivisionError``.
    """
    words = data.split()
    return sum(len(word) for word in words) / (len(words) + 0.000001)

  # Get number of punctuations in a document
  def count_punctuation(self, data):
    """Return how many characters of ``data`` are in ``string.punctuation``."""
    return sum(1 for char in data if char in string.punctuation)

  # Get optimal number of sentences for extractive summarization
  def get_optimal_number_sentences(self, data, model):
    """Return ``model.calculate_optimal_k(data, k_max=10)``."""
    return model.calculate_optimal_k(data, k_max=10)

  # Extract numerical text features
  def extract_text_features(self, text_column):
    """Add numeric descriptors of ``self.data[text_column]`` as new columns.

    Columns written: ``num_stopwords``, ``num_punctuations``, ``num_numerics``,
    ``num_words``, ``avg_word_length`` and ``stopwords_to_words_ratio``.

    Args:
      text_column: Name of the column holding the documents (strings).

    Returns:
      ``self.data`` with the new columns.
    """
    # A set gives constant-time membership tests for every word of every document.
    stop_words = set(stopwords.words('english'))

    # Get number of stop words
    self.data["num_stopwords"] = self.data[text_column].apply(
        lambda doc: sum(1 for token in doc.split() if token in stop_words))

    # Get number of punctuations
    self.data["num_punctuations"] = self.data[text_column].apply(self.count_punctuation)

    # Get number of whitespace-separated tokens made up of digits only
    self.data["num_numerics"] = self.data[text_column].apply(
        lambda doc: sum(1 for token in doc.split() if token.isdigit()))

    # Get number of words in the document (split on single spaces)
    self.data["num_words"] = self.data[text_column].apply(
        lambda doc: len(str(doc).split(" ")))

    # Get average word length in document, rounded to one decimal
    self.data["avg_word_length"] = self.data[text_column].apply(
        lambda doc: round(self.avg_word(doc), 1))

    # Get the stopwords to word ratio
    self.data["stopwords_to_words_ratio"] = round(self.data["num_stopwords"] / self.data["num_words"], 3)

    return self.data

  def extractive_summarizer(self, model, text_column):
    """Write an extractive summary of every document to ``extractive_summarized_text``.

    For each document the sentence count comes from
    ``get_optimal_number_sentences``; ``model`` is then called as
    ``model(text, num_sentences=k)`` and its result is passed through
    ``"".join``.

    Args:
      model: Callable summarizer that also provides ``calculate_optimal_k``.
      text_column: Name of the column holding the documents.

    Returns:
      ``self.data`` with the new column.
    """
    self.data["extractive_summarized_text"] = self.data[text_column].apply(
        lambda doc: "".join(
            model(doc, num_sentences=self.get_optimal_number_sentences(doc, model))))

    return self.data


  def join_extracted_summary(self, abstract, extracted_summary, conclusion):
    """Concatenate three text columns, separated by spaces, into ``combined_text``.

    Args:
      abstract: Name of the first column.
      extracted_summary: Name of the second column.
      conclusion: Name of the third column.

    Returns:
      ``self.data`` with the new column.
    """
    self.data["combined_text"] = self.data[[abstract, extracted_summary, conclusion]].agg(
        " ".join, axis=1
    )

    return self.data

  def _summarize_once(self, model, text, max_length, min_length):
    """Call ``model`` on ``text`` and return the ``summary_text`` of its last result."""
    return model(text, max_length = max_length,
                 min_length = min_length, do_sample=False)[-1]["summary_text"]

  def abstractive_summarizer(self, model, text_column, max_length=750, min_length=250):
    """Write an abstractive summary of every document to ``abstractive_summaries``.

    Each document is passed to ``model`` in full. If that attempt raises, it is
    retried once on the first 1024 characters of the document; an error in the
    retry propagates to the caller.

    Args:
      model: Callable invoked as ``model(text, max_length=..., min_length=...,
        do_sample=False)`` that returns a sequence of dicts containing a
        ``"summary_text"`` entry.
      text_column: Name of the column holding the documents.
      max_length: Forwarded to ``model``.
      min_length: Forwarded to ``model``.

    Returns:
      ``self.data`` with the new column.
    """
    summaries_list = []
    # Iterate over the values themselves: looking rows up by 0..n-1 labels fails
    # whenever the frame's index is not the default range (e.g. after filtering).
    for text in self.data[text_column]:
      try:
        summary = self._summarize_once(model, text, max_length, min_length)
      except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through. The retry keeps a prefix measured in characters, not tokens.
        shortened = text[:self._FALLBACK_MAX_CHARS]
        summary = self._summarize_once(model, shortened, max_length, min_length)

      summaries_list.append(summary)

    self.data["abstractive_summaries"] = summaries_list

    return self.data

In [None]:
# Read the cleaned file by the same relative name the earlier cells use to write
# and reload it; an absolute "/content/..." path only resolves when the working
# directory happens to be /content.
text100_df = pd.read_csv("top100_cleaned.csv")

# Keep the first ten rows and store them in their own file for the next steps.
top10_data = text100_df.head(10)

top10_file_path = 'top10cleaned.csv'
top10_data.to_csv(top10_file_path, index=False)

# Extractive Summarization

In [None]:
text_df = pd.read_csv("top10cleaned.csv")
# Drop every leftover index column ("Unnamed: 0", "Unnamed: 0.1", ...) that earlier
# CSV round trips may have added, not just the first one. `drop` returns a new
# frame, so re-running the cell is safe.
leftover_index_columns = [name for name in text_df.columns if name.startswith("Unnamed")]
text_df = text_df.drop(columns=leftover_index_columns)
text_class = TextSummarizer(text_df)

# Compute the numeric text features from the full_text column
text_class.extract_text_features("full_text")

# Use scibert to perform extractive summarization
pretrained_model = 'allenai/scibert_scivocab_uncased'

# Load model, model config and tokenizer via Transformers
custom_config = AutoConfig.from_pretrained(pretrained_model)
custom_config.output_hidden_states=True
custom_tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
custom_model = AutoModel.from_pretrained(pretrained_model, config=custom_config)

# Create pretrained-model object
model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

# Extractive summarization
extractive_summarized_text = text_class.extractive_summarizer(model, "full_text")

# Save dataframe containing extractive summaries
extractive_summarized_text.to_csv("extractive_summarized_dataframe_final.csv")

# Check extractive summaries
extractive_summaries = extractive_summarized_text["extractive_summarized_text"]
print(extractive_summaries)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/sna

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/vocab.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/pytorch_model.bin
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializin

0    Word lattice decoding has proven useful in spo...
1    We formulate the problem of nonprojective depe...
2    We propose a cascaded linear model for joint C...
3    In this paper, we propose a novel string-todep...
4    We present a novel transition system for depen...
5    Morphological processes in Semitic languages d...
6    Previous studies of data-driven dependency par...
7    This paper presents an unsupervised opinanalys...
8    We present a phrasal synchronous grammar model...
9    Broad-coverage annotated treebanks necessary t...
Name: extractive_summarized_text, dtype: object


# Abstractive Summarization

In [None]:
# Abstractive summarization of the extractively summarized text.
# This cell needs `extractive_summarized_text` from the extractive-summarization
# cell, so that cell has to be run first.

text_class = TextSummarizer(extractive_summarized_text)

# Concatenate the abstract, the extractive summary and the conclusion (in that
# order) into the "combined_text" column
text_class.join_extracted_summary("abstract", "extractive_summarized_text", "conclusion")

# Instantiate abstractive summarizer model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# No lengths are passed, so abstractive_summarizer runs with its defaults
# (max_length=750, min_length=250).
abstractive_summarized_text = text_class.abstractive_summarizer(summarizer, "combined_text")

# Save to csv
abstractive_summarized_text.to_csv("abstractive_summarized_dataframe_final.csv")
abstractive_summaries = abstractive_summarized_text["abstractive_summaries"]

# Compare abstractive summaries and full text
print(abstractive_summarized_text[["full_text", "abstractive_summaries"]])

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-large-cnn",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": fals

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/model.safetensors
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}

All model checkpoint weights were used when initializing BartForConditionalGeneration.

All the weights of BartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BartForConditionalGeneration for predictions without further training.


generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-large-cnn",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/merges.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-large-cnn",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_l

                                           full_text  \
0  Word lattice decoding has proven useful in spo...   
1  We formulate the problem of nonprojective depe...   
2  We propose a cascaded linear model for joint C...   
3  In this paper, we propose a novel string-todep...   
4  We present a novel transition system for depen...   
5  Morphological processes in Semitic languages d...   
6  Previous studies of data-driven dependency par...   
7  This paper presents an unsupervised opinanalys...   
8  We present a phrasal synchronous grammar model...   
9  Broad-coverage annotated treebanks necessary t...   

                               abstractive_summaries  
0  We show that prior work in translating lattice...  
1  We formulate the problem of nonprojective depe...  
2  We propose a cascaded linear model for joint C...  
3  In this paper, we propose a novel string-todep...  
4  System constructs arcs only between adjacent w...  
5  Morphological processes in Semitic languages d... 

# Model Evaluation

In [None]:
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install sacrebleu

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from evaluate import load

# Load the ROUGE and BLEU metric implementations through the `evaluate` package.
rouge_metric = load("rouge")
bleu_metric = load("bleu")

# This cell needs `abstractive_summarized_text` from the abstractive-summarization cell.
# NOTE(review): the references below are the papers' full texts, not human-written
# summaries, so both scores measure overlap between each generated summary and its
# own source document. Confirm this is the intended comparison before reading the
# numbers as summary quality.
references = abstractive_summarized_text["full_text"].tolist()  # Ground truth (original full text)
predictions = abstractive_summarized_text["abstractive_summaries"].tolist()  # Generated summaries

#ROUGE scores
rouge_results = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
print("ROUGE Scores:", rouge_results)

#BLEU score
# Each reference is wrapped in its own list: one list of references per prediction.
bleu_results = bleu_metric.compute(predictions=predictions,references=[[ref] for ref in references])
print("BLEU Score:", bleu_results['bleu'])

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

ROUGE Scores: {'rouge1': 0.08687512822719118, 'rouge2': 0.07724751665580187, 'rougeL': 0.06870929152358406, 'rougeLsum': 0.06871598130566486}
BLEU Score: 5.027203733528803e-10


# Saving the model

In [None]:
# First, ensure we have the necessary libraries installed
!pip install transformers
!pip install torch

# Importing necessary libraries
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Load the pre-trained BART summarization model and its tokenizer from the Hub.
# Note: `model` is rebound here; in the extractive-summarization cell the same
# name refers to the `Summarizer` object.
model_name = "facebook/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure that all tensors are contiguous before saving
for param in model.parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()

# Target folder inside the Colab environment.
# NOTE(review): this is a local /content path, not Google Drive — no cell shown
# here mounts Drive.
save_directory = '/content/brevity_model/'

# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Verify the model is saved by listing the directory contents
import os
print("Saved model files:", os.listdir(save_directory))

# The saved copy can be loaded back later like so:
# model = AutoModelForSeq2SeqLM.from_pretrained(save_directory)
# tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Build a summarization pipeline from the in-memory model and tokenizer (the same
# objects that were just saved; nothing is reloaded from save_directory here).
# This rebinds `summarizer`, which the abstractive-summarization cell also defines.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)




loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-large-cnn/snapshots/37f520fa929c961707657b28798b30c003dd100b/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-large-cnn",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": fals

Saved model files: ['merges.txt', 'tokenizer.json', 'tokenizer_config.json', 'special_tokens_map.json', 'config.json', 'model.safetensors', 'vocab.json', 'generation_config.json']


In [None]:
# Short sample passage used to smoke-test the pipeline built in the previous cell.
text = """
Hugging Face has become a leading provider of natural language processing (NLP) tools and models.
Their library, transformers, allows easy access to state-of-the-art models such as BERT, GPT, and T5.
These models can be used for a variety of NLP tasks, including text classification, translation, and summarization.
"""

# Use the model to generate a summary of the example text
# (needs `summarizer` from the previous cell).
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)

# Print the summary
print("Summary:", summary[0]['summary_text'])

# To download the model files to your local machine
# (google.colab is imported here, so this part only runs where that package exists).
from google.colab import files
import shutil

# Zip the saved-model folder to /content/brevity_model.zip, then hand the archive
# to the browser for download.
shutil.make_archive('/content/brevity_model', 'zip', '/content/brevity_model')
files.download('/content/brevity_model.zip')

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}



Summary: Hugging Face has become a leading provider of natural language processing tools and models. Their library, transformers, allows easy access to state-of-the-art models such as BERT, GPT, and T5.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Repeats the download of the archive created in the previous cell; it does not
# rebuild the zip, so that cell must have been run first.
from google.colab import files
import shutil  # imported but not used in this cell
files.download('/content/brevity_model.zip')