In [7]:
from google.colab import drive
drive.mount('/content/drive')

import os
print(os.getcwd())
folder = "/content/drive/My Drive/Colab Notebooks/MDS"
os.chdir(folder)
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/My Drive/Colab Notebooks/MDS




---
preprocessing already applied :

```
# This code is already implemented on file
```




**Dataset Info**

In [None]:
import pandas as pd
df=pd.read_csv("dataset/combined_data_resolved.csv")
df.head()


Unnamed: 0,Index,Article,Summary
0,1,\nD-Tree Grammars\n\ndesigned to share some of...,Title: D-Tree Grammars\n\nAbstract: designed t...
1,2,\nJoint Learning Improves Semantic Role Labeli...,Title: Joint Learning Improves Semantic Role L...
2,3,\nBilingually-Constrained (Monolingual) Shift-...,Title: Bilingually-Constrained (Monolingual) S...
3,4,\nA Generative Constituent-Context Model For I...,Title: A Generative Constituent-Context Model ...
4,5,\nWord Association Norms Mutual Information An...,Title: Word Association Norms Mutual Informati...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608 entries, 0 to 607
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Index    608 non-null    int64 
 1   Article  608 non-null    object
 2   Summary  608 non-null    object
dtypes: int64(1), object(2)
memory usage: 14.4+ KB


In [None]:

wc= pd.DataFrame()

wc['article'] = df['Article'].str.split().str.len()
wc['summary'] = df['Summary'].str.split().str.len()

print(f"Maximum word count:{ wc['article'].max()} average : {wc['article'].mean()} min : {wc['article'].min()}")

print(f"Maximum word count:{ wc['summary'].max()} average : {wc['summary'].mean()} min : {wc['summary'].min()}")


Maximum word count:18867 average : 4003.159539473684 min : 748
Maximum word count:5591 average : 407.65625 min : 111




---

**Preprocessing**

---



In [None]:
import re

def preprocess_text(text):
    text = text.replace('\n', ' ').replace('\r', ' ').strip()
    text = re.sub(r'[^\w\s.,!?;\'"-]', '', text)             #special character except punctuations
    text = re.sub(r'\s+', ' ', text)

    return text

df['Article'] = df['Article'].apply(preprocess_text)
df['Summary'] = df['Summary'].apply(preprocess_text)
df.head()

Unnamed: 0,Index,Article,Summary
0,1,D-Tree Grammars designed to share some of the ...,Title D-Tree Grammars Abstract designed to sha...
1,2,Joint Learning Improves Semantic Role Labeling...,Title Joint Learning Improves Semantic Role La...
2,3,Bilingually-Constrained Monolingual Shift-Redu...,Title Bilingually-Constrained Monolingual Shif...
3,4,A Generative Constituent-Context Model For Imp...,Title A Generative Constituent-Context Model F...
4,5,Word Association Norms Mutual Information And ...,Title Word Association Norms Mutual Informatio...


In [None]:
df.to_csv("dataset/final_cleaned.csv", index=False)

**create train test splits**

In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
print(f"Total set size: {len(df)}")
print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

#save
train_df.to_csv("dataset/train.csv", index=False)
test_df.to_csv("dataset/test.csv", index=False)

Total set size: 608
Training set size: 547
Testing set size: 61




---



#**start point**






---
**DATA LOADER**

---


In [8]:
!pip install transformers datasets torch rouge-score






---
multinews dataset for later


In [None]:
# from datasets import load_dataset
# dataset=load_dataset('multi_news',split='test')



---



In [9]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv("dataset/final_cleaned.csv")

dataset = Dataset.from_pandas(df)
dataset


Dataset({
    features: ['Index', 'Article', 'Summary'],
    num_rows: 608
})

In [10]:
scisumm = dataset.train_test_split(test_size=0.1, seed =42 )
scisumm


DatasetDict({
    train: Dataset({
        features: ['Index', 'Article', 'Summary'],
        num_rows: 547
    })
    test: Dataset({
        features: ['Index', 'Article', 'Summary'],
        num_rows: 61
    })
})

In [11]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [12]:
def preprocess_function(examples):
    inputs = [doc for doc in examples["Article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(examples["Summary"], max_length=128, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
tokenized_scisumm=scisumm.map(preprocess_function, batched=True)
tokenized_scisumm

Map:   0%|          | 0/547 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Index', 'Article', 'Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 547
    })
    test: Dataset({
        features: ['Index', 'Article', 'Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 61
    })
})

In [14]:
tokenized_scisumm = tokenized_scisumm.remove_columns(['Index', 'Article', 'Summary'])
tokenized_scisumm

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 547
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 61
    })
})

In [15]:
from transformers import T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

model=T5ForConditionalGeneration.from_pretrained('t5-small')
model

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [16]:
# training arguments
training_arguments = Seq2SeqTrainingArguments(
            output_dir='./results',
            evaluation_strategy='epoch',

            save_strategy='epoch',  #save model state after each epoch (space consuming)
            #save_strategy='no',  #1. use to save model only once after finishing training (comment out above line in case)

            logging_dir='./logs',
            learning_rate=2e-5,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            weight_decay=0.01,
            save_total_limit=4,
            num_train_epochs=2,
            # remove_unused_columns=False,
            fp16=True,
            )


trainer = Seq2SeqTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_scisumm['train'],
    eval_dataset=tokenized_scisumm['test'],
    tokenizer=tokenizer,
    data_collator=data_collator
    )


  trainer = Seq2SeqTrainer(


In [17]:
trainer.train()

#trainer.save_model()  # 2.save model after completing training (use if (1.) is followed)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.483852
2,1.048600,0.399393


TrainOutput(global_step=548, training_loss=1.0214617304558302, metrics={'train_runtime': 6621.9363, 'train_samples_per_second': 0.165, 'train_steps_per_second': 0.083, 'total_flos': 296127861620736.0, 'train_loss': 1.0214617304558302, 'epoch': 2.0})

**Evaluate**

In [18]:
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
import torch

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def generate_summaries(test_dataset, model, tokenizer, batch_size=4, max_output_length=150):
    model.eval()
    all_summaries = []

    dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=data_collator)

    for batch in dataloader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)

        summary_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_output_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )


        summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
        all_summaries.extend(summaries)

    return all_summaries

generated_summaries = generate_summaries(tokenized_scisumm['test'], model, tokenizer)

for i in range(5):
    print(f"Generated Summary {i+1}: {generated_summaries[i]}")


Generated Summary 1: Discriminative Training And Maximum Entropy Models For Statistical Machine Translation Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract Abstract
Generated Summary 2

In [20]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [21]:
import evaluate

rouge = evaluate.load("rouge")

def decode_labels(labels, tokenizer):
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return decoded_labels

generated_summaries = generate_summaries(tokenized_scisumm['test'], model, tokenizer)
reference_summaries = decode_labels(tokenized_scisumm['test']['labels'], tokenizer)


def calculate_rouge(generated_summaries, reference_summaries):
    results = rouge.compute(predictions=generated_summaries, references=reference_summaries)
    return results

rouge_results = calculate_rouge(generated_summaries, reference_summaries)


print("ROUGE Scores:")
print(rouge_results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Scores:
{'rouge1': 0.6950126022757905, 'rouge2': 0.6364337606453072, 'rougeL': 0.6408800051875528, 'rougeLsum': 0.6419400906456656}




---
**su4 included**


In [None]:
import evaluate

rouge = evaluate.load("rouge")

def decode_labels(labels, tokenizer):
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return decoded_labels

generated_summaries = generate_summaries(tokenized_scisumm['test'], model, tokenizer)
reference_summaries = decode_labels(tokenized_scisumm['test']['labels'], tokenizer)


def calculate_rouge(generated_summaries, reference_summaries):
    results = rouge.compute(predictions=generated_summaries, references=reference_summaries, rouge_types=["rouge1", "rouge2", "rougeL", "rouge4"])
    return results

rouge_results = calculate_rouge(generated_summaries, reference_summaries)


print("ROUGE Scores:")
print(rouge_results)



---


Bert Model  : https://huggingface.co/google-t5/t5-small
---



END

---



---





---
**live test**
