# T5 model for text simplification

## Import packages

In [None]:
!pip install --upgrade pip
!pip install --upgrade torch
!pip install SentencePiece
!pip uninstall transformers -y
!pip install transformers
!pip install rich
!pip install rouge
!pip install evaluate
!pip install Levenshtein

[0mFound existing installation: transformers 4.36.1
Uninstalling transformers-4.36.1:
  Successfully uninstalled transformers-4.36.1
[0mCollecting transformers
  Using cached transformers-4.36.1-py3-none-any.whl.metadata (126 kB)
Using cached transformers-4.36.1-py3-none-any.whl (8.3 MB)
Installing collected packages: transformers
Successfully installed transformers-4.36.1
[0m

In [None]:
import string
import json
import pandas as pd
from tqdm import tqdm
import re
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import BartTokenizer, BartForConditionalGeneration

# rich: for a better display on terminal
from rich.table import Column, Table
from rich import box
from rich.console import Console
from torch import cuda

## Train data

In [None]:
# Load the training pairs and rename the 'Normal'/'Simple' columns to the
# 'source_snt'/'simplified_snt' names used by the rest of the notebook.
df=pd.read_csv("/content/wikilarge_train.csv")
new_column_names = {'Normal': 'source_snt', 'Simple': 'simplified_snt'}
df.rename(columns=new_column_names, inplace=True)
df

Unnamed: 0,source_snt,simplified_snt
0,engines need oil to make them slippery or the ...,engines need oil to make them slippery or the ...
1,media studies is an academic discipline that d...,media studies is an academic discipline which ...
2,the formation of the mediterranean sea is a mo...,the formation of the mediterranean sea is a mo...
3,many insects use a sex determination system ba...,because genetic sex determination is controlle...
4,take me away lrb together as one rrb lrb stan...,take me away lrb together as one rrb
...,...,...
595,he was burned at the stake by secular authorit...,he was burnt at the stake as a heretic by the ...
596,he has since resigned for super nova racing to...,he has re signed for super nova racing to driv...
597,the entrance of the university of a coru a was...,it was blocked by flood waters on october. a n...
598,lim is a commune in the aisne department in pi...,it is found in the region picardie in the aisn...


In [None]:
def data_info(data, column_name):
    """Return the maximum and average whitespace-token length of a text column.

    Args:
        data: Table indexed by column name (e.g. a pandas DataFrame) whose
            column holds strings.
        column_name: Name of the column to measure.

    Returns:
        A ``(max_length, average_length)`` tuple, or ``(0, 0.0)`` when the
        column is empty.
    """
    # Split each sentence once and reuse the counts for both statistics.
    lengths = [len(sentence.split()) for sentence in data[column_name]]
    if not lengths:
        return 0, 0.0
    return max(lengths), sum(lengths) / len(lengths)

# Report the longest and mean sentence length (in whitespace tokens) of the
# source sentences of the training data.
max_length,average_length=data_info(df,'source_snt')
print({'Dataframe':'Train data','max length of source sentence':max_length,'avg length of source sentence':average_length})

# Same statistics for the simplified (target) sentences.
max_length,average_length=data_info(df,'simplified_snt')
print({'Dataframe':'Train data','max length of simplified sentence':max_length,'avg length of simplified sentence':average_length})

{'Dataframe': 'Train data', 'max length of source sentence': 69, 'avg length of source sentence': 22.86}
{'Dataframe': 'Train data', 'max length of simplified sentence': 63, 'avg length of simplified sentence': 18.715}


## Add control tokens to the training data

In [None]:
from string import punctuation
import Levenshtein
import spacy
import nltk
import time
from nltk.corpus import stopwords
import multiprocessing
from multiprocessing import Pool, Lock
from tqdm import tqdm
nltk.download('stopwords')
import threading
from queue import Queue
from functools import lru_cache

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pip install sacremoses
!pip install pytorch-lightning
!pip install python-Levenshtein
!pip install sacrebleu
!pip install yattag

[0m

In [None]:
from source.resources import DUMPS_DIR, WORD_FREQUENCY_FILEPATH, WORD_EMBEDDINGS_NAME, download_glove,COMPLEXITY_MODEL_FILEPATH,GOOGLE_WORD_EMBEDDINGS_FILEPATH
from source.helper import tokenize, yield_lines, load_dump, dump, write_lines,save_preprocessor, yield_sentence_pair,generate_hash

In [None]:
# English stop-word set used by remove_stopwords().
# NOTE(review): this rebinds the name `stopwords`, which was imported from
# nltk.corpus above, to a plain set; re-running this cell without re-running
# the import would presumably fail because a set has no `.words` - confirm.
stopwords = set(stopwords.words('english'))

def round(val):
    """Format ``val`` as a string with exactly two decimal places.

    The function keeps the name ``round`` (hiding the builtin) because other
    cells of this notebook call it under that name.
    """
    return '{:.2f}'.format(val)

def safe_division(a, b):
    """Return ``a / b``, or ``0`` when ``b`` is falsy (for example zero)."""
    if not b:
        return 0
    return a / b

@lru_cache(maxsize=1024)
def is_punctuation(word):
    """Return True when ``word`` consists only of punctuation characters.

    The empty string also yields True, since it contains no other character.
    """
    return all(char in punctuation for char in word)

def remove_punctuation(text):
    """Tokenize ``text`` and re-join the tokens that are not pure punctuation."""
    kept_tokens = []
    for token in tokenize(text):
        if is_punctuation(token):
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def remove_stopwords(text):
    """Tokenize ``text`` and re-join the tokens whose lower-case form is not a stop word."""
    content_tokens = []
    for token in tokenize(text):
        if token.lower() in stopwords:
            continue
        content_tokens.append(token)
    return ' '.join(content_tokens)

In [None]:
@lru_cache(maxsize=1)
def get_spacy_model():
    """Load the ``en_core_web_sm`` spaCy pipeline, downloading it when missing.

    Returns:
        The loaded pipeline. The result is cached, so the model is loaded at
        most once per process.
    """
    model = 'en_core_web_sm'
    if not spacy.util.is_package(model):
        spacy.cli.download(model)
        # Shortcut links (`spacy.cli.link`) exist only in spaCy v2; v3 removed
        # them and loads installed packages by name, so only link when the
        # function is actually available.
        if hasattr(spacy.cli, 'link'):
            spacy.cli.link(model, model, force=True, model_path=spacy.util.get_package_path(model))
    return spacy.load(model)

@lru_cache(maxsize=10 ** 6)
def spacy_process(text):
    """Run the shared spaCy pipeline on ``str(text)``; results are memoised per input."""
    nlp = get_spacy_model()
    return nlp(str(text))

In [None]:
@lru_cache(maxsize=1024)
def get_dependency_tree_depth(sentence):
    """Return the depth of the deepest dependency tree among the parsed sentences.

    ``sentence`` is parsed with ``spacy_process``; a leaf root counts as depth 0,
    and 0 is also returned when the parse contains no sentences.
    """
    def tree_height(node):
        # Height of the subtree rooted at `node` (0 for a node without children).
        child_heights = [tree_height(child) for child in node.children]
        if not child_heights:
            return 0
        return 1 + max(child_heights)

    parsed = spacy_process(sentence)
    return max((tree_height(sent.root) for sent in parsed.sents), default=0)

In [None]:
@lru_cache(maxsize=1)
def get_word2rank(vocab_size=np.inf):
    """Return a ``word -> rank`` dict built from the word-embeddings file.

    The dict is loaded from the pickled dump under ``DUMPS_DIR`` when that dump
    exists. Otherwise the embeddings are downloaded, each line's first
    space-separated field is taken as the word and its line index as the rank
    (stopping after ``vocab_size`` lines), the dict is dumped, and the
    downloaded ``.txt``/``.zip`` files are deleted.
    """
    dump_path = DUMPS_DIR / f"{WORD_EMBEDDINGS_NAME}.pk"
    if dump_path.exists():
        return load_dump(dump_path)

    print("Downloading glove.42B.300d ...")
    download_glove(model_name='glove.42B.300d', dest_dir=str(DUMPS_DIR))
    print("Preprocessing word2rank...")
    DUMPS_DIR.mkdir(parents=True, exist_ok=True)
    embeddings_txt = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.txt'
    embeddings_zip = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.zip'

    word2rank = {}
    for line_index, line in enumerate(yield_lines(embeddings_txt)):
        if line_index >= vocab_size:
            break
        word2rank[line.split(' ')[0]] = line_index
    dump(word2rank, dump_path)

    # The raw download is no longer needed once the rank table has been dumped.
    for leftover in (embeddings_txt, embeddings_zip):
        if leftover.exists():
            leftover.unlink()
    return word2rank

get_word2rank()

@lru_cache(maxsize=10000)
def get_normalized_rank(word):
    """Return the log-rank of ``word`` divided by the log of the vocabulary size.

    Words missing from the rank table are given the rank ``len(table)``, so
    they score exactly 1.0.
    """
    word2rank = get_word2rank()
    # Named `vocab_size` rather than `max` so the builtin is not shadowed.
    vocab_size = len(word2rank)
    rank = word2rank.get(word, vocab_size)
    return np.log(1 + rank) / np.log(1 + vocab_size)

@lru_cache(maxsize=5000)
def get_rank(word):
    """Return ``log(1 + rank)`` of ``word``; unknown words use the table size as rank."""
    vocabulary = get_word2rank()
    fallback_rank = len(get_word2rank())
    return np.log(1 + vocabulary.get(word, fallback_rank))

@lru_cache(maxsize=2048)
def get_lexical_complexity_score(sentence):
    """Return the third quartile of the log-ranks of the known content words.

    Punctuation and stop words are removed first, and words absent from the
    rank table are ignored. When nothing remains the score is 1.0.
    (An earlier variant averaged ``get_normalized_rank`` instead.)
    """
    cleaned = remove_stopwords(remove_punctuation(sentence))
    log_ranks = [get_rank(token) for token in tokenize(cleaned) if token in get_word2rank()]
    if not log_ranks:
        return 1.0
    return np.quantile(log_ranks, 0.75)

# @lru_cache(maxsize=2048)
def get_lexical_complexity_score_batch(simple_sentences,complex_sentences):
    """Return the mean simple/complex lexical-complexity ratio over paired sentences.

    Each ratio is capped at 2 and rounded to two decimals (via the notebook's
    string-formatting ``round``) before averaging.
    """
    ratios = []
    for position, simple_sentence in enumerate(simple_sentences):
        simple_score = get_lexical_complexity_score(simple_sentence)
        complex_score = get_lexical_complexity_score(complex_sentences[position])
        capped_ratio = min(safe_division(simple_score, complex_score), 2)
        ratios.append(float(round(capped_ratio)))
    return np.mean(ratios)

In [None]:
simple_sentences=['Current academic and industrial research is interested in autonomous vehicles.','Current academic and industrial research is interested in autonomous vehicles.']
complex_sentences=['In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.','In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.']
get_lexical_complexity_score_batch(simple_sentences,complex_sentences)

0.88

In [None]:

@lru_cache(maxsize=1)
def get_word_frequency():
    """Return a ``word -> frequency`` dict.

    The dict is loaded from the pickled dump under ``DUMPS_DIR`` when present.
    Otherwise it is parsed from ``WORD_FREQUENCY_FILEPATH`` (one
    ``word<space>count`` pair per line, UTF-8) and dumped for later runs.
    """
    dump_path = DUMPS_DIR / f'{WORD_FREQUENCY_FILEPATH.stem}.pk'
    if dump_path.exists():
        return load_dump(dump_path)

    DUMPS_DIR.mkdir(parents=True, exist_ok=True)
    frequencies = {}
    with open(WORD_FREQUENCY_FILEPATH,'r',encoding='utf-8') as freq_file:
        for raw_line in freq_file:
            fields = raw_line.rstrip().split(' ')
            frequencies[fields[0]] = int(fields[1])
    dump(frequencies, dump_path)
    return frequencies

get_word_frequency()

@lru_cache(maxsize=10000)
def get_normalized_frequency(word):
    """Return ``1 - log(1 + freq) / log(1 + max_frequency)`` for ``word``.

    Words missing from the frequency table have frequency 0 and therefore
    score 1.0; more frequent words score closer to 0.
    """
    # Hard-coded normaliser. The original note read "the 153141437, the max
    # frequency" - TODO confirm which count is authoritative.
    # Named `max_frequency` rather than `max` so the builtin is not shadowed.
    max_frequency = 179573112
    freq = get_word_frequency().get(word, 0)
    return 1.0 - np.log(1 + freq) / np.log(1 + max_frequency)

@lru_cache(maxsize=2048)
def get_complexity_score(sentence):
    """Return the mean normalized-frequency score of the known content words.

    Punctuation and stop words are removed first. When no remaining word is in
    the frequency table the score is 1.0.
    """
    cleaned = remove_stopwords(remove_punctuation(sentence))
    # NOTE(review): membership is tested with the token as written, but the
    # score is looked up with its lower-case form - confirm this is intended.
    scores = [
        get_normalized_frequency(token.lower())
        for token in tokenize(cleaned)
        if token in get_word_frequency()
    ]
    if not scores:
        return 1.0
    return np.array(scores).mean()

In [None]:
sentence='Current academic and industrial research is interested in autonomous vehicles.'
# sentence='In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.'
words = tokenize(remove_stopwords(remove_punctuation(sentence)))
words = [word for word in words if word in get_word2rank()]
rank=np.quantile([get_rank(word) for word in words], 0.75)
# rank=np.array([get_normalized_rank(word) for word in words]).mean()
rank

7.837459915682471

In [None]:
class RatioFeature:
    """Base class for features rendered as ``<NAME>_<ratio>`` control tokens.

    ``feature_extractor`` is a callable taking ``(complex_sentence,
    simple_sentence)``; ``target_ratio`` is stored as a string and used when
    only a single sentence is available.
    """

    def __init__(self, feature_extractor, target_ratio=0.80):
        self.target_ratio = str(target_ratio)
        self.feature_extractor = feature_extractor

    def encode_sentence(self, sentence):
        """Return the control token built from the configured target ratio."""
        return '{}_{}'.format(self.name, self.target_ratio)

    def encode_sentence_pair(self, complex_sentence, simple_sentence):
        """Return ``(control_token, simple_sentence)`` using the extracted ratio."""
        measured = self.feature_extractor(complex_sentence, simple_sentence)
        return '{}_{}'.format(self.name, measured), simple_sentence

    def decode_sentence(self, encoded_sentence):
        """Return ``encoded_sentence`` unchanged."""
        return encoded_sentence

    @property
    def name(self):
        """Initials of the class name once 'RatioFeature' has been stripped from it."""
        stripped = self.__class__.__name__.replace('RatioFeature', '')
        initials = ''.join(chunk[0] for chunk in re.findall('[A-Z][^A-Z]*', stripped))
        # Fall back to the stripped class name when it has no capitalised chunk.
        return initials if initials else stripped

In [None]:
class WordRatioFeature(RatioFeature):
    """Control token for the simple/complex ratio of token counts."""

    def __init__(self, *args, **kwargs):
        super().__init__(self.get_word_length_ratio, *args, **kwargs)

    def get_word_length_ratio(self, complex_sentence, simple_sentence):
        """Return the token-count ratio formatted to two decimals (0 if complex is empty)."""
        simple_count = len(tokenize(simple_sentence))
        complex_count = len(tokenize(complex_sentence))
        return round(safe_division(simple_count, complex_count))

class CharRatioFeature(RatioFeature):
    """Control token for the simple/complex ratio of character counts."""

    def __init__(self, *args, **kwargs):
        super().__init__(self.get_char_length_ratio, *args, **kwargs)

    def get_char_length_ratio(self, complex_sentence, simple_sentence):
        """Return the character-count ratio formatted to two decimals (0 if complex is empty)."""
        simple_chars = len(simple_sentence)
        complex_chars = len(complex_sentence)
        return round(safe_division(simple_chars, complex_chars))


class LevenshteinRatioFeature(RatioFeature):
    """Control token for the Levenshtein similarity ratio of the two sentences."""

    def __init__(self, *args, **kwargs):
        super().__init__(self.get_levenshtein_ratio, *args, **kwargs)

    def get_levenshtein_ratio(self, complex_sentence, simple_sentence):
        """Return ``Levenshtein.ratio`` of the pair formatted to two decimals."""
        similarity = Levenshtein.ratio(complex_sentence, simple_sentence)
        return round(similarity)


class WordRankRatioFeature(RatioFeature):
    """Control token for the simple/complex ratio of lexical complexity scores."""

    def __init__(self, *args, **kwargs):
        super().__init__(self.get_word_rank_ratio, *args, **kwargs)

    def get_word_rank_ratio(self, complex_sentence, simple_sentence):
        """Return the complexity ratio, capped at 2, formatted to two decimals."""
        ratio = safe_division(self.get_lexical_complexity_score(simple_sentence),
                              self.get_lexical_complexity_score(complex_sentence))
        return round(min(ratio, 2))

    def get_lexical_complexity_score(self, sentence):
        """Return the third quartile of the log-ranks of the known content words.

        When no known word remains, the log of ``1 + vocabulary size`` is
        returned, i.e. the score of an out-of-vocabulary word.
        """
        words = tokenize(remove_stopwords(remove_punctuation(sentence)))
        words = [word for word in words if word in get_word2rank()]
        if not words:
            return np.log(1 + len(get_word2rank()))
        return np.quantile([self.get_rank(word) for word in words], 0.75)

    # Cached as a static method: an lru_cache on an instance method would key
    # on `self` and keep every instance alive for the lifetime of the cache.
    @staticmethod
    @lru_cache(maxsize=5000)
    def get_rank(word):
        """Return ``log(1 + rank)`` of ``word``; unknown words use the table size as rank."""
        rank = get_word2rank().get(word, len(get_word2rank()))
        return np.log(1 + rank)


class DependencyTreeDepthRatioFeature(RatioFeature):
    """Control token for the simple/complex ratio of dependency-tree depths."""

    def __init__(self, *args, **kwargs):
        super().__init__(self.get_dependency_tree_depth_ratio, *args, **kwargs)

    def get_dependency_tree_depth_ratio(self, complex_sentence, simple_sentence):
        """Return the depth ratio formatted to two decimals (0 if complex depth is 0)."""
        return round(
            safe_division(self.get_dependency_tree_depth(simple_sentence),
                          self.get_dependency_tree_depth(complex_sentence)))

    # Both caches sit on static methods: an lru_cache on an instance method
    # would key on `self` and keep every instance alive with the cache.
    @staticmethod
    @lru_cache(maxsize=1024)
    def get_dependency_tree_depth(sentence):
        """Return the depth of the deepest dependency tree in ``sentence`` (0 if none)."""
        def get_subtree_depth(node):
            # Depth of the subtree rooted at `node`; a leaf has depth 0.
            child_depths = [get_subtree_depth(child) for child in node.children]
            if not child_depths:
                return 0
            return 1 + max(child_depths)

        parsed = DependencyTreeDepthRatioFeature.spacy_process(sentence)
        return max((get_subtree_depth(sent.root) for sent in parsed.sents), default=0)

    @staticmethod
    @lru_cache(maxsize=10 ** 6)
    def spacy_process(text):
        """Run the shared spaCy pipeline on ``text``; results are memoised per input."""
        return get_spacy_model()(text)

In [None]:
class Preprocessor:
    """Prefix sentences with the control tokens of the configured features.

    Args:
        features_kwargs: Mapping of feature class name to the keyword
            arguments used to build it. ``None`` or an empty mapping yields a
            preprocessor with no features that leaves sentences unchanged.
    """

    def __init__(self, features_kwargs=None):
        super().__init__()

        self.features = self.get_features(features_kwargs)
        if features_kwargs:
            self.hash = generate_hash(str(features_kwargs).encode())
        else:
            self.hash = "no_feature"

    def get_class(self, class_name, *args, **kwargs):
        """Instantiate the class named ``class_name`` looked up in this module's globals."""
        return globals()[class_name](*args, **kwargs)

    def get_features(self, feature_kwargs):
        """Build one feature object per entry; returns ``[]`` for ``None``/empty input."""
        if not feature_kwargs:
            # The constructor defaults to None, so this must not call .items().
            return []
        return [
            self.get_class(feature_name, **kwargs)
            for feature_name, kwargs in feature_kwargs.items()
        ]

    def encode_sentence(self, sentence):
        """Prefix ``sentence`` with every feature's target-ratio token."""
        if not self.features:
            return sentence
        tokens = [feature.encode_sentence(sentence) for feature in self.features]
        return ' '.join(tokens + [sentence]).rstrip()

    def encode_sentence_pair(self, complex_sentence, simple_sentence):
        """Prefix ``complex_sentence`` with tokens measured on the sentence pair."""
        if not self.features:
            return complex_sentence
        tokens = [
            feature.encode_sentence_pair(complex_sentence, simple_sentence)[0]
            for feature in self.features
        ]
        return ' '.join(tokens + [complex_sentence]).rstrip()

    def decode_sentence(self, encoded_sentence):
        """Apply every feature's decoder in turn and return the result.

        With no features the input is returned unchanged.
        """
        decoded_sentence = encoded_sentence
        for feature in self.features:
            # Feed each decoder the previous result so that decoders compose.
            decoded_sentence = feature.decode_sentence(decoded_sentence)
        return decoded_sentence

    def encode_file(self, input_filepath, output_filepath):
        """Write the encoded form of every line of ``input_filepath`` to ``output_filepath``."""
        with open(output_filepath, 'w') as f:
            for line in yield_lines(input_filepath):
                f.write(self.encode_sentence(line) + '\n')

    def decode_file(self, input_filepath, output_filepath):
        """Write the decoded form of every line of ``input_filepath`` to ``output_filepath``."""
        with open(output_filepath, 'w') as f:
            for line in yield_lines(input_filepath):
                f.write(self.decode_sentence(line) + '\n')

    def encode_dataframe(self,dataset):
        """Return the encoded source sentence for each row of ``dataset``.

        Reads the 'source_snt' and 'simplified_snt' columns and shows a tqdm
        progress bar while encoding.
        """
        processed_complex_sentences = []
        for complex_sentence, simple_sentence in tqdm(zip(dataset['source_snt'], dataset['simplified_snt']),total=len(dataset)):
            processed_complex_sentences.append(
                self.encode_sentence_pair(complex_sentence, simple_sentence))
        return processed_complex_sentences

    def preprocess_dataset(self, dataset):
        """Return a copy of ``dataset`` whose 'source_snt' column carries control tokens."""
        new_df=dataset.copy()
        new_df['source_snt']= self.encode_dataframe(dataset)
        return new_df

In [None]:
# Control-token configuration: one entry per feature class, each with the
# target ratio that encode_sentence() emits when no reference sentence is
# available. The commented line below disables all features.
features_kwargs = {
        'WordRatioFeature': {'target_ratio': '1.05'},
        'CharRatioFeature': {'target_ratio': '0.95'},
        'LevenshteinRatioFeature': {'target_ratio': '0.75'},
        'WordRankRatioFeature': {'target_ratio': '0.95'},
        'DependencyTreeDepthRatioFeature': {'target_ratio': '0.85'}
    }
# features_kwargs = {}
preprocessor = Preprocessor(features_kwargs)

In [None]:
preprocessor = Preprocessor(features_kwargs)
# preprocessor.encode_sentence_pair('In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.','Current academic and industrial research is interested in autonomous vehicles.')
preprocessor.encode_sentence('In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.')

'W_1.05 C_0.95 L_0.75 WR_0.95 DTD_0.85 In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.'

In [None]:
simple_sentence='Current academic and industrial research is interested in autonomous vehicles.'
complex_sentence='In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.'
round(min(safe_division(get_lexical_complexity_score(simple_sentence),
                                       get_lexical_complexity_score(complex_sentence)),2))

'0.88'

In [None]:
new_df = preprocessor.preprocess_dataset(df)

100%|██████████| 600/600 [00:13<00:00, 44.93it/s]


In [None]:
new_df

Unnamed: 0,source_snt,simplified_snt
0,W_0.94 C_0.97 L_0.91 WR_1.00 DTD_0.80 engines ...,engines need oil to make them slippery or the ...
1,W_0.82 C_0.79 L_0.85 WR_1.04 DTD_0.78 media st...,media studies is an academic discipline which ...
2,W_0.52 C_0.53 L_0.69 WR_1.00 DTD_0.80 the form...,the formation of the mediterranean sea is a mo...
3,W_2.47 C_2.46 L_0.43 WR_1.15 DTD_1.57 many ins...,because genetic sex determination is controlle...
4,W_0.67 C_0.63 L_0.77 WR_0.86 DTD_1.00 take me...,take me away lrb together as one rrb
...,...,...
595,W_0.70 C_0.59 L_0.65 WR_1.12 DTD_1.25 he was b...,he was burnt at the stake as a heretic by the ...
596,W_1.11 C_1.10 L_0.89 WR_0.91 DTD_1.00 he has s...,he has re signed for super nova racing to driv...
597,W_0.58 C_0.58 L_0.73 WR_1.05 DTD_0.50 the entr...,it was blocked by flood waters on october. a n...
598,W_1.21 C_1.14 L_0.68 WR_0.87 DTD_1.00 lim is a...,it is found in the region picardie in the aisn...


## Test data

In [None]:

# Load the test set and rename its columns to the names used for training.
test=pd.read_csv("/content/asset_test.csv")

new_column_names = {'original': 'source_snt', 'simplifications': 'simplified_snt'}
test.rename(columns=new_column_names, inplace=True)
# test_s keeps every column except the reference simplifications.
columns_to_drop = ['simplified_snt']
test_s = test.drop(columns=columns_to_drop)

In [None]:

# Shared rich console used for logging by the training and testing functions.
console = Console(record=True)


def display_df(df):
    """Print the first two columns of ``df`` as an ASCII 'Sample Data' table.

    A fresh ``Console`` is used for printing rather than the module-level one.
    """
    sample_table = Table(
        Column("source_text", justify="left"),
        Column("target_text", justify="left"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )
    for record in df.values.tolist():
        source_cell, target_cell = record[0], record[1]
        sample_table.add_row(source_cell, target_cell)

    Console().print(sample_table)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

## Dataset

In [None]:
class TrainDataSetClass(Dataset):
    """Dataset yielding tokenised (source, target) pairs for training/validation.

    Args:
        dataframe: Table indexed by column name; rows are read with ``[index]``,
            so a DataFrame is presumably expected to carry a 0..n-1 index.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        source_len: Length the source ids are padded/truncated to.
        target_len: Length the target ids are padded/truncated to.
        source_text: Name of the source column.
        target_text: Name of the target column.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of examples."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenise one text and return its ``(input_ids, attention_mask)`` as long tensors."""
        encoded = self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            # `padding="max_length"` alone is enough; the deprecated
            # `pad_to_max_length` flag is no longer passed alongside it.
            padding="max_length",
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch dimension, so a length-1 sequence
        # stays 1-D instead of collapsing to a scalar.
        input_ids = encoded["input_ids"].squeeze(0).to(dtype=torch.long)
        attention_mask = encoded["attention_mask"].squeeze(0).to(dtype=torch.long)
        return input_ids, attention_mask

    def __getitem__(self, index):
        """Return the raw source text plus ids/masks for source and target."""
        row_source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        # Collapse runs of whitespace before tokenising; the raw source text is
        # returned untouched.
        source_text = " ".join(row_source_text.split())
        target_text = " ".join(target_text.split())

        source_ids, source_mask = self._encode(source_text, self.source_len)
        target_ids, target_mask = self._encode(target_text, self.summ_len)

        return {
            "source_text":row_source_text,
            "source_ids": source_ids,
            "source_mask": source_mask,
            "target_ids": target_ids,
            "target_mask": target_mask,
        }

In [None]:
class TestDataSetClass(Dataset):
    """Dataset yielding tokenised source sentences for inference.

    Args:
        dataframe: Table indexed by column name; rows are read with ``[index]``,
            so a DataFrame is presumably expected to carry a 0..n-1 index.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        source_len: Length the source ids are padded/truncated to.
        source_text: Name of the source column.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, source_text
    ):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of examples."""
        return len(self.source_text)

    def __getitem__(self, index):
        """Return the source ids and attention mask of example ``index``."""
        source_text = str(self.source_text[index])

        # Collapse runs of whitespace before tokenising.
        source_text = " ".join(source_text.split())

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            truncation=True,
            # `padding="max_length"` alone is enough; the deprecated
            # `pad_to_max_length` flag is no longer passed alongside it.
            padding="max_length",
            return_tensors="pt",
        )

        # squeeze(0) removes only the batch dimension, so a length-1 sequence
        # stays 1-D instead of collapsing to a scalar.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
        }

## Train, validation and test

In [None]:

def training_logger_init():
    """Create the empty 'Training Status' table that collects per-step metrics."""
    headers = (
        "Epoch", "Steps", "Loss",
        "ROUGE_1", "ROUGE_2", "ROUGE_L",
        "SARI", "BLEU", "FKGL",
    )
    columns = [Column(header, justify="center") for header in headers]
    return Table(*columns, title="Training Status", pad_edge=False, box=box.ASCII)


def epoch_training_logger_init():
    """Create the empty 'Training Epoch Status' table that collects per-epoch metrics."""
    headers = (
        "Epoch", "Train_Loss", "Valid_Loss",
        "ROUGE_1", "ROUGE_2", "ROUGE_L",
        "SARI", "BLEU", "FKGL",
    )
    columns = [Column(header, justify="center") for header in headers]
    return Table(*columns, title="Training Epoch Status", pad_edge=False, box=box.ASCII)

In [None]:
from rouge import Rouge
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu

def calculate_rouge(predicted, target):
    """Return the averaged ROUGE-1, ROUGE-2 and ROUGE-L F-scores as a 3-tuple."""
    averaged = Rouge().get_scores(predicted, target, avg=True)
    return tuple(averaged[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l'))

def compute_bleu(predicted, target):
    """Return the mean sentence-level BLEU-4 of ``predicted`` against ``target``.

    Args:
        predicted: Sequence of hypothesis sentences (strings).
        target: Sequence of reference sentences (strings), aligned with
            ``predicted``.

    Returns:
        The mean of the per-sentence BLEU scores, with uniform 1- to 4-gram
        weights.
    """
    weights = (0.25, 0.25, 0.25, 0.25)
    bleu_scores = []
    for position, hypothesis in enumerate(predicted):
        reference = target[position]
        # sentence_bleu expects token lists; passing raw strings would make it
        # count character n-grams instead of word n-grams.
        bleu_scores.append(
            sentence_bleu([reference.split()], hypothesis.split(), weights=weights))
    return np.mean(bleu_scores)

def compute_sari(sources, predicted, target):
    """Return the 'sari' score of the ``evaluate`` SARI metric.

    Every target sentence is wrapped in its own single-reference list.
    """
    metric = load("sari")
    wrapped_references = [[reference] for reference in target]
    result = metric.compute(sources=sources, predictions=predicted, references=wrapped_references)
    return result['sari']

# Sanity check of compute_sari: the predictions copy the sources verbatim and
# differ from the references only in the last word.
sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
predictions=["About 95 species are currently accepted.","About 95 species are currently accepted."]
references=["About 95 species are currently known.","About 95 species are currently known."]

compute_sari(sources,predictions,references)

59.49786324786325

In [None]:
from evaluation.sari import corpus_sari
from evaluation.bleu import corpus_bleu
from evaluation.fkgl import corpus_fkgl
sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
predictions=["About 95 species are currently accepted.","About 95 species are currently accepted."]
references=["About 95 species are currently known.","About 95 species are currently known."]
corpus_sari(sources,predictions,[references],lowercase=True)

26.16452991452991

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer,model_params):
    """Run one training epoch and return the mean per-batch loss.

    For every batch the model loss is computed, predictions are generated to
    score ROUGE/SARI/BLEU/FKGL, the loss is optionally blended with a lexical
    complexity term, and one optimizer step is taken. Every tenth step a row
    is appended to the global ``training_logger`` table and printed.

    Args:
        epoch: Epoch number, used for logging only.
        tokenizer: Tokenizer used to decode ids; supplies ``pad_token_id``.
        model: Sequence-to-sequence model exposing ``__call__`` and ``generate``.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with "source_ids", "source_mask",
            "target_ids", "target_mask" and "source_text".
        optimizer: Optimizer stepped once per batch.
        model_params: Dict with "MAX_TARGET_TEXT_LENGTH", "NUM_BEAMS",
            "REPETITION_PENALTY", "LENGTH_PENALTY" and "SENTENCE_COMPLEXITY".

    Returns:
        The mean of the per-batch losses, as a tensor detached from the graph.
    """
    model.train()
    total_loss=[]
    for step, data in enumerate(loader, 0):
        y = data["target_ids"].to(device, dtype=torch.long)
        # Decoder inputs are the targets without the last token; labels are the
        # targets shifted left by one.
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # Mark padding positions with -100 (presumably the label value the
        # model's loss ignores - confirm against the model documentation).
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)
        target_mask = data["target_mask"].to(device, dtype=torch.long)[:, 1:]
        source_text = data["source_text"]

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
            decoder_attention_mask=target_mask,
        )
        loss = outputs[0]

        # NOTE(review): generation runs on every step while the model is in
        # train mode; it is used here only to compute the logged metrics and
        # the optional complexity term.
        generated_ids = model.generate(
                  input_ids = ids,
                  attention_mask = mask,
                  max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
                  num_beams=model_params["NUM_BEAMS"],
                  repetition_penalty=model_params["REPETITION_PENALTY"],
                  length_penalty=model_params["LENGTH_PENALTY"],
                  early_stopping=True,
                  do_sample=False,
                  temperature=0.25,
                  top_k=120,
                  top_p=0.98,
                  )

        preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
        target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
        rouge1,rouge2,rougeL = calculate_rouge(preds, target)

        sari_score=corpus_sari(source_text,preds, [target],lowercase=False)
        bleu_score=corpus_bleu(preds, [target],lowercase=False)
        fkgl_score=corpus_fkgl(preds)

        if model_params["SENTENCE_COMPLEXITY"]:
            complexity_score = get_lexical_complexity_score_batch(target,preds)#higher is better
            # Blend the model loss with (1 - complexity score); the second term
            # is a plain number and contributes no gradient of its own.
            lambda_ = 0.7
            loss = lambda_ * loss + (1-lambda_)*(1-complexity_score)

        if step % 10 == 0:
            training_logger.add_row(str(epoch), str(step), str(loss),str(rouge1),str(rouge2),str(rougeL),str(sari_score),str(bleu_score),str(fkgl_score))
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Store a detached copy: keeping the live loss tensor would hold each
        # batch's autograd graph in memory until the end of the epoch.
        total_loss.append(loss.detach())
    average_loss=sum(total_loss)/len(total_loss)

    return average_loss

In [None]:
def validate(tokenizer, model, device, loader,model_params):
    """Evaluate the model on ``loader`` without gradient tracking.

    For every batch the loss is computed, predictions are generated, and
    ROUGE/SARI/BLEU/FKGL are scored on that batch; the per-batch values are
    then averaged.

    Returns:
        A tuple ``(sources, predictions, actuals, average_loss,
        average_rouge1, average_rouge2, average_rougeL, average_bleu,
        average_sari, average_fkgl)``.
    """
    def _mean(values):
        # Plain arithmetic mean of the per-batch values.
        return sum(values) / len(values)

    model.eval()
    sources, predictions, actuals = [], [], []
    batch_losses = []
    rouge1_values, rouge2_values, rougeL_values = [], [], []
    bleu_values, sari_values, fkgl_values = [], [], []

    with torch.no_grad():
        for batch in loader:
            targets = batch["target_ids"].to(device, dtype=torch.long)
            # Decoder inputs drop the last target token; labels are the targets
            # shifted left by one, with padding positions set to -100.
            decoder_inputs = targets[:, :-1].contiguous()
            labels = targets[:, 1:].clone().detach()
            labels[targets[:, 1:] == tokenizer.pad_token_id] = -100
            input_ids = batch["source_ids"].to(device, dtype=torch.long)
            input_mask = batch["source_mask"].to(device, dtype=torch.long)
            decoder_mask = batch["target_mask"].to(device, dtype=torch.long)[:, 1:]
            batch_sources = batch["source_text"]

            model_outputs = model(
                input_ids=input_ids,
                attention_mask=input_mask,
                decoder_input_ids=decoder_inputs,
                labels=labels,
                decoder_attention_mask=decoder_mask,
            )
            batch_loss = model_outputs[0]

            generated = model.generate(
                input_ids=input_ids,
                attention_mask=input_mask,
                max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
                num_beams=model_params["NUM_BEAMS"],
                repetition_penalty=model_params["REPETITION_PENALTY"],
                length_penalty=model_params["LENGTH_PENALTY"],
                early_stopping=True,
                do_sample=False,
                temperature=0.25,
                top_k=120,
                top_p=0.98,
            )
            batch_preds = [
                tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for seq in generated
            ]
            batch_targets = [
                tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for seq in targets
            ]
            rouge1, rouge2, rougeL = calculate_rouge(batch_preds, batch_targets)
            sari_score = corpus_sari(batch_sources, batch_preds, [batch_targets], lowercase=False)
            bleu_score = corpus_bleu(batch_preds, [batch_targets], lowercase=False)
            fkgl_score = corpus_fkgl(batch_preds)

            sources.extend(batch_sources)
            predictions.extend(batch_preds)
            actuals.extend(batch_targets)

            batch_losses.append(batch_loss)
            rouge1_values.append(rouge1)
            rouge2_values.append(rouge2)
            rougeL_values.append(rougeL)
            sari_values.append(sari_score)
            bleu_values.append(bleu_score)
            fkgl_values.append(fkgl_score)

    average_loss = _mean(batch_losses)
    average_rouge1 = _mean(rouge1_values)
    average_rouge2 = _mean(rouge2_values)
    average_rougeL = _mean(rougeL_values)
    average_bleu = _mean(bleu_values)
    average_sari = _mean(sari_values)
    average_fkgl = _mean(fkgl_values)
    return (
        sources, predictions, actuals, average_loss,
        average_rouge1, average_rouge2, average_rougeL,
        average_bleu, average_sari, average_fkgl,
    )

In [None]:
def testing(tokenizer, model, device, loader,model_params):
    """Generate predictions for every batch of ``loader``.

    FKGL is scored per batch and averaged; progress is printed to the shared
    console every tenth batch.

    Returns:
        A tuple ``(predictions, average_fkgl)``.
    """
    model.eval()
    predictions = []
    fkgl_values = []
    with torch.no_grad():
        for batch_index, batch in enumerate(loader, 0):
            input_ids = batch['source_ids'].to(device, dtype = torch.long)
            input_mask = batch['source_mask'].to(device, dtype = torch.long)

            generated = model.generate(
                input_ids=input_ids,
                attention_mask=input_mask,
                max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
                num_beams=model_params["NUM_BEAMS"],
                repetition_penalty=model_params["REPETITION_PENALTY"],
                length_penalty=model_params["LENGTH_PENALTY"],
                early_stopping=True,
                do_sample=False,
                temperature=0.25,
                top_k=120,
                top_p=0.98,
            )
            batch_preds = [
                tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for seq in generated
            ]
            fkgl_values.append(corpus_fkgl(batch_preds))
            if batch_index % 10 == 0:
                console.print(f'Completed {batch_index}')

            predictions.extend(batch_preds)
    average_fkgl = sum(fkgl_values) / len(fkgl_values)
    return predictions, average_fkgl

## ModelTrainer and ModelTest

In [None]:
def ModelTrainer(
    dataframe, source_text, target_text, model,tokenizer,model_params, output_dir
):
    """Fine-tune ``model`` on ``dataframe`` and keep the checkpoint with the best validation SARI.

    Relies on the module-level ``device``, ``console``, ``epoch_training_logger``,
    ``preprocessor``, ``display_df``, ``TrainDataSetClass``, ``train`` and ``validate``.

    Args:
        dataframe: Frame containing at least the ``source_text`` and ``target_text`` columns.
        source_text: Name of the column holding the input sentences.
        target_text: Name of the column holding the reference simplifications.
        model: Model to fine-tune; it is moved to ``device``.
        tokenizer: Tokenizer used to build the datasets; saved next to the model.
        model_params: Mapping with ``SEED``, ``MODEL``, ``CONTROL_TOKENS``,
            ``MAX_SOURCE_TEXT_LENGTH``, ``MAX_TARGET_TEXT_LENGTH``,
            ``TRAIN_BATCH_SIZE``, ``VALID_BATCH_SIZE``, ``LEARNING_RATE`` and
            ``TRAIN_EPOCHS`` (it is also forwarded to ``train``/``validate``).
        output_dir: Directory receiving ``model_files/``, ``valid_predictions.csv``
            and ``logs.txt``; created when missing.

    Returns:
        None. Results are written to ``output_dir`` and reported through ``console``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Seed torch and numpy so sampling and training are repeatable
    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True

    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")
    model = model.to(device)

    console.log(f"[Data]: Reading data...\n")

    # Only the two text columns are needed from here on
    dataframe = dataframe[[source_text, target_text]]

    # 80% of the rows are sampled for training; the remaining rows form the validation set.
    train_size = 0.8
    sampled_rows = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    # The complement must be taken with the ORIGINAL index labels. Resetting the
    # training index first (as was done before) makes drop() remove the first
    # 80% of labels instead of the sampled rows, so validation overlapped training.
    val_dataset = dataframe.drop(sampled_rows.index).reset_index(drop=True)
    train_dataset = sampled_rows.reset_index(drop=True)
    train_dataset[source_text] = "simplify: " + train_dataset[source_text]

    if model_params["CONTROL_TOKENS"]:
        # NOTE(review): the first 38 characters of every validation source are
        # discarded before re-encoding - presumably a fixed-width control-token
        # prefix already present in the data; confirm against how the frame is built.
        val_dataset[source_text] = [
            preprocessor.encode_sentence(sentence[38:]) for sentence in val_dataset[source_text]
        ]
    val_dataset[source_text] = "simplify: " + val_dataset[source_text]

    display_df(train_dataset.head(2))
    display_df(val_dataset.head(2))

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"VALID Dataset: {val_dataset.shape}\n")

    # Wrap both splits in the dataset class consumed by the dataloaders
    training_set = TrainDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = TrainDataSetClass(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Training batches are shuffled; validation keeps the frame order so the
    # saved predictions line up with the sources.
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Optimizer used to update the model weights during fine-tuning
    optimizer = torch.optim.AdamW(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    console.log(f"[Initiating Fine Tuning]...\n")
    # Start below any possible score so the first epoch is always checkpointed
    # and the "saved" messages printed at the end refer to files that exist.
    best_sari = float("-inf")
    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train_loss =train(epoch, tokenizer, model, device, training_loader, optimizer,model_params)
        sources,predictions, actuals, valid_loss, average_rouge1,average_rouge2,average_rougeL,average_bleu,average_sari,average_fkgl = validate(tokenizer, model, device, val_loader,model_params)
        epoch_training_logger.add_row(str(epoch), str(train_loss), str(valid_loss), str(average_rouge1),str(average_rouge2),str(average_rougeL),str(average_sari),str(average_bleu),str(average_fkgl))
        console.print(epoch_training_logger)
        if average_sari>best_sari:
            best_sari = average_sari
            console.log(f"[Saving Model]...\n")
            # Overwrite the checkpoint and validation predictions with this epoch's
            path = os.path.join(output_dir, "model_files")
            model.save_pretrained(path)
            tokenizer.save_pretrained(path)
            final_df = pd.DataFrame({"Original Text": sources,"Generated Text": predictions, "Actual Text": actuals})
            final_df.to_csv(os.path.join(output_dir, "valid_predictions.csv"))

    console.save_text(os.path.join(output_dir, "logs.txt"))
    console.print(
        f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
    )
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'valid_predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

In [None]:
def ModelTest(
   test,size,source_text,model,tokenizer,model_params, output_dir
):
    """Generate predictions for a test frame and save them as ``test_<size>_predictions.csv``.

    Relies on the module-level ``device``, ``console``, ``TestDataSetClass`` and ``testing``.

    Args:
        test: Frame holding the test sources; its index is reset before use.
        size: Label of the test split; used in log messages and in the output file name.
        source_text: Name of the column holding the input sentences.
        model: Model used for generation; it is moved to ``device``.
        tokenizer: Tokenizer used to build the dataset and decode the outputs.
        model_params: Mapping with ``MODEL``, ``MAX_SOURCE_TEXT_LENGTH`` and
            ``TEST_BATCH_SIZE`` (it is also forwarded to ``testing``).
        output_dir: Directory the predictions CSV is written to; created when missing.

    Returns:
        None. Predictions are written to ``output_dir`` and progress is logged.
    """
    # Make sure the destination exists before spending time on generation,
    # matching what ModelTrainer does for its own outputs.
    os.makedirs(output_dir, exist_ok=True)

    console.log(f"""[Model]: Testing {model_params["MODEL"]}...\n""")
    model = model.to(device)

    console.log(f"[Data]: Reading {size} test data...\n")

    test = test.reset_index(drop=True)
    console.print(f"Test {size} Dataset: {test.shape}")

    # Wrap the test frame in the dataset class consumed by the dataloader
    test_set = TestDataSetClass(
        test,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        source_text,
    )

    # No shuffling: predictions must stay in the same order as the input rows
    test_loader = DataLoader(
        test_set,
        batch_size=model_params["TEST_BATCH_SIZE"],
        shuffle=False,
        num_workers=0,
    )

    # Testing loop
    console.log(f"[Initiating Testing]...\n")
    predictions,average_fkgl = testing(tokenizer, model, device, test_loader,model_params)
    console.log(f"[test {size} FKGL scores: {average_fkgl}]\n")
    console.log(f"[Testing Completed.]\n")

    predictions_path = os.path.join(output_dir, f"test_{size}_predictions.csv")
    final_df = pd.DataFrame({"simplified_snt": predictions})
    final_df.to_csv(predictions_path)
    console.print(
            f"""[Test] Generation on Test data saved @ {predictions_path}\n"""
        )

In [None]:
def test(test_set,model,tokenizer,size,output_dir):
    """Prepare the ``source_snt`` column of a copy of ``test_set`` and run ``ModelTest`` on it.

    When the module-level ``model_params["CONTROL_TOKENS"]`` is truthy, every
    source sentence is first passed through ``preprocessor.encode_sentence``.
    The ``"simplify: "`` prefix is then prepended to every source. The caller's
    frame is left untouched because all edits happen on a copy.
    """
    column = "source_snt"
    frame = test_set.copy()

    if model_params["CONTROL_TOKENS"]:
        encoded_sources = []
        for sentence in frame[column]:
            encoded_sources.append(preprocessor.encode_sentence(sentence))
        frame[column] = encoded_sources

    frame[column] = "simplify: " + frame[column]

    # Positional order follows ModelTest(test, size, source_text, model, tokenizer, model_params, output_dir)
    ModelTest(frame, size, column, model, tokenizer, model_params, output_dir)

## Load model

In [None]:
def load_model(output_dir,model_class,tokenizer_class):
    """Load a model and tokenizer from ``<output_dir>/model_files``.

    Args:
        output_dir: Directory that contains the ``model_files`` sub-directory.
        model_class: Class exposing ``from_pretrained``; used to load the model.
        tokenizer_class: Class exposing ``from_pretrained``; used to load the tokenizer.

    Returns:
        ``(model, tokenizer)`` as returned by the two ``from_pretrained`` calls.
    """
    checkpoint_dir = os.path.join(output_dir, "model_files")
    # The tuple is evaluated left to right: model first, then tokenizer
    return (
        model_class.from_pretrained(checkpoint_dir),
        tokenizer_class.from_pretrained(checkpoint_dir),
    )

def simplify(o_text,model,tokenizer):
    """Print the model's simplification of a single sentence.

    Relies on the module-level ``model_params`` and, when
    ``model_params["CONTROL_TOKENS"]`` is truthy, on ``preprocessor``.

    Args:
        o_text: The sentence to simplify.
        model: Model exposing ``generate`` (and a ``device`` attribute).
        tokenizer: Tokenizer used to encode the input and decode the output.

    Returns:
        None. The decoded simplification is printed.
    """
    # Bind `text` unconditionally: it used to be assigned only inside the
    # CONTROL_TOKENS branch, so disabling control tokens raised UnboundLocalError.
    text = o_text
    if model_params["CONTROL_TOKENS"]:
        text = preprocessor.encode_sentence(o_text)

    input_ids = tokenizer.encode("simplify: "+text, return_tensors='pt',
                                 max_length=tokenizer.model_max_length,
                                 truncation=True)
    # Keep the input on the same device as the model so generation also works
    # for a model that has already been moved to an accelerator.
    input_ids = input_ids.to(model.device)

    # NOTE(review): temperature/top_k/top_p are sampling options; with
    # do_sample=False they are presumably ignored by beam search - confirm.
    summary_ids = model.generate(input_ids = input_ids,
                                 max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
                                 num_beams=model_params["NUM_BEAMS"],
                                 repetition_penalty=model_params["REPETITION_PENALTY"],
                                 length_penalty=model_params["LENGTH_PENALTY"],
                                 early_stopping=True,
                                 do_sample=False,
                                 temperature=0.25,
                                 top_k=120,
                                 top_p=0.98,)

    # generate() returns a batch; squeeze() leaves the single sequence to decode
    summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(summary)

## T5 model with control tokens

### T5 Training loop

In [None]:
# Hyper-parameters and switches read by the training, validation and generation code
model_params = dict(
    MODEL="t5-base",
    TRAIN_BATCH_SIZE=8,
    VALID_BATCH_SIZE=8,
    TEST_BATCH_SIZE=8,
    TRAIN_EPOCHS=5,
    LEARNING_RATE=3e-4,
    MAX_SOURCE_TEXT_LENGTH=75,
    MAX_TARGET_TEXT_LENGTH=50,
    SEED=42,
    NUM_BEAMS=8,
    REPETITION_PENALTY=2.5,
    LENGTH_PENALTY=0.75,
    CONTROL_TOKENS=True,
    SENTENCE_COMPLEXITY=True,
)

# Pretrained checkpoint and its tokenizer, both selected by model_params["MODEL"]
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
tokenizer = T5TokenizerFast.from_pretrained(model_params["MODEL"])

# Loggers used for step-level and epoch-level reporting
training_logger = training_logger_init()
epoch_training_logger=epoch_training_logger_init()

# Target ratio given to the Preprocessor for each control-token feature (order preserved)
features_kwargs = {
    feature_name: {'target_ratio': ratio}
    for feature_name, ratio in (
        ('WordRatioFeature', '1.05'),
        ('CharRatioFeature', '0.95'),
        ('LevenshteinRatioFeature', '0.75'),
        ('WordRankRatioFeature', '0.95'),
        ('DependencyTreeDepthRatioFeature', '0.85'),
    )
}

preprocessor = Preprocessor(features_kwargs)

# Arguments follow ModelTrainer(dataframe, source_text, target_text, model, tokenizer, model_params, output_dir)
ModelTrainer(
    new_df,
    "source_snt",
    "simplified_snt",
    model,
    tokenizer,
    model_params,
    "T5_outputs",
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.












### T5 Testing

In [None]:
# Reload the checkpoint saved under T5_outputs/model_files, then generate
# predictions for the 'small' test frame (written as test_small_predictions.csv).
model,tokenizer=load_model("T5_outputs",T5ForConditionalGeneration,T5TokenizerFast)
test(test_s,model,tokenizer,'small',"T5_outputs")




In [None]:
# Load a fresh copy of the saved checkpoint and simplify one long example sentence
model,tokenizer=load_model("T5_outputs",T5ForConditionalGeneration,T5TokenizerFast)
text='This is moving us to a tipping point and to a crossroads: we must decide between a society in which the actions are determined in a top-down way and then implemented by coercion or manipulative technologies (such as personalized ads and nudging) or a society, in which decisions are taken in a free and participatory way and mutually coordinated.'
simplify(text,model,tokenizer)



this is moving us to a tipping point and to a crossroads: we must decide between a society in which the actions are determined in a top-down way and then implemented by coercion or manipulative technologies


In [None]:
# Second example: a shorter sentence, simplified with the model loaded in the previous cell
text='In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research.'
simplify(text,model,tokenizer)

in the modern era of automation and robotics, autonomous vehicles are the focus of academic and industrial research.


In [None]:
# Read the saved test predictions and discard the unnamed column that holds
# the index written out together with the CSV.
columns_to_drop = ['Unnamed: 0']
df_predicted = pd.read_csv(
    "/content/T5_outputs/test_small_predictions.csv"
).drop(columns=columns_to_drop)
df_predicted.head()

Unnamed: 0,simplified_snt
0,one side of the armed conflicts is composed mo...
1,"Jeddah is the principal gateway to Mecca, Isla..."
2,The Great Dark Spot is thought to represent a ...
3,"his next work, Saturday, follows an especially..."
4,"the tarantula, the trickster character, spun a..."


In [None]:
# Source sentences and their reference simplifications for the same test set
df_source_actual = pd.read_csv("/content/asset_test.csv")

In [None]:
# Align predictions with sources/references row by row, joining on the index of both frames
merged_df = df_predicted.merge(
    df_source_actual,
    left_index=True,
    right_index=True,
)

In [None]:
merged_df.head()

Unnamed: 0,simplified_snt,original,simplifications
0,one side of the armed conflicts is composed mo...,One side of the armed conflicts is composed ma...,['On one side of the conflicts are the Sudanes...
1,"Jeddah is the principal gateway to Mecca, Isla...","Jeddah is the principal gateway to Mecca, Isla...",['Muslims are required to visit Mecca once in ...
2,The Great Dark Spot is thought to represent a ...,The Great Dark Spot is thought to represent a ...,['The dark spot on Ne;tune may be a hole in th...
3,"his next work, Saturday, follows an especially...","His next work, Saturday, follows an especially...",['Next Saturday is a presentation of a success...
4,"the tarantula, the trickster character, spun a...","The tarantula, the trickster character, spun a...",['The tarantula spun a black cord and attached...


In [None]:
# Per-row SARI: the row's source, prediction and reference cell are each wrapped in a one-element list.
# NOTE(review): the frame preview shows 'simplifications' as the text of a bracketed
# list read from CSV - confirm compute_sari is meant to receive that raw string as
# its only reference rather than the individual reference sentences.
merged_df['sari'] = merged_df.apply(lambda row: compute_sari([row['original']], [row['simplified_snt']], [row['simplifications']]), axis=1)

In [None]:
overall_sari = merged_df['sari'].mean()  # mean of the per-row SARI scores computed above

print("Overall SARI Score:", overall_sari)

Overall SARI Score: 43.76297745858551


In [None]:
!pip install textstat

[0m

In [None]:
import textstat
def compute_fkgl(text):
    """Return ``textstat.flesch_kincaid_grade`` for ``text``."""
    return textstat.flesch_kincaid_grade(text)
# Score every generated sentence on its own
merged_df['fkgl'] = merged_df['simplified_snt'].apply(compute_fkgl)

In [None]:
# Mean of the per-sentence FKGL values (not a grade computed over the whole corpus at once)
overall_fkgl = merged_df['fkgl'].mean()

print("Overall FKGL Score:", overall_fkgl)

Overall FKGL Score: 8.756824512534818


In [None]:
# Per-row BLEU: the row's prediction and reference cell are each wrapped in a one-element list.
# NOTE(review): the frame preview shows 'simplifications' as the text of a bracketed
# list read from CSV - confirm compute_bleu is meant to receive that raw string as
# its only reference rather than the individual reference sentences.
merged_df['bleu'] = merged_df.apply(lambda row: compute_bleu([row['simplified_snt']], [row['simplifications']]), axis=1)

In [None]:
# Persist the merged frame, including the per-row metric columns added above
merged_df.to_csv("Final_Result.csv")

In [None]:
references= ['Quantitative measures are used to determine the ingredients needed for biological processes.', 'This unit shows how much of a drug is needed to slow a biological process by half of its speed.']
predictions = ['This quantitative measure indicates how much of a particular drug or other substance is needed to inhibit a given biological process (i.e. an enzyme, cell, cell receptor or other component of a process).','This quantitative measure indicates how much of a particular drug or other substance is needed to inhibit a given biological process (i.e. an enzyme, cell, cell receptor or other component of a process).']
sources = ['This quantitative measure indicates how much of a particular drug or other substance (inhibitor) is needed to inhibit a given biological process (or component of a process, i.e. an enzyme, cell, cell receptor or microorganism) by half.','This quantitative measure indicates how much of a particular drug or other substance (inhibitor) is needed to inhibit a given biological process (or component of a process, i.e. an enzyme, cell, cell receptor or microorganism) by half.']

In [None]:
compute_sari(sources,predictions,references)

37.28732644691455

In [None]:
compute_bleu(predictions,references)

0.3060465567333454

In [None]:
text = "This quantitative measure indicates how much of a particular drug or other substance is needed to inhibit a given biological process (i.e. an enzyme, cell, cell receptor or other component of a process)."

In [None]:
textstat.flesch_kincaid_grade(text)

12.1

In [None]:
textstat.dale_chall_readability_score(text)

10.68

In [None]:
SOURCES = ["This quantitative measure indicates how much of a particular drug or other substance (inhibitor) is needed to inhibit a given biological process (or component of a process, i.e. an enzyme, cell, cell receptor or microorganism) by half."]
PRED = ["This quantitative measure indicates how much of a particular drug or other substance is needed to inhibit a given biological process (i.e. an enzyme, cell, cell receptor or other component of a process)."]
REF = ['Quantitative measures are used to determine the ingredients needed for biological processes.', 'This unit shows how much of a drug is needed to slow a biological process by half of its speed.', 'This quantitative measure shows how much of a drug or other substance is needed to inhibit a biological process by half.', 'This number measure tells how much of a drug or chemical is needed to stop a life process or part of a process by half.', 'This measure shows how much of a drug or substance is needed to slow a biological process by half.', 'This quantitative measure shows how much of a particular drug or other substance is needed to block a specific biological process by half.', 'This measure shows how much of a drug or substance is needed to halt a given process by half.', 'This quantitative measure indicates how much of a particular drug or other substance (inhibitor) is needed to inhibit a given biological process  by half.', 'This measurement will indicate how much of a particular drug or substance is needed to hold back a biological process by half.', 'It tells how much of a drug would be needed to stop a process by half.']

In [None]:
# NOTE(review): PRED holds one sentence while REF is a flat list of ten reference
# strings - confirm this is the reference layout corpus_bleu expects.
corpus_bleu(PRED,REF,lowercase=True)

0.8954307276600084

In [None]:
# NOTE(review): here REF is wrapped as [REF] (one list of ten strings) for a single
# source/prediction pair, unlike the corpus_bleu call which passes REF directly -
# confirm the nesting corpus_sari expects for multiple references per sentence.
corpus_sari(SOURCES,PRED,[REF],lowercase=True)

17.326333983414155