In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from datetime import datetime
import time
import os
import warnings
warnings.filterwarnings("ignore")

from gensim.summarization import summarize, summarize_corpus
import nltk
nltk.download('punkt')
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

from rouge import Rouge
from bert_score import score
from sentence_transformers import SentenceTransformer


######################
# METADATA
######################
input_folder = r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\TextSum\inputs\source & target document'
output_folder = r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\TextSum\outputs'

######################
# READ IN DATA
######################
input_file = 'source_target_df.xlsx'
path = os.path.join(input_folder, input_file)

st_df=pd.read_excel(path, engine='openpyxl')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A4023862\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A4023862\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
######################
# ROUGE-3 METRIC
######################

from __future__ import absolute_import
import six
import rouge.rouge_score as rouge_score
import io
import os


class FilesRouge:
    def __init__(self, *args, **kwargs):
        """See the `Rouge` class for args
        """
        self.rouge = Rouge(*args, **kwargs)

    def _check_files(self, hyp_path, ref_path):
        assert(os.path.isfile(hyp_path))
        assert(os.path.isfile(ref_path))

        def line_count(path):
            count = 0
            with open(path, "rb") as f:
                for line in f:
                    count += 1
            return count

        hyp_lc = line_count(hyp_path)
        ref_lc = line_count(ref_path)
        assert(hyp_lc == ref_lc)

    def get_scores(self, hyp_path, ref_path, avg=False, ignore_empty=False):
        """Calculate ROUGE scores between each pair of
        lines (hyp_file[i], ref_file[i]).
        Args:
          * hyp_path: hypothesis file path
          * ref_path: references file path
          * avg (False): whether to get an average scores or a list
        """
        self._check_files(hyp_path, ref_path)

        with io.open(hyp_path, encoding="utf-8", mode="r") as hyp_file:
            hyps = [line[:-1] for line in hyp_file]

        with io.open(ref_path, encoding="utf-8", mode="r") as ref_file:
            refs = [line[:-1] for line in ref_file]

        return self.rouge.get_scores(hyps, refs, avg=avg,
                                     ignore_empty=ignore_empty)


class Rouge:
    DEFAULT_METRICS = ["rouge-1", "rouge-2", "rouge-3"]
    AVAILABLE_METRICS = {
        "rouge-1": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, 1, **k),
        "rouge-2": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, 2, **k),
        "rouge-3": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, 3, **k),
        "rouge-4": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, 4, **k),
        "rouge-5": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, 5, **k),
        "rouge-l": lambda hyp, ref, **k:
            rouge_score.rouge_l_summary_level(hyp, ref, **k),
    }
    DEFAULT_STATS = ["r", "p", "f"]
    AVAILABLE_STATS = ["r", "p", "f"]

    def __init__(self, metrics=None, stats=None, return_lengths=False,
                 raw_results=False, exclusive=True):
        self.return_lengths = return_lengths
        self.raw_results = raw_results
        self.exclusive = exclusive

        if metrics is not None:
            self.metrics = [m.lower() for m in metrics]

            for m in self.metrics:
                if m not in Rouge.AVAILABLE_METRICS:
                    raise ValueError("Unknown metric '%s'" % m)
        else:
            self.metrics = Rouge.DEFAULT_METRICS

        if self.raw_results:
            self.stats = ["hyp", "ref", "overlap"]
        else:
            if stats is not None:
                self.stats = [s.lower() for s in stats]

                for s in self.stats:
                    if s not in Rouge.AVAILABLE_STATS:
                        raise ValueError("Unknown stat '%s'" % s)
            else:
                self.stats = Rouge.DEFAULT_STATS

    def get_scores(self, hyps, refs, avg=False, ignore_empty=False):
        if isinstance(hyps, six.string_types):
            hyps, refs = [hyps], [refs]

        if ignore_empty:
            # Filter out hyps of 0 length
            hyps_and_refs = zip(hyps, refs)
            hyps_and_refs = [_ for _ in hyps_and_refs
                             if len(_[0]) > 0
                             and len(_[1]) > 0]
            hyps, refs = zip(*hyps_and_refs)

        assert(isinstance(hyps, type(refs)))
        assert(len(hyps) == len(refs))

        if not avg:
            return self._get_scores(hyps, refs)
        return self._get_avg_scores(hyps, refs)

    def _get_scores(self, hyps, refs):
        scores = []
        for hyp, ref in zip(hyps, refs):
            sen_score = {}

            hyp = [" ".join(_.split()) for _ in hyp.split(".") if len(_) > 0]
            ref = [" ".join(_.split()) for _ in ref.split(".") if len(_) > 0]

            for m in self.metrics:
                fn = Rouge.AVAILABLE_METRICS[m]
                sc = fn(
                    hyp,
                    ref,
                    raw_results=self.raw_results,
                    exclusive=self.exclusive)
                sen_score[m] = {s: sc[s] for s in self.stats}

            if self.return_lengths:
                lengths = {
                    "hyp": len(" ".join(hyp).split()),
                    "ref": len(" ".join(ref).split())
                }
                sen_score["lengths"] = lengths
            scores.append(sen_score)
        return scores

    def _get_avg_scores(self, hyps, refs):
        scores = {m: {s: 0 for s in self.stats} for m in self.metrics}
        if self.return_lengths:
            scores["lengths"] = {"hyp": 0, "ref": 0}

        count = 0
        for (hyp, ref) in zip(hyps, refs):
            hyp = [" ".join(_.split()) for _ in hyp.split(".") if len(_) > 0]
            ref = [" ".join(_.split()) for _ in ref.split(".") if len(_) > 0]

            for m in self.metrics:
                fn = Rouge.AVAILABLE_METRICS[m]
                sc = fn(hyp, ref, exclusive=self.exclusive)
                scores[m] = {s: scores[m][s] + sc[s] for s in self.stats}

            if self.return_lengths:
                scores["lengths"]["hyp"] += len(" ".join(hyp).split())
                scores["lengths"]["ref"] += len(" ".join(ref).split())

            count += 1
        avg_scores = {
            m: {s: scores[m][s] / count for s in self.stats}
            for m in self.metrics
        }

        if self.return_lengths:
            avg_scores["lengths"] = {
                k: scores["lengths"][k] / count
                for k in ["hyp", "ref"]
            }

        return avg_scores

In [5]:
######################
# USE - UNIVERSAL SENTENCE ENCODER
######################
# This model is used to vectorize the summaries for comparison
model_wd = r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\SOP\DocSim'
input_file = 'models/universal-sentence-encoder_4'
path = os.path.join(model_wd, input_file)
use_model = hub.load(path)

# Set model to use
emb_model = use_model

In [6]:
########################
# EXTRACTIVE - GENSIM
########################
source = st_df['source'][0]
target = st_df['target'][0]

# Get number of sentences in target and source doc
sentn_target = len(sent_tokenize(target))
sentn_source = len(sent_tokenize(source))

# Summarize
sum1=summarize(source, ratio=(sentn_target/sentn_source))

# Get embeddings for target document
base_embeddings = emb_model([target])

# Get embeddings for predicted document
embeddings = emb_model([sum1])

# Calculate cosine similarity
scores = cosine_similarity(base_embeddings, embeddings).flatten()

# Calculate ROUGE score
rouge_scores = Rouge().get_scores(sum1, target)

print(sum1)
print(scores)
print(rouge_scores)

Project 1 was administered orally once daily for 4 weeks at dose levels of 0 (vehicle), 10, 30, 100, and 800 mg/kg to 3 male and 3 female cynomolgus monkeys per group in order to investigate its toxicity.
Three males and three females were added to the 800 mg/kg group in order to assess the reversibility of toxicity observed during the dosing period in a subsequent 4-week recovery period.
In the 800 mg/kg group, vomiting was observed in males and females on 1 to 5 days mainly at Weeks 1 and 2 of dosing.
Salivation was observed in males and females immediately after dosing during the dosing period.
Decreased body weight was noted in males and females during the dosing period.
Low erythrocyte count, hematocrit value, and hemoglobin concentration were noted in 1 male and 1 female on Day 27 of dosing.
High relative liver weight in males and low absolute and relative adrenal weights in 1 female were noted at the end of the dosing period.
AUC0-24h increased with dose in both males and female

In [7]:
######################
# CHRF
######################
from sacrebleu.metrics import BLEU, CHRF, TER

pred = sum1
target = [st_df['target'][0]]

chrf = CHRF()
results = chrf.corpus_score(pred, target)
# results = chrf.compute(predictions=prediction, references=reference)

print(results)

chrF2 = 10.56


In [4]:
########################
# EXTRACTIVE - SUMY: LexRank, Luhn, LSA, TextRank
########################
# Summarizer types
# summarizer = LexRankSummarizer()
# summarizer = LuhnSummarizer()
# summarizer = LsaSummarizer()
summarizer = TextRankSummarizer()

# Data
source = st_df['source'][0]
target = st_df['target'][0]

# Get number of sentences in target doc
number_of_sentences = len(sent_tokenize(target))

# Parser
parser = PlaintextParser.from_string(source, Tokenizer("english"))

# Create summary
sum2 = summarizer(parser.document, number_of_sentences) 

# Combine sentences into one summary
sum2a = ''
for sentence in sum2:
    sum2a=sum2a + str(sentence)

# Get embeddings for target document
base_embeddings = emb_model([target])

# Get embeddings for predicted document
embeddings = emb_model([sum2a])

# Calculate cosine similarity
scores = cosine_similarity(base_embeddings, embeddings).flatten()

# Calculate ROUGE score
rouge_scores = ROUGE.get_scores(sum2a, target)

print(sum2a)
print(scores)
print(rouge_scores)

Three males and three females were added to the 800 mg/kg group in order to assess the reversibility of toxicity observed during the dosing period in a subsequent 4-week recovery period.In the 10, 30, and 100 mg/kg groups, no toxicologically significant changes were noted in any examination during the dosing period.In the 800 mg/kg group, vomiting was observed in males and females on 1 to 5 days mainly at Weeks 1 and 2 of dosing.Salivation was observed in males and females immediately after dosing during the dosing period.Decreased body weight was noted in males and females during the dosing period.High triglycerides were noted in males and females on Days 14 and/or 27 of dosing, and high glucose was noted in 1 male on Days 14 and 27 of dosing.High relative liver weight in males and low absolute and relative adrenal weights in 1 female were noted at the end of the dosing period.The degree of the increase was almost dose proportional in males and females between 10 and 100 mg/kg, and wa

In [8]:
########################
# TEST AREA
########################
# Calculate ROUGE score
test1='You are smelly'
test2='The odor you have is bad'
test2='You are smelly'

# ROUGE
rouge_scores = Rouge().get_scores(test1, test2)

# CHRF
chrf_scores = chrf.corpus_score(test1, test2)

# USE
model = use_model
base_embeddings = model([test1])
embeddings = model([test2])
scores = cosine_similarity(base_embeddings, embeddings).flatten()

print(rouge_scores)
print(chrf_scores)
print(scores)

[{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}, 'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}, 'rouge-3': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}}]
chrF2 = 100.00
[1.]


In [7]:
########################
# ABSTRACTIVE - T5 Small
########################
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

input_file = 'models/t5-small'
path = os.path.join(model_wd, input_file)

tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSeq2SeqLM.from_pretrained(path)

# Data
source = st_df['source'][0]
target = st_df['target'][0]

inputs = tokenizer.encode("summarize: " + source, return_tensors='pt', max_length=512, truncation=True)
summary_ids = model.generate(inputs, max_length=1000, min_length=500, length_penalty=5., num_beams=1)
sum3 = tokenizer.decode(summary_ids[0])

# Get embeddings for target document
base_embeddings = emb_model([target])

# Get embeddings for predicted document
embeddings = emb_model([sum3])

# Calculate cosine similarity
scores = cosine_similarity(base_embeddings, embeddings).flatten()

print(len(sum3))
print(sum3)
print(scores)

NOTE: Redirects are currently not supported in Windows or MacOs.


675
<pad> project 1 was administered orally for 4 weeks at dose levels of 0 (vehicle), 10, 30, 100, and 800 mg/kg to 3 male and 3 female cynomolgus monkeys per group. three males and three females were added to the 800 mg/kg group in order to assess its reversibility. no animal died or was sacrificed due to moribundity in any group during the dosing or recovery period. a methylcellulose solution was administered to animals........................... three.............. no animal died or was sacrificed...................................................... no animal died or was.............................................................................................</s>
[0.58850896]


In [8]:
########################
# ABSTRACTIVE - Pegasus
########################
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

input_file = 'models/pegasus-xsum'
path = os.path.join(model_wd, input_file)

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(path)
model = PegasusForConditionalGeneration.from_pretrained(path).to(torch_device)

# Data
source = st_df['source'][0]
target = st_df['target'][0]

batch = tokenizer.prepare_seq2seq_batch(source, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
sum4 = tgt_text[0]

# Get embeddings for target document
base_embeddings = emb_model([target])

# Get embeddings for predicted document
embeddings = emb_model([sum3])

# Calculate cosine similarity
scores = cosine_similarity(base_embeddings, embeddings).flatten()

print(len(sum4))
print(sum4)
print(scores)

91
The no-observed-ad toxicity of methylcellulose in cynomolgus monkeys has been investigated.
[0.58850896]


In [24]:
########################
# ABSTRACTIVE - ChatGPT
########################

source = st_df['source'][0]
target = st_df['target'][0]

# Summarize this in 9 complete sentences and keep all numeric values:
sum_cgpt1 = 'A 4-week oral toxicity study was conducted on cynomolgus monkeys to investigate the effects of Project 1 at dose levels of 0, 10, 30, 100, and 800 mg/kg, with three males and three females per group. Three males and three females were added to the 800 mg/kg group for a 4-week recovery period to assess the reversibility of toxicity. No animal died or was sacrificed due to moribundity in any group during the dosing or recovery period. In the 10, 30, and 100 mg/kg groups, no significant changes were observed. In the 800 mg/kg group, vomiting, salivation, decreased body weight, low erythrocyte count, hematocrit value, and hemoglobin concentration, high triglycerides, glucose, and relative liver weight in males, and low absolute and relative adrenal weights in one female were noted. Toxicokinetics data showed that Tmax was between 0.5 and 4 hours after dosing, and Cmax and AUC0-24h increased with dose in both males and females. No apparent gender difference was observed in any TK parameter. The no-observed-adverse-effect level of Project 1 was determined to be 100 mg/kg/day for males and females, and the changes observed during the dosing period recovered during the 4-week recovery period.'

# Get embeddings for target document
base_embeddings = emb_model([target])

# Get embeddings for predicted document
embeddings = emb_model([sum_cgpt1])

# Calculate cosine similarity
scores = cosine_similarity(base_embeddings, embeddings).flatten()

# Calculate ROUGE score
rouge_scores = Rouge().get_scores(sum_cgpt1, target)

print(scores)
print(rouge_scores)

[0.85408497]
[{'rouge-1': {'r': 0.6333333333333333, 'p': 0.49137931034482757, 'f': 0.5533980533320766}, 'rouge-2': {'r': 0.31690140845070425, 'p': 0.2571428571428571, 'f': 0.2839116669784753}, 'rouge-3': {'r': 0.20754716981132076, 'p': 0.17647058823529413, 'f': 0.19075144011944953}}]


In [12]:
######################
# BERT MODEL
######################
model_wd = r'C:\Users\A4023862\OneDrive - Astellas Pharma Inc\SOP\DocSim'
# input_file = 'models/all-MiniLM-L6-v2'
input_file = 'models/sci-bert'
path = os.path.join(model_wd, input_file)
bert_model = SentenceTransformer(path)

NOTE: Redirects are currently not supported in Windows or MacOs.


In [35]:
# Data
source = st_df['source'][0]
target = st_df['target'][0]

# Tokenize
sentences = sent_tokenize(source)
sentences = sent_tokenize(target)
sentences = sent_tokenize(sum_cgpt1)
sentences = sent_tokenize(sum1)

# Embeddings
base_embeddings_sentences = bert_model.encode(sentences1)
tar_bert_emb = np.mean(np.array(base_embeddings_sentences), axis=0)

base_embeddings_sentences = bert_model.encode(sentences2)
sum_bert_emb = np.mean(np.array(base_embeddings_sentences), axis=0)

# Similarity
scores = cosine_similarity([tar_bert_emb], [sum_bert_emb]).flatten()
scores

array([0.9900106], dtype=float32)

In [38]:
from bert_score import score

# When you are running this cell for the first time, 
# it will download the BERT model which will take relatively longer. 
P, R, F1 = score([source], [target], lang="en", verbose=True)

Could not locate the tokenizer configuration file, will try to use the model config instead.


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

loading configuration file config.json from cache at C:\Users\A4023862/.cache\huggingface\hub\models--roberta-large\snapshots\5069d8a2a32a7df4c69ef9b56348be04152a2341\config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

loading file vocab.json from cache at C:\Users\A4023862/.cache\huggingface\hub\models--roberta-large\snapshots\5069d8a2a32a7df4c69ef9b56348be04152a2341\vocab.json
loading file merges.txt from cache at C:\Users\A4023862/.cache\huggingface\hub\models--roberta-large\snapshots\5069d8a2a32a7df4c69ef9b56348be04152a2341\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\A4023862/.cache\huggingface\hub\models--roberta-large\snapshots\5069d8a2a32a7df4c69ef9b56348be04152a2341\config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  