Install Libraries

In [None]:
!pip install google-generativeai langchain rouge bert-score language_tool_python torch py-readability-metrics tiktoken google-cloud-aiplatform

Collecting langchain
  Using cached langchain-0.0.352-py3-none-any.whl (794 kB)
Collecting rouge
  Using cached rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Collecting language_tool_python
  Using cached language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Collecting py-readability-metrics
  Using cached py_readability_metrics-1.4.5-py3-none-any.whl (26 kB)
Collecting tiktoken
  Using cached tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Using cached dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.2 (from langchain)
  Using cached langchain_community-0.0.6-py3-none-any.whl (1.5 MB)
Collecting langchain-core<0.2,>=0.1 (from langchain)
  Using cached langchain_core-0.1.3-py3-non

Helper Functions

In [None]:
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from transformers import BertTokenizer, BertForMaskedLM
import language_tool_python
import torch
from bert_score import score
from readability import Readability

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
import time

# rouge-1 recall -> are all the words in the reference text in the generated one?
# rouge-1 precision -> how much of the generated summary is relevant?
# rouge-1 f1 -> balance between the 2 metrics

# rouge-2 -> uses bigrams instead of unigrams
# rouge-l -> treats each summary as a sequence of words then looks for the longest common subsequence (same relative order but not necessarily contiguous, eg. there are other words in the middle) -> has the advantage of not depending on consecutive ngrams matches -> captures sentence structure more accurately
# rouge-lsum -> computed over whole summary vs avg of indiv sentences

class SummarizationMetrics:
    """
    This class implements the following metrics for evaluating text summarization models:
    1. ROUGE
    2. BLEU
    3. BERTScore
    4. Readability Index
    5. Grammar Check

    The class takes in two arguments:
    1. reference: The reference summary
    2. generated: The generated summary

    Only 4 and 5 can be generated if the reference summary is not provided.
    1, 2 and 3 require both the reference and generated summaries.

    A list can also be passed in when generating only BERTScore.
    """

    def __init__(self, reference, generated):
        self.reference = reference
        self.generated = generated

    def rouge_scores(self):
        rouge = Rouge()
        scores = rouge.get_scores(self.generated, self.reference)
        return scores

    def bleu_score(self):
        reference = [nltk.word_tokenize(self.reference)]
        generated = nltk.word_tokenize(self.generated)
        score = sentence_bleu(reference, generated)
        return score

    def bert_score(self):
        # check if they are both are lists
        if isinstance(self.reference, list) and isinstance(self.generated, list):
            P, R, F1 = score(self.generated, self.reference, lang='en', verbose=True)
        else:
            P, R, F1 = score([self.generated], [self.reference], lang='en', verbose=True)
        return P, R, F1


    def readability_index(self):
        try:
            if len(self.generated) >= 100:
                r = Readability(self.generated)
                return r.flesch_kincaid()
            else:
                return "100 words required."
        except:
            return "100 words required."

    def grammar_check(self):
        tool = language_tool_python.LanguageTool('en-US')

        # return counts of errors
        return tool.check(self.generated)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from google.colab import auth as google_auth

google_auth.authenticate_user()

In [None]:
PROJECT_ID = "nyctextsummarizer"
LOCATION = "us-central1"

import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [31]:
import pandas as pd

gcp_models = {

    # Models selected
    "models": [
        "gemini-pro",
        "chat-bison@001",
        "text-bison@001"
    ],

    # Input Token
    "max_tokens": [
        30720,
        4096,
        8196,

    ]
}

df_gcp_models = pd.DataFrame(gcp_models)
df_gcp_models

Unnamed: 0,models,max_tokens
0,chat-bison@001,4096
1,text-bison@001,8196


In [None]:
from google.cloud import aiplatform
from langchain.chat_models import ChatVertexAI
from langchain.llms import VertexAI

# Standard LLM completion model
# llm = VertexAI(
#     model_name="text-bison@001",
#     temperature=0,
#     verbose=True,
# )

# Chat
# chat = ChatVertexAI(model='chat-bison@001',
#                     temperature=0,
#                     verbose=True)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
df_test = pd.read_excel("/content/drive/MyDrive/tib_test.xlsx")
df_test = df_test.head(5)
df_test

Unnamed: 0,summary,transcript
0,A firsthand look at efforts to improve diversi...,All right. So our next talk is called Hacking...
1,It is certainly a time of discovery- though th...,"Welcome, DEF CON 28, the Do No Harm panel. Th..."
2,Roman Architecture (HSAR 252) Professor Kleine...,Good morning. As you can see from the title o...
3,Stochastic rewriting systems evolving over gra...,"Thank you very much, first important question..."
4,"In typical military operations, the advantage ...",I was great to be with all of you today. I sa...


In [None]:
df_scores = pd.DataFrame(columns=['model', 'method', 'max_tokens', 'num_tokens' ,'transcript', 'original summary' ,'summary', 'grammar', 'readability', 'rouge', 'bert_score', 'time_taken', 'prompt', 'temperature'])
df_scores

Unnamed: 0,model,method,max_tokens,num_tokens,transcript,original summary,summary,grammar,readability,rouge,bert_score,time_taken,prompt,temperature


In [35]:
for model_index, model_row in df_gcp_models.iterrows():
    model_name = model_row["models"]
    print(model_name)

    temperature = 0

    if "chat" in model_name.lower():
      llm = ChatVertexAI(temperature=temperature, model_name=model_name, verbose=True)
    else:
      llm = VertexAI(temperature=temperature, model_name=model_name, verbose=True)

    prompt_template = "Write a concise summary of the following: {text}"""
    PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])

    for index, row in df_test.iterrows():
        method = "MapReduce"

        # get the summary
        start_time = time.time()
        num_tokens = llm.get_num_tokens(row['transcript'])
        print("Number of tokens:", num_tokens)

        max_tokens = model_row["max_tokens"]
        print(max_tokens)
        # break
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=max_tokens-100, chunk_overlap=100)
        docs = text_splitter.create_documents([row['transcript']])
        print("Number of chunks:", len(docs))

        # break
        summary_chain = load_summarize_chain(llm=llm, chain_type='map_reduce', token_max=max_tokens , map_prompt=PROMPT)
        summary = summary_chain.run(docs)

        end_time = time.time()
        elapsed_time = end_time - start_time

        metrics = SummarizationMetrics(row['summary'], summary)

        new_result = {
            'model': model_name,
            'method': method,
            'max_tokens': max_tokens,
            'transcript': row['transcript'],
            'original summary': row['summary'],
            'summary': summary,
            'rouge': metrics.rouge_scores(),
            'bert_score': metrics.bert_score(),
            'bleu': metrics.bleu_score(),
            'time_taken': elapsed_time,
            'grammar': metrics.grammar_check(),
            'readability': metrics.readability_index(),
            'num_tokens': num_tokens,
            'prompt': prompt_template,
            'temperature': temperature
        }


        new_row = pd.DataFrame([new_result])

        df_scores = pd.concat([df_scores, new_row], ignore_index=True)


chat-bison@001


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (6786 > 1024). Running this sequence through the model will result in indexing errors


Number of tokens: 6786
4096
Number of chunks: 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.19 seconds, 5.18 sentences/sec


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Number of tokens: 6971
4096
Number of chunks: 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.16 seconds, 6.07 sentences/sec


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Number of tokens: 7640
4096
Number of chunks: 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.20 seconds, 4.90 sentences/sec
Number of tokens: 6929
4096
Number of chunks: 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.28 seconds, 3.62 sentences/sec


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Number of tokens: 7052
4096
Number of chunks: 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.19 seconds, 5.23 sentences/sec


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


text-bison@001
Number of tokens: 6963
8196
Number of chunks: 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.21 seconds, 4.74 sentences/sec


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Number of tokens: 7202
8196
Number of chunks: 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.21 seconds, 4.71 sentences/sec


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Number of tokens: 7658
8196
Number of chunks: 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.26 seconds, 3.89 sentences/sec


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Number of tokens: 6868
8196
Number of chunks: 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.32 seconds, 3.15 sentences/sec


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Number of tokens: 7224
8196
Number of chunks: 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.24 seconds, 4.20 sentences/sec


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [36]:
df_scores

Unnamed: 0,model,method,max_tokens,num_tokens,transcript,original summary,summary,grammar,readability,rouge,bert_score,time_taken,prompt,temperature,bleu
0,gemini-pro,MapReduce,30720,6983,All right. So our next talk is called Hacking...,A firsthand look at efforts to improve diversi...,"Professor Christina Tamba-Hester's talk, ""Hack...","[Offset 20, length 12, Rule ID: MORFOLOGIK_RUL...",100 words required.,"[{'rouge-1': {'r': 0.09523809523809523, 'p': 0...","([tensor(0.8677)], [tensor(0.8245)], [tensor(0...",6.114026,Write a concise summary of the following: {text},0,1.069393e-79
1,gemini-pro,MapReduce,30720,7196,"Welcome, DEF CON 28, the Do No Harm panel. Th...",It is certainly a time of discovery- though th...,The DEF CON 28 Do No Harm panel discussed heal...,[],100 words required.,"[{'rouge-1': {'r': 0.12598425196850394, 'p': 0...","([tensor(0.8749)], [tensor(0.8198)], [tensor(0...",5.093334,Write a concise summary of the following: {text},0,7.721633e-80
2,gemini-pro,MapReduce,30720,7611,Good morning. As you can see from the title o...,Roman Architecture (HSAR 252) Professor Kleine...,"Herculaneum, Pompeii's sister city, is located...","[Offset 193, length 11, Rule ID: MORFOLOGIK_RU...",100 words required.,"[{'rouge-1': {'r': 0.09285714285714286, 'p': 0...","([tensor(0.8375)], [tensor(0.7858)], [tensor(0...",4.874872,Write a concise summary of the following: {text},0,2.190582e-156
3,gemini-pro,MapReduce,30720,6824,"Thank you very much, first important question...",Stochastic rewriting systems evolving over gra...,The talk explores the mathematical connections...,[],100 words required.,"[{'rouge-1': {'r': 0.08333333333333333, 'p': 0...","([tensor(0.8500)], [tensor(0.7963)], [tensor(0...",4.832208,Write a concise summary of the following: {text},0,2.0598370000000002e-156
4,gemini-pro,MapReduce,30720,7238,I was great to be with all of you today. I sa...,"In typical military operations, the advantage ...",A former military official presented a talk on...,[],100 words required.,"[{'rouge-1': {'r': 0.037037037037037035, 'p': ...","([tensor(0.8740)], [tensor(0.7981)], [tensor(0...",2.656942,Write a concise summary of the following: {text},0,1.3388729999999998e-238
5,chat-bison@001,MapReduce,4096,6786,All right. So our next talk is called Hacking...,A firsthand look at efforts to improve diversi...,The talk is about the history of diversity in ...,[],100 words required.,"[{'rouge-1': {'r': 0.06547619047619048, 'p': 0...","([tensor(0.8609)], [tensor(0.8094)], [tensor(0...",5.773754,Write a concise summary of the following: {text},0,1.9584800000000002e-156
6,chat-bison@001,MapReduce,4096,6971,"Welcome, DEF CON 28, the Do No Harm panel. Th...",It is certainly a time of discovery- though th...,Healthcare cybersecurity is a complex issue wi...,[],100 words required.,"[{'rouge-1': {'r': 0.08661417322834646, 'p': 0...","([tensor(0.8700)], [tensor(0.8051)], [tensor(0...",3.307963,Write a concise summary of the following: {text},0,3.839293e-157
7,chat-bison@001,MapReduce,4096,7640,Good morning. As you can see from the title o...,Roman Architecture (HSAR 252) Professor Kleine...,The lecture is about the city of Herculaneum a...,[],100 words required.,"[{'rouge-1': {'r': 0.11428571428571428, 'p': 0...","([tensor(0.9055)], [tensor(0.8060)], [tensor(0...",3.196498,Write a concise summary of the following: {text},0,0.002703009
8,chat-bison@001,MapReduce,4096,6929,"Thank you very much, first important question...",Stochastic rewriting systems evolving over gra...,The talk is about the combinatorial structures...,[],100 words required.,"[{'rouge-1': {'r': 0.044444444444444446, 'p': ...","([tensor(0.8761)], [tensor(0.7803)], [tensor(0...",2.540207,Write a concise summary of the following: {text},0,3.2648679999999998e-164
9,chat-bison@001,MapReduce,4096,7052,I was great to be with all of you today. I sa...,"In typical military operations, the advantage ...",The speaker discussed the different perspectiv...,"[Offset 52, length 11, Rule ID: CYBER_COMPOUND...",100 words required.,"[{'rouge-1': {'r': 0.08148148148148149, 'p': 0...","([tensor(0.8644)], [tensor(0.8001)], [tensor(0...",3.500789,Write a concise summary of the following: {text},0,7.551878e-80


In [39]:
df_scores.to_excel("/content/drive/MyDrive/closed_source_model_vertexai_api.xlsx", index=False)