In [1]:
# Import torch and pick the device used by the rest of the notebook


import torch

# Ask torch to drop any cached CUDA memory before starting
torch.cuda.empty_cache()
print("torch cuda version:", torch.version.cuda)

# Report which CUDA hardware (if any) torch can see
if not torch.cuda.is_available():
    print("Cuda not available")
else:
    deviceCount = torch.cuda.device_count()
    currentNumber = torch.cuda.current_device()
    deviceName = torch.cuda.get_device_name(currentNumber)
    print(f"Cuda available. {deviceCount} device(s) detected.")
    print(f"Current Device: Number:{currentNumber} Name:{deviceName}")

# Device string that later cells pass to `.to(...)`; falls back to the CPU
device = "cpu" if not torch.cuda.is_available() else "cuda:0"

torch cuda version: 12.1
Cuda available. 1 device(s) detected.
Current Device: Number:0 Name:NVIDIA GeForce GTX 1080 Ti


In [25]:
# Load the raw transcript text from disk

filePath = "../testData/test.txt"
inputString = ""  # placeholder; replaced by the file contents below
# NOTE(review): no encoding is given, so the platform default is used - confirm
# that it matches how the transcript file was saved.
with open(filePath) as file:
    inputString = file.read()

def formatSpeakerText(speakers):
    """Describe who delivered the transcript, for embedding in the prompt.

    Args:
        speakers: Non-empty list of speaker names (strings).

    Returns:
        "a speech by <name>" when there is exactly one speaker, otherwise
        "speeches by <name>, <name>, ..." with the names comma-separated.

    Raises:
        ValueError: If `speakers` is empty.
    """
    if not speakers:
        raise ValueError("speakers must contain at least one name")
    if len(speakers) > 1:
        # join is a method of the separator string, not of the list
        return "speeches by " + ", ".join(speakers)
    return "a speech by " + speakers[0]


speakers = ["Sean Casten"]
speakerText = formatSpeakerText(speakers)
prompt = f"Please summarize this U.S. Government transcript of {speakerText}. Feel free to ignore things that look like this: '[[page number]]'"
# Put the instruction in front of the transcript.
# Note: nothing is inserted between the prompt and the transcript text.
inputString = f"{prompt}{inputString}"

print(f"Input read. Length is {len(inputString)}")

Input read. Length is 4700


In [19]:
# Load the summarization model and its tokenizer
from transformers import AutoModelForSeq2SeqLM, pipeline, AutoTokenizer


modelID = "google/flan-t5-base"

# Load the pretrained seq2seq model and move it to the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(modelID).to(device)


# Load the tokenizer from the same checkpoint ID as the model
tokenizer = AutoTokenizer.from_pretrained(modelID)


Downloading model.safetensors: 100%|██████████| 990M/990M [00:42<00:00, 23.4MB/s] 
Downloading generation_config.json: 100%|██████████| 147/147 [00:00<?, ?B/s] 
Downloading tokenizer_config.json: 100%|██████████| 2.54k/2.54k [00:00<?, ?B/s]
Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 60.9MB/s]
Downloading tokenizer.json: 100%|██████████| 2.42M/2.42M [00:00<00:00, 8.19MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<?, ?B/s]


In [32]:
# Inspect the tokenizer's size limits before splitting the input into chunks
maxInput = tokenizer.model_max_length  # maximum input length reported by the tokenizer
maxSentence = tokenizer.max_len_single_sentence  # maximum length of a single sentence
specialTokens = tokenizer.num_special_tokens_to_add()  # special tokens the tokenizer adds to an input sequence
print(
    f"Max input length: {maxInput}, "
    f"Max Sentence length: {maxSentence}, "
    f"SpecialTokens: {specialTokens}"
)

Max input length: 512, Max Sentence length: 511, SpecialTokens: 1


In [27]:
# nltk splits the large input into sentences so it can be chunked later
import nltk
# the punkt data package appears to be needed by the sentence tokenizer
nltk.download('punkt')

sentences = nltk.tokenize.sent_tokenize(inputString)
sentenceLen = len(sentences)  # number of sentences found

# length, in model tokens, of the longest single sentence
maxSentenceLen = max(len(tokenizer.tokenize(sentence)) for sentence in sentences)

print(f"Num of sentences:{sentenceLen}, Max Sentence Len:{maxSentenceLen}")

Num of sentences:51, Max Sentence Len:63


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:


# Greedily pack consecutive sentences into chunks whose summed per-sentence
# token counts stay below tokenizer.max_len_single_sentence.
#
# Note: a single sentence that is itself at or above the limit still becomes
# its own chunk; it is not split or truncated here.

# working chunk: sentences joined by single spaces
chunk = ""
# all completed chunks, in document order
chunks = []
# token count of the working chunk
length = 0

for sentence in sentences:
    # tokenize once and reuse the count in both branches
    sentenceTokens = len(tokenizer.tokenize(sentence))
    combined_length = sentenceTokens + length

    if combined_length < tokenizer.max_len_single_sentence:
        # the sentence still fits: extend the working chunk
        chunk += sentence + " "
        length = combined_length
    else:
        # the sentence would overflow: save what we have (skipping the empty
        # chunk that occurs when the very first sentence is already too long)
        if chunk:
            chunks.append(chunk.strip())

        # start a new working chunk with the overflowing sentence
        chunk = sentence + " "
        length = sentenceTokens

# Flush whatever is left after the loop. Doing this here, rather than inside
# the "fits" branch, also saves the final chunk when the last sentence was the
# one that overflowed.
if chunk:
    chunks.append(chunk.strip())
        
        
# =============== Sanity Checks =============== #
# Number of chunks produced
print(len(chunks))
# Tokens per chunk, special tokens excluded
tokensPerChunk = [len(tokenizer.tokenize(c)) for c in chunks]
print(tokensPerChunk)
# Tokens per chunk, special tokens included
idsPerChunk = [len(tokenizer(c).input_ids) for c in chunks]
print(idsPerChunk)
# Token total summed over all chunks
print(sum(tokensPerChunk))
# Token count of the unchunked input; this should be close to the total above.
# If it's not equal, that's because extra whitespace was removed while stripping.
print(len(tokenizer.tokenize(inputString)))


3
[507, 501, 67]
[508, 502, 68]
1075
1076


In [30]:
# Tokenize every chunk into "pt" tensors and move each encoding to `device`
inputs = []
for chunkText in chunks:
    encoded = tokenizer(chunkText, return_tensors="pt")
    inputs.append(encoded.to(device))


In [31]:
# Summarize each chunk independently and print the summaries in order
MAX_SUMMARY_LENGTH = 120  # passed to generate() as max_length

print("output:")
# The loop variable is not called `input`, so the builtin input() stays usable
# in later cells.
for chunkInputs in inputs:
    output = model.generate(**chunkInputs, max_length=MAX_SUMMARY_LENGTH)
    # Each entry of `inputs` was built from one chunk string, so `output` is
    # expected to hold a single sequence; decode that row explicitly.
    print(tokenizer.decode(output[0], skip_special_tokens=True))

output:
The U.S. Government transcript of a speech by Sean Casten.
The House would have voted to raise the debt ceiling, and President Biden would sign it.
We can pass a clean debt limit bill.
