In [1]:
#import all and setup cuda


import torch
# Release cached GPU memory that is no longer in use by any tensor
torch.cuda.empty_cache()
print("torch cuda version:", torch.version.cuda)

# Query CUDA availability once and report what torch can see
cudaAvailable = torch.cuda.is_available()
if not cudaAvailable:
    print("Cuda not available")
else:
    deviceCount = torch.cuda.device_count()
    currentNumber = torch.cuda.current_device()
    deviceName = torch.cuda.get_device_name(currentNumber)
    print(f"Cuda available. {deviceCount} device(s) detected.")
    print(f"Current Device: Number:{currentNumber} Name:{deviceName}")

# Later cells place the model and its inputs on this device:
# the first GPU when CUDA is usable, otherwise the CPU
device = "cuda:0" if cudaAvailable else "cpu"

torch cuda version: 12.1
Cuda available. 1 device(s) detected.
Current Device: Number:0 Name:NVIDIA GeForce GTX 1080 Ti


In [2]:
# Read the whole document to summarise into memory.
# The encoding is passed explicitly: without it open() uses the platform's
# locale encoding (a legacy code page on Windows), which mis-decodes any
# non-ASCII characters in a UTF-8 file.
# NOTE(review): assumes the test data is stored as UTF-8 - confirm.
filePath = "../testData/test.txt"
with open(filePath, "r", encoding="utf-8") as file:
    inputString = file.read()

print(f"Input read. Length is {len(inputString)}")

Input read. Length is 4558


In [3]:
#import the model
from transformers import AutoModelForSeq2SeqLM, pipeline, AutoTokenizer


# Checkpoint id shared by the model weights and the tokenizer
modelID = "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"

# Load the pretrained seq2seq model and move it straight onto the chosen device
model = AutoModelForSeq2SeqLM.from_pretrained(modelID).to(device)

# Load the tokenizer from the same checkpoint so it matches the model
tokenizer = AutoTokenizer.from_pretrained(modelID)


  from .autonotebook import tqdm as notebook_tqdm
Downloading config.json: 100%|██████████| 853/853 [00:00<00:00, 853kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading model.safetensors: 100%|██████████| 3.13G/3.13G [01:32<00:00, 33.8MB/s]
Downloading tokenizer_config.json: 100%|██████████| 2.34k/2.34k [00:00<?, ?B/s]
Downloading tokenizer.json: 100%|██████████| 2.42M/2.42M [00:00<00:00, 7.94MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<?, ?B/s]


In [4]:
# Inspect the length limits reported by the tokenizer.
# NOTE: for this checkpoint the reported maxima are ~1e30, i.e. the tokenizer
# does not impose a practical limit of its own.
specialTokens = tokenizer.num_special_tokens_to_add()  # special tokens added to one input sequence
maxSentence = tokenizer.max_len_single_sentence  # max length of a single sentence
maxInput = tokenizer.model_max_length  # max length of input
limitsReport = f"Max input length: {maxInput}, Max Sentence length: {maxSentence}, SpecialTokens: {specialTokens}"
print(limitsReport)

Max input length: 1000000000000000019884624838656, Max Sentence length: 1000000000000000019884624838655, SpecialTokens: 1


In [5]:
# we need nltk to tokenize large inputs
import nltk
# The "punkt" data package is downloaded before sentence tokenization is used
nltk.download('punkt')

# Split the document into sentences; these are the units packed into chunks later
sentences = nltk.tokenize.sent_tokenize(inputString)
sentenceLen = len(sentences)

# Longest sentence, measured in model tokens. default=0 keeps this cell from
# raising ValueError when the input is empty and there are no sentences.
maxSentenceLen = max((len(tokenizer.tokenize(sentence)) for sentence in sentences), default=0)

print(f"Num of sentences:{sentenceLen}, Max Sentence Len:{maxSentenceLen}")

Num of sentences:50, Max Sentence Len:63


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:


# Upper bound on tokens per chunk. The tokenizer for this checkpoint reports an
# effectively unlimited maximum (~1e30), so on its own it would never force a
# split; 16384 is the input length named in the checkpoint id. Lower this value
# if GPU memory is tight.
MAX_CHUNK_TOKENS = 16384


def chunk_sentences(sentences, count_tokens, max_tokens):
    """Greedily pack consecutive sentences into chunks under a token budget.

    Args:
        sentences: iterable of sentence strings, in document order.
        count_tokens: callable returning the token count of one string.
        max_tokens: a chunk is closed before the summed token counts of its
            sentences would reach this value.

    Returns:
        A list of chunk strings. Sentences inside a chunk are joined by a
        single space and the chunk is stripped of surrounding whitespace.
        A sentence that on its own meets or exceeds the budget becomes a
        chunk by itself; no sentence is dropped and no empty chunk is emitted.
    """
    packed = []
    current = []
    current_len = 0
    for sentence in sentences:
        sentence_len = count_tokens(sentence)  # tokenize each sentence only once
        # Close the working chunk when adding this sentence would breach the
        # budget. The `current` guard avoids emitting an empty chunk when the
        # very first sentence is already too long.
        if current and current_len + sentence_len >= max_tokens:
            packed.append(" ".join(current).strip())
            current = []
            current_len = 0
        current.append(sentence)
        current_len += sentence_len
    # Flush whatever is left, so the final sentence is kept even when it was
    # the one that overflowed the previous chunk.
    if current:
        packed.append(" ".join(current).strip())
    return packed


# Leave room for the special tokens the tokenizer adds to every encoded input
chunkTokenLimit = min(
    tokenizer.max_len_single_sentence,
    MAX_CHUNK_TOKENS - tokenizer.num_special_tokens_to_add(),
)

# holds all completed chunks
chunks = chunk_sentences(
    sentences,
    lambda text: len(tokenizer.tokenize(text)),
    chunkTokenLimit,
)


# =============== Sanity Checks =============== #
chunkTokenCounts = [len(tokenizer.tokenize(c)) for c in chunks]
# How many chunks we have
print(len(chunks))
# How many tokens are in each chunk (excluding special tokens)
print(chunkTokenCounts)
# Number of tokens in each chunk (including special tokens)
print([len(tokenizer(c).input_ids) for c in chunks])
# Total number of tokens in all chunks
print(sum(chunkTokenCounts))
# Token count of the raw input: this should be close to the total above;
# if it's not, that's because whitespace changed when sentences were re-joined
print(len(tokenizer.tokenize(inputString)))


1
[1038]
[1039]
1038
1038


In [7]:
# Encode every chunk with the tokenizer and place each encoding on the
# same device the model was moved to
def encodeChunk(text):
    """Tokenize one chunk into "pt" tensors and move them to `device`."""
    return tokenizer(text, return_tensors="pt").to(device)


inputs = [encodeChunk(text) for text in chunks]


In [9]:
# Print the source text, then one generated summary per chunk
MAX_SUMMARY_TOKENS = 200  # value passed to generate() as max_length

print("input:")
print(inputString)
print("output:")
# The loop variable is `encoded` rather than `input`, so the builtin input()
# is not shadowed for the rest of the kernel session.
for encoded in inputs:
    output = model.generate(**encoded, max_length=MAX_SUMMARY_TOKENS)
    # Each encoding was built from a single chunk, so decode the first (only)
    # row explicitly instead of unpacking the whole batch into decode().
    print(tokenizer.decode(output[0], skip_special_tokens=True))

input:
The SPEAKER pro tempore. The Chair recognizes the gentleman from 
Illinois (Mr. Casten) for 5 minutes.
  Mr. CASTEN. Mr. Speaker, I want you to imagine that you get home from 
work tomorrow and decide that you just don't want to pay your mortgage, 
so you call your bank and share the good news.
  The bank, at that point, explains that if you don't, you are looking 
at eviction, so you propose a counteroffer because you are a clever 
fellow. I will pay my mortgage, but only if my family stops using hot 
water and stops going to the doctor. At which point, the bank says: 
You, sir, are a moron.
  Yet, that is what is happening in the House this week. The United 
States Government has already agreed to buy a house. We are already 
living in the house. We took out a loan to pay for the house, but the 
Republicans want to stop making the mortgage payments unless the 
American people agree to skimp on healthcare and basic services.
  I would remind everybody in this body that it is Co



the majority of the government is hoping that the adults in the room will prevail and that the adults will be so frightened by that consequence that we will accept their self-destructive alternative. the alternative is a bill that would slash nondefense discretionary spending by 22 percent. the majority would have us believe that our only choice right now is: do we want to blow up the economy or do [[page ] we want to ruin the lives of millions of people?
