In [4]:
# wrap the output in colab cells
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> element so text inside <pre> output wraps.

  Args:
    info: Execution-info object that IPython hands to `pre_run_cell`
      callbacks. It is not used; the default of None keeps a direct
      `set_css()` call working.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# register the callback so the style is emitted before each cell runs
get_ipython().events.register('pre_run_cell', set_css)

# Install Transformers

# install transformers with sentencepiece
!pip install transformers[sentencepiece]

import pandas as pd

# Specify the path to your uploaded file in Kaggle
file_path = "/kaggle/input/education/education.txt"

# Read the whole text file into memory and trim surrounding whitespace
try:
    # explicit encoding so the text is decoded the same way on every platform
    with open(file_path, "r", encoding="utf-8") as file:
        FileContent = file.read().strip()
except FileNotFoundError:
    print(f"File not found at {file_path}")
    # FileContent is never assigned on this path, so stop here instead of
    # letting a later statement fail with an unrelated NameError
    raise


# display file content
# (a bare expression is only echoed when it is the last statement of a notebook cell)
FileContent 

# Input file (loaded from the Kaggle path in file_path)

# total characters in the file
len(FileContent) 

# Load the Model and Tokenizer

# import and initialize the tokenizer and model from the checkpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# identifier handed to both from_pretrained calls below
checkpoint = "sshleifer/distilbart-cnn-12-6"

# tokenizer and sequence-to-sequence model are loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Some model statistics

# max tokens including the special tokens
tokenizer.model_max_length 

# max tokens excluding the special tokens
# (this value is the per-chunk token budget in the chunking loop)
tokenizer.max_len_single_sentence 

# number of special tokens
tokenizer.num_special_tokens_to_add() 

# Convert file content to sentences

# split the document into a list of sentences with NLTK
import nltk
nltk.download('punkt')
sentences = nltk.tokenize.sent_tokenize(FileContent)

# token count of the longest sentence
max(map(lambda sentence: len(tokenizer.tokenize(sentence)), sentences))

# Create the chunks

# Greedily pack consecutive sentences into chunks whose token count (special
# tokens excluded) stays within the tokenizer's single-sequence budget.
length = 0   # tokens accumulated in the chunk being built
chunk = ""   # text of the chunk being built
chunks = []  # finished chunks, in document order
max_chunk_tokens = tokenizer.max_len_single_sentence

for sentence in sentences:
  sentence_length = len(tokenizer.tokenize(sentence))  # tokenize once per sentence
  combined_length = sentence_length + length

  if combined_length <= max_chunk_tokens:
    # the sentence still fits: extend the current chunk
    chunk += sentence + " "
    length = combined_length
  else:
    # the sentence overflows: close the current chunk (skipping an empty one,
    # which happens when the very first sentence is already over budget) ...
    if chunk:
      chunks.append(chunk.strip())
    # ... and start a new chunk with the overflow sentence
    chunk = sentence + " "
    length = sentence_length

# Flush whatever is still pending. Doing this after the loop also saves the
# final sentence when it was the one that overflowed into a fresh chunk.
if chunk:
  chunks.append(chunk.strip())
len(chunks)

# Some checks

# tokens per chunk, special tokens excluded
[len(tokens) for tokens in map(tokenizer.tokenize, chunks)]

# tokens per chunk, special tokens included
[len(encoding.input_ids) for encoding in map(tokenizer, chunks)]

## With special tokens added

# total over all chunks vs. the document encoded in one go
sum(len(tokenizer(c).input_ids) for c in chunks)

len(tokenizer(FileContent).input_ids)

## Without special tokens added

# total over all chunks vs. the document tokenized in one go
sum(len(tokenizer.tokenize(c)) for c in chunks)

len(tokenizer.tokenize(FileContent))

# Get the inputs

# inputs to the model: one PyTorch-tensor encoding per chunk
# truncation=True caps each encoding at the tokenizer's maximum length, so a
# chunk made of a single over-long sentence cannot exceed what the model accepts
inputs = [tokenizer(chunk, return_tensors="pt", truncation=True) for chunk in chunks]

# Output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# generate and print one summary per chunk
for model_input in inputs:  # not named `input`, which would rebind the builtin
  output = model.generate(**model_input)
  # every chunk was encoded on its own, so the generated batch holds one sequence
  print(tokenizer.decode(output[0], skip_special_tokens=True))

 The Indian college education system is one of the largest and most diverse in the world. It plays a crucial role in shaping the future of millions of students and is a significant contributor to the country's economic and social development. The education system encompasses a wide range of institutions, courses, and approaches to education.
