Installing various libraries

In [1]:
!pip install transformers datasets textract pypdf2


Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting textract
  Downloading textract-1.6.5-py3-none-any.whl (23 kB)
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading toke

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing the required libraries

In [3]:
import transformers
import datasets
import pandas as pd
import PyPDF2

Importing ug-rulebook and processing the data

In [4]:
input_pdf_path = '/content/drive/MyDrive/Colab Notebooks/ugrulebook.pdf'   #Change the path of rulebook accordingly
output_pdf_path = 'output.pdf'

pdf_reader = PyPDF2.PdfReader(open(input_pdf_path, 'rb'))

pdf_writer = PyPDF2.PdfWriter()

for page_num in range(7, len(pdf_reader.pages)):    #Removing first seven pages as they were unnecessary
    page = pdf_reader.pages[page_num]
    pdf_writer.add_page(page)

with open(output_pdf_path, 'wb') as output_file:
    pdf_writer.write(output_file)


Specifying the unnecessary texts/paragraphs

In [5]:
forbidden_text = ["", " ", "  ", "   ", "    ", "     "]
for i in range(0,40):
  forbidden_text.append(str(i) + "." + " ")
  forbidden_text.append(str(i) + " ")
  forbidden_text.append(str(i))
  for j in range(0,40):
    forbidden_text.append(str(i) + "." + str(j) + " ")
    for k in range(0,40):
      forbidden_text.append(str(i) + "." + str(j) + "." + str(k) + " ")

Splitting the pages into paragraphs

In [6]:
import re
import textract
text = textract.process(output_pdf_path).decode('utf-8')
splitted = re.split('\n',text)
splitted_final = [text for text in splitted if text not in forbidden_text]

Creating a dataframe

In [8]:
df2 = pd.DataFrame({'text': splitted_final})

Removing the paragraphs which are smaller than 3 words

In [9]:
df = df2[df2['text'].apply(lambda x: len(x.split()) >= 3)]

In [10]:
df

Unnamed: 0,text
1,The B.Tech./ Dual Degree/ B.S. programmes con...
2,"sciences, engineering and technology and other..."
3,of three phases.
4,The first phase is an intense study of science...
5,of concepts than what was done in school.
...,...
1465,Consolidated statement of the Academic Perform...
1466,for all the semesters completed.
1468,: Under-Graduate Academic Performance Evaluat...
1469,: Under-Graduate Programmes Committee


Importing text summarisation pipeline

In [11]:
from transformers import pipeline

summarization = pipeline("summarization", model="facebook/bart-large-cnn" , device = 0)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
!pip install nltk



Using NLTK for data preprocessing

In [13]:
from nltk.corpus import stopwords
import nltk

In [14]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
english_stopwords = stopwords.words('english')


In [16]:
df

Unnamed: 0,text
1,The B.Tech./ Dual Degree/ B.S. programmes con...
2,"sciences, engineering and technology and other..."
3,of three phases.
4,The first phase is an intense study of science...
5,of concepts than what was done in school.
...,...
1465,Consolidated statement of the Academic Perform...
1466,for all the semesters completed.
1468,: Under-Graduate Academic Performance Evaluat...
1469,: Under-Graduate Programmes Committee


In [17]:
def tokenization(df):
  sentences = []
  for sen in df["text"]:
    tokens = sen.split()
    tokenized = [token.lower() for token in tokens if token.lower() not in english_stopwords]
    sen2 = " ".join(tokenized)
    sentences.append(sen2)
  return sentences

In [18]:
sentences = tokenization(df)    #Tokenizing the paragraphs to remove the stopwords and joining them to form a sentence again

In [19]:
df["title"] = sentences    #Creating title column with summarized text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"] = sentences


In [20]:
df["title"] = df["title"].apply(lambda text: summarization(text, max_length=7, min_length=1, do_sample=False)[0]['summary_text'])   #Creating title column with summarized text

print(df)

Your max_length is set to 7, but your input_length is only 5. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Your max_length is set to 7, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 7, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 7, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is s

                                                   text  \
1     The B.Tech./ Dual Degree/ B.S.  programmes con...   
2     sciences, engineering and technology and other...   
3                                    of three phases.     
4     The first phase is an intense study of science...   
5           of concepts than what was done in school.     
...                                                 ...   
1465  Consolidated statement of the Academic Perform...   
1466                  for all the semesters completed.    
1468  :  Under-Graduate Academic Performance Evaluat...   
1469            :  Under-Graduate Programmes Committee    
1470                  :  Undergraduate Research Award.    

                                   title  
1                               b.tech./  
2      sequence studies broadly consists  
3                    Three phases. three  
4              First phase intense study  
5                  concepts done school.  
...                              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"] = df["title"].apply(lambda text: summarization(text, max_length=7, min_length=1, do_sample=False)[0]['summary_text'])


In [21]:
from datasets import Dataset

In [34]:
df

Unnamed: 0,text,title
1,The B.Tech./ Dual Degree/ B.S. programmes con...,b.tech./
2,"sciences, engineering and technology and other...",sequence studies broadly consists
3,of three phases.,Three phases. three
4,The first phase is an intense study of science...,First phase intense study
5,of concepts than what was done in school.,concepts done school.
...,...,...
1465,Consolidated statement of the Academic Perform...,Student performance is based
1466,for all the semesters completed.,Two more semesters
1468,: Under-Graduate Academic Performance Evaluat...,Under-graduate academic
1469,: Under-Graduate Programmes Committee,Under-graduate programmes


Saving the processed dataset

In [35]:
dataset = Dataset.from_pandas(df)
dataset.save_to_disk("dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1227 [00:00<?, ? examples/s]