Installing the libraries

In [39]:
!pip install transformers datasets textract pypdf2




In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Importing the libraries

In [40]:
import transformers
import datasets
import pandas as pd
import PyPDF2

Importing the pdf file and removing first 7 pages as they were not useful

In [41]:
input_pdf_path = '/content/drive/MyDrive/Colab Notebooks/ugrulebook.pdf'
output_pdf_path = 'output.pdf'

pdf_reader = PyPDF2.PdfReader(open(input_pdf_path, 'rb'))

pdf_writer = PyPDF2.PdfWriter()

for page_num in range(7, len(pdf_reader.pages)):
    page = pdf_reader.pages[page_num]
    pdf_writer.add_page(page)

with open(output_pdf_path, 'wb') as output_file:
    pdf_writer.write(output_file)


Making a list of text which is not useful

In [42]:
forbidden_text = ["", " ", "  ", "   ", "    ", "     "]
for i in range(0,40):
  forbidden_text.append(str(i) + "." + " ")
  forbidden_text.append(str(i) + " ")
  forbidden_text.append(str(i))
  for j in range(0,40):
    forbidden_text.append(str(i) + "." + str(j) + " ")
    for k in range(0,40):
      forbidden_text.append(str(i) + "." + str(j) + "." + str(k) + " ")

Splitting the document into paragraphs

In [43]:
import re
import textract
text = textract.process(output_pdf_path).decode('utf-8')
splitted = re.split('\n',text)
splitted_final = [text for text in splitted if text not in forbidden_text]

Creating a dataframe

In [45]:
df2 = pd.DataFrame({'text': splitted_final})

Removing the paragraphs with less than three words

In [46]:
df = df2[df2['text'].apply(lambda x: len(x.split()) >= 3)]

In [47]:
df

Unnamed: 0,text
1,The B.Tech./ Dual Degree/ B.S. programmes con...
2,"sciences, engineering and technology and other..."
3,of three phases.
4,The first phase is an intense study of science...
5,of concepts than what was done in school.
...,...
1465,Consolidated statement of the Academic Perform...
1466,for all the semesters completed.
1468,: Under-Graduate Academic Performance Evaluat...
1469,: Under-Graduate Programmes Committee


Creating a text question generator pipeline

In [48]:
from transformers import pipeline

pipe = pipeline("text2text-generation", model="voidful/context-only-question-generator", device = 0)


Using nltk library for text processing such as tokenization, lemmatization etc.

In [49]:
!pip install nltk



In [50]:
from nltk.corpus import stopwords
import nltk

In [51]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
english_stopwords = stopwords.words('english')


In [55]:
df

Unnamed: 0,text,title
1,The B.Tech./ Dual Degree/ B.S. programmes con...,The BTech Dual Degree BS programmes consist o...
2,"sciences, engineering and technology and other...",sciences engineering and technology and other ...
3,of three phases.,of three phases
4,The first phase is an intense study of science...,The first phase is an intense study of science...
5,of concepts than what was done in school.,of concepts than what was done in school
...,...,...
1465,Consolidated statement of the Academic Perform...,Consolidated statement of the Academic Perform...
1466,for all the semesters completed.,for all the semesters completed
1468,: Under-Graduate Academic Performance Evaluat...,UnderGraduate Academic Performance Evaluatio...
1469,: Under-Graduate Programmes Committee,UnderGraduate Programmes Committee


In [54]:
def remove_punct(text):
    punct_free = "".join([i for i in text if i not in string.punctuation])
    return punct_free
df["title"] = df['text'].apply(lambda x: remove_punct(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"] = df['text'].apply(lambda x: remove_punct(x))


In [56]:
from nltk.stem import WordNetLemmatizer
import string

In [61]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [62]:
word_net_lemmatizer = WordNetLemmatizer()

In [63]:
def tokenize_lemmatize(df):
    preprocessed = []
    for sen in df["title"]:
        tokens = sen.split()
        tokens = [word_net_lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in english_stopwords]
        sen1 = " ".join(tokens)
        preprocessed.append(sen1)
    df["title"] = preprocessed
    return df

In [64]:
df = tokenize_lemmatize(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"] = preprocessed


In [65]:
df

Unnamed: 0,text,title
1,The B.Tech./ Dual Degree/ B.S. programmes con...,btech dual degree b programme consist course b...
2,"sciences, engineering and technology and other...",science engineering technology related topic s...
3,of three phases.,three phase
4,The first phase is an intense study of science...,first phase intense study science mathematics ...
5,of concepts than what was done in school.,concept done school
...,...,...
1465,Consolidated statement of the Academic Perform...,consolidated statement academic performance st...
1466,for all the semesters completed.,semester completed
1468,: Under-Graduate Academic Performance Evaluat...,undergraduate academic performance evaluation ...
1469,: Under-Graduate Programmes Committee,undergraduate programme committee


Generating questions out of text and adding them to a title column

In [66]:
df["title"] = df["title"].apply(lambda text: pipe(text, max_length=50, min_length=1, do_sample=False)[0]['generated_text'])

# Display the DataFrame with headings
print(df)



                                                   text  \
1     The B.Tech./ Dual Degree/ B.S.  programmes con...   
2     sciences, engineering and technology and other...   
3                                    of three phases.     
4     The first phase is an intense study of science...   
5           of concepts than what was done in school.     
...                                                 ...   
1465  Consolidated statement of the Academic Perform...   
1466                  for all the semesters completed.    
1468  :  Under-Graduate Academic Performance Evaluat...   
1469            :  Under-Graduate Programmes Committee    
1470                  :  Undergraduate Research Award.    

                                                  title  
1              how to do btech dual degree b programme?  
2      Science engineering technology related topic ...  
3         how many phases are there in the three phase?  
4     the first phase intense study science math hum...  
5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"] = df["title"].apply(lambda text: pipe(text, max_length=50, min_length=1, do_sample=False)[0]['generated_text'])


In [70]:
from datasets import Dataset

In [68]:
df

Unnamed: 0,text,title
1,The B.Tech./ Dual Degree/ B.S. programmes con...,how to do btech dual degree b programme?
2,"sciences, engineering and technology and other...",Science engineering technology related topic ...
3,of three phases.,how many phases are there in the three phase?
4,The first phase is an intense study of science...,the first phase intense study science math hum...
5,of concepts than what was done in school.,did school have aconcept done school?
...,...,...
1465,Consolidated statement of the Academic Perform...,musical performance student
1466,for all the semesters completed.,"After themester was complete, what was the nex..."
1468,: Under-Graduate Academic Performance Evaluat...,undergraduate academic performance evaluation ...
1469,: Under-Graduate Programmes Committee,undergraduate program committee is comprised o...


Saving the processed dataset

In [69]:
dataset = Dataset.from_pandas(df)
dataset.save_to_disk("dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1227 [00:00<?, ? examples/s]