# Request for Proposal for various Insurance Policies for India Postal Payments Bank Document Summarization

## Importing Relevant Libraries

In [68]:
from __future__ import print_function
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

import numpy as np
import pandas as pd
import mglearn as mg

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

%pylab
%matplotlib inline

import pyLDAvis
import pyLDAvis.sklearn

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


## Loading Data

In [32]:
#Creating a Function to Read the PDF Document and convert into Text
def convert_pdf_to_text(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
        
    text = retstr.getvalue()
    
    fp.close()
    device.close()
    retstr.close()
    
    return text

#Reading the PDF Document and saving as lone
lone = convert_pdf_to_text('C:\\Users\\DELL\\Desktop\\Akshay\\Projects\\rfp_ipp_bank_document_summarization\\rfp_ipp_bank_document.pdf')

## 1. Data Exploration

In [33]:
#Exploring the complete document
lone

" \n \n \n \n\nREQUEST FOR PROPOSAL (RFP) \n\nfor \n\nvarious Insurance Policies  \n\nfor  \n\nIndia Post Payments Bank Limited \n\nDate: May 14, 2019 \n\nIndia Post Payments Bank \n\n  \n\n \n\n\x0cInvitation for Request for Proposal (RFP) \n\nIPPB invites proposals from eligible general insurance companies for various insurance requirements \nof India Post Payments Bank (IPPB). \n\nThis  RFP  may  be  downloaded  by \n(www.eprocure.gov.in). \n\nthe  bidders  free  of  cost  from  the  e-procure  website \n\nRFP Summary Sheet \n\nName of the company \nProcurement Reference Number \nTender Cost \nDate of issue of the RFP \nPre-bid meeting date \nLast date to seek clarification \nLast Date for reply of queries \nDate and Time for RFP submission / upload of \nbids \nDate and time for opening of RFP \nPrimary point of contact for RFP process  and \ntechnical queries \n\nInsurance Broker Contact Details \n\nPlace of Pre-bid meeting \n\nPlace of Bid submission and RFP Opening \n\nAddress fo

## 2. Data Preprocessing

In [37]:
#Splitting lines
clean_cont = lone.splitlines()

#Exploring the document after splitting lines
clean_cont

[' ',
 ' ',
 ' ',
 ' ',
 '',
 'REQUEST FOR PROPOSAL (RFP) ',
 '',
 'for ',
 '',
 'various Insurance Policies  ',
 '',
 'for  ',
 '',
 'India Post Payments Bank Limited ',
 '',
 'Date: May 14, 2019 ',
 '',
 'India Post Payments Bank ',
 '',
 '  ',
 '',
 ' ',
 '',
 '',
 'Invitation for Request for Proposal (RFP) ',
 '',
 'IPPB invites proposals from eligible general insurance companies for various insurance requirements ',
 'of India Post Payments Bank (IPPB). ',
 '',
 'This  RFP  may  be  downloaded  by ',
 '(www.eprocure.gov.in). ',
 '',
 'the  bidders  free  of  cost  from  the  e-procure  website ',
 '',
 'RFP Summary Sheet ',
 '',
 'Name of the company ',
 'Procurement Reference Number ',
 'Tender Cost ',
 'Date of issue of the RFP ',
 'Pre-bid meeting date ',
 'Last date to seek clarification ',
 'Last Date for reply of queries ',
 'Date and Time for RFP submission / upload of ',
 'bids ',
 'Date and time for opening of RFP ',
 'Primary point of contact for RFP process  and ',
 'te

In [39]:
#Removing all Non-Alphabetic Characters from the document
shear = [re.sub("[^a-zA-Z]+", " ", s) for s in clean_cont]

#Removing the unnecessary spaces from the document
shears = [x for x in shear if x != ' ']
shearss = [x for x in shears if x != '']

In [40]:
shear

[' ',
 ' ',
 ' ',
 ' ',
 '',
 'REQUEST FOR PROPOSAL RFP ',
 '',
 'for ',
 '',
 'various Insurance Policies ',
 '',
 'for ',
 '',
 'India Post Payments Bank Limited ',
 '',
 'Date May ',
 '',
 'India Post Payments Bank ',
 '',
 ' ',
 '',
 ' ',
 '',
 '',
 'Invitation for Request for Proposal RFP ',
 '',
 'IPPB invites proposals from eligible general insurance companies for various insurance requirements ',
 'of India Post Payments Bank IPPB ',
 '',
 'This RFP may be downloaded by ',
 ' www eprocure gov in ',
 '',
 'the bidders free of cost from the e procure website ',
 '',
 'RFP Summary Sheet ',
 '',
 'Name of the company ',
 'Procurement Reference Number ',
 'Tender Cost ',
 'Date of issue of the RFP ',
 'Pre bid meeting date ',
 'Last date to seek clarification ',
 'Last Date for reply of queries ',
 'Date and Time for RFP submission upload of ',
 'bids ',
 'Date and time for opening of RFP ',
 'Primary point of contact for RFP process and ',
 'technical queries ',
 '',
 'Insurance Br

In [41]:
shears

['',
 'REQUEST FOR PROPOSAL RFP ',
 '',
 'for ',
 '',
 'various Insurance Policies ',
 '',
 'for ',
 '',
 'India Post Payments Bank Limited ',
 '',
 'Date May ',
 '',
 'India Post Payments Bank ',
 '',
 '',
 '',
 '',
 'Invitation for Request for Proposal RFP ',
 '',
 'IPPB invites proposals from eligible general insurance companies for various insurance requirements ',
 'of India Post Payments Bank IPPB ',
 '',
 'This RFP may be downloaded by ',
 ' www eprocure gov in ',
 '',
 'the bidders free of cost from the e procure website ',
 '',
 'RFP Summary Sheet ',
 '',
 'Name of the company ',
 'Procurement Reference Number ',
 'Tender Cost ',
 'Date of issue of the RFP ',
 'Pre bid meeting date ',
 'Last date to seek clarification ',
 'Last Date for reply of queries ',
 'Date and Time for RFP submission upload of ',
 'bids ',
 'Date and time for opening of RFP ',
 'Primary point of contact for RFP process and ',
 'technical queries ',
 '',
 'Insurance Broker Contact Details ',
 '',
 'Place

In [42]:
shearss

['REQUEST FOR PROPOSAL RFP ',
 'for ',
 'various Insurance Policies ',
 'for ',
 'India Post Payments Bank Limited ',
 'Date May ',
 'India Post Payments Bank ',
 'Invitation for Request for Proposal RFP ',
 'IPPB invites proposals from eligible general insurance companies for various insurance requirements ',
 'of India Post Payments Bank IPPB ',
 'This RFP may be downloaded by ',
 ' www eprocure gov in ',
 'the bidders free of cost from the e procure website ',
 'RFP Summary Sheet ',
 'Name of the company ',
 'Procurement Reference Number ',
 'Tender Cost ',
 'Date of issue of the RFP ',
 'Pre bid meeting date ',
 'Last date to seek clarification ',
 'Last Date for reply of queries ',
 'Date and Time for RFP submission upload of ',
 'bids ',
 'Date and time for opening of RFP ',
 'Primary point of contact for RFP process and ',
 'technical queries ',
 'Insurance Broker Contact Details ',
 'Place of Pre bid meeting ',
 'Place of Bid submission and RFP Opening ',
 'Address for Communic

## 3. Topic Modeling

In [47]:
#Fitting Count Vectorizer on the document with Stop Words
vect = CountVectorizer(ngram_range=(1,1), stop_words='english')
dtm = vect.fit_transform(shearss)

#Document Term Matrix
dtm

<1609x1491 sparse matrix of type '<class 'numpy.int64'>'
	with 5610 stored elements in Compressed Sparse Row format>

In [48]:
#Converting the Document Term Matrix from Count Vectorizer into a Pandas Dataframe
pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,abide,ability,absolute,accept,acceptable,acceptance,accepted,access,accessed,accidental,...,writing,writings,written,wrong,wrongful,www,year,years,yes,yyyy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1605,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1607,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [53]:
#Fitting the Latent Dirichlet Allocation Model on the Document Term Matrix
lda = LatentDirichletAllocation(n_components=5)
lda_dtf = lda.fit_transform(dtm)

#Latent Dirichlet Allocation Model
lda_dtf

array([[0.05081725, 0.79770803, 0.05022423, 0.05003291, 0.05121757],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.79998644, 0.05001145, 0.05000068, 0.05000068, 0.05000074],
       ...,
       [0.0400078 , 0.04063165, 0.0402849 , 0.83906867, 0.04000698],
       [0.02320548, 0.02400545, 0.02308681, 0.90745594, 0.02224632],
       [0.79997736, 0.05001413, 0.05000276, 0.05000279, 0.05000296]])

## 4. Topic Extracting

In [61]:
#Extracting 5 Topics from LDA and the most common words in each topic
sorting = np.argsort(lda.components_)[:, ::-1]
features = np.array(vect.get_feature_names())

mg.tools.print_topics(topics=range(5), feature_names=features, sorting=sorting, topics_per_chunk=5, n_words=15)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
insurance     insured       bank          bid           date          
limits        assets        post          bank          terms         
policy        policy        india         ippb          documents     
equipment     property      payments      inr           rfp           
policies      data          bidder        loss          bid           
business      access        annexure      bidder        limit         
company       submitted     delhi         provided      cover         
including     non           new           evaluation    conditions    
insured       rfp           centre        time          bidder        
ippb          signatory     marg          process       costs         
post          offices       bidders       claim         information   
marsh         cost          information   shall         payment       
declar

In [62]:
#Senteces within the Topic Model 1
topic_0 = np.argsort(lda_dtf[:,0])[::-1]
for i in topic_0[:5]:
    print(f".".join(shearss[i].split(f".")[:2]) + f".\n")

The Bank has authorized Marsh India Insurance Brokers Pvt Limited Marsh for assisting the Bank .

Laptops Mobiles Cell Phone POS Transaction Processing Equipment I Pad Tablet .

IPPB invites proposals from eligible general insurance companies for various insurance requirements .

 Marsh Broker insurance broker refers to Marsh India Insurance Brokers Pvt Ltd .

Around lakh micro ATMs are distributed to Postal staff for carrying out business of IPPB .



In [63]:
#Senteces within the Topic Model 2
topic_1 = np.argsort(lda_dtf[:,1])[::-1]
for i in topic_1[:5]:
    print(f".".join(shearss[i].split(f".")[:2]) + f".\n")

temporary offices currency chests vaults Post offices BO SO HO customer access points etc .

temporary offices currency chests vaults Post offices BO SO HO customer .

No of access points in India customer access points lakh Post offices approx .

assets or financial condition or Marsh IPPB s reputation or render the Bidder unable to .

in separate envelops and placed in one bigger envelop and sealed This bigger envelop .



In [64]:
#Senteces within the Topic Model 3
topic_2 = np.argsort(lda_dtf[:,2])[::-1]
for i in topic_2[:5]:
    print(f".".join(shearss[i].split(f".")[:2]) + f".\n")

Corporate Office nd Floor Speed Post Centre Building Bhai Veer Singh Marg Gole Market .

Corporate Office nd Floor Speed Post Centre Building Bhai Veer Singh Marg Gole Market .

Corporate Office nd Floor Speed Post Centre Building Bhai Veer Singh Marg Gole Market .

Corporate Office nd Floor Speed Post Centre Building Bhai Veer Singh Marg Gole Market .

Certificate DSC Class II or Class III Certificates with signing key usage issued by any .



In [65]:
#Senteces within the Topic Model 4
topic_3 = np.argsort(lda_dtf[:,3])[::-1]
for i in topic_3[:5]:
    print(f".".join(shearss[i].split(f".")[:2]) + f".\n")

 Mode of transit used Car Taxi Private Vehicle Two wheeler By Cycle mail vans state .

 k IPPB s right to vary Marsh in consultation with IPPB reserves the right to vary any aspect of .

 Cover for all premises occupied by IPPB viz branches CPC mobile branches .

The total price quoted should be inclusive of applicable duties levies and charges GST etc .

 g Termination or suspension of evaluation process Bank reserves the right to suspend or .



In [66]:
#Senteces within the Topic Model 5
topic_4 = np.argsort(lda_dtf[:,4])[::-1]
for i in topic_4[:5]:
    print(f".".join(shearss[i].split(f".")[:2]) + f".\n")

https eprocure gov in eprocure app by clicking on the link Online Bidder Enrolment .

online Bids on the CPP Portal may be obtained at https eprocure gov in eprocure app .

Procurement Portal www eprocure gov in hereon referred to as CPP Portal .

The bidder shall submit the commercial bid as per the format given in Annexure IX .

 n Governing Laws Dispute Resolution The RFP and selection process shall be governed by .



## 5. Topic Visualization

In [69]:
zit=pyLDAvis.sklearn.prepare(lda,dtm,vect)

pyLDAvis.display(zit)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
