### LOADING DEPENDENCIES

In [None]:
# Install third-party packages first so that every import below succeeds on a
# fresh runtime (sumy was previously imported before it was installed).
!pip install python-docx
!pip install sumy

import glob
import re
import unicodedata

import spacy
from docx import Document
from spacy import displacy
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

nlp = spacy.load('en_core_web_sm')



### LOADING DATA:

In [None]:
# We pushed the SOW documents on github and clone the github repo whenever we need the documents
#This is faster than mounting our colab notebook to google drive
!git clone https://github.com/NLP-Contracts/NLP-summarization.git
%cd NLP-summarization/Sample\ SoW\ docs
list_docsNames = glob.glob('*.docx') #USING GLOB TO GET NAMES OF PROJECTS
 
docs = []
st = ""
for docsName in list_docsNames:
  docs.append(st.join([p.text for p in Document(docsName).paragraphs])) #USING DOCUMENT TO LOAD PROJECTS

fatal: destination path 'NLP-summarization' already exists and is not an empty directory.
/content/NLP-summarization/Sample SoW docs


### DATA CLEANING:

In [None]:
# GENERIC CLEANING METHODS
# basic_cleaner is applied first, to each whole document, before the sections
# are selected with regex pattern matching.
def basic_cleaner(s):
  """Lower-case ``s`` and delete newlines, tabs and runs of spaces.

  Newlines and tabs are removed outright. Every run of two or more
  consecutive spaces is removed as well (not collapsed to a single space),
  so the words on either side of such a run end up joined together.
  Single spaces are left untouched.
  """
  lowered = s.lower()
  flattened = lowered.replace('\n', '').replace('\t', '')
  return re.sub(r' {2,}', '', flattened)
# After each section has been selected with regex pattern matching,
# extra_cleaner removes punctuation (except "." and ","), the subsection
# numbers, and anything written inside round brackets.
def extra_cleaner(s):
  """Strip selected punctuation, subsection numbers and bracketed text.

  Steps, in order:
    1. delete every ``/ : ' - < >`` character (commas and full stops stay);
    2. delete anything shaped like a subsection number, i.e. one or two
       digits, a dot and one digit (``\\d{1,2}\\.\\d``);
    3. delete each ``(...)`` group together with its contents.
  """
  unwanted_chars = str.maketrans('', '', "/:'-<>")
  without_punctuation = s.translate(unwanted_chars)
  without_numbers = re.sub(r'\d{1,2}\.\d', '', without_punctuation)
  return re.sub(r'\([^)]*\)', '', without_numbers)


def sum_capatalizer(sum):
  """Turn summarizer output into one display string.

  ``sum`` is an iterable of sentence objects exposing a ``words`` sequence
  (the parameter name shadows the builtin and is kept for compatibility).
  Each sentence's words are joined with single spaces, the result is
  capitalized (first character upper-case, the rest lower-case) and ". " is
  appended; the pieces are then concatenated in order.
  """
  return ''.join(
    ' '.join(sentence.words).capitalize() + ". " for sentence in sum
  )
 
# Cleaned copy of every loaded document, in the same order as `docs`.
c_docs = list(map(basic_cleaner, docs))


### Section Selection

In [None]:
# Section headings are not consistent across documents, so every pattern lists
# several alternatives separated by "|" (regex "or"). For instance, the section
# captured in "section9_charges" is titled "charges, expenses and payment terms"
# in some documents and "fees, expenses and payment terms" or
# "expenses and payment terms" in others; all three denote the same section.
#
# Every sectionN list has one entry per document in c_docs. An entry is the
# space-joined text of all non-greedy matches found between one of the start
# headings (first non-capturing group) and one of the end headings (second
# non-capturing group). The patterns are written in lower case because c_docs
# has already been lower-cased by basic_cleaner.
#
# NOTE(review): a trailing "?" makes only the single preceding character
# optional (e.g. "services?" also matches "service"), and an unescaped "."
# matches any character, not just a literal dot - confirm this is intended.
# NOTE(review): ".sevices?" in the first pattern looks like a typo for
# ".services?" - confirm before changing, since it alters what is extracted.
section3_services=[' '.join(map(str, (re.findall('(?:.0services?|.sevices?| services 3.1?)(.*?) (?:term and schedule?|and schedule?|term and?)', i)))) for i in c_docs]
section4_schedule=[' '.join(map(str, (re.findall('(?:term and schedule?|and schedule?|term and?)(.*?)(?:place of performance?|place of performance and hours?|performance and hours)', i)))) for i in c_docs]
section5_PPH = [' '.join(map(str, (re.findall('(?:place of performance and hours?|performance and hours?)(.*?)(?:structure and roles|and roles?)', i)))) for i in c_docs]
section6_roles = [' '.join(map(str, (re.findall('(?:structure and roles ?|and roles?)(.*?)(?:general responsibilities|responsibilities?)', i)))) for i in c_docs]
section7_responsibilities = [' '.join(map(str, (re.findall('(?:general responsibilities)(.*?)(?:charges, expenses and payment terms|fees, expenses and payment terms?|expenses and payment terms?|milestones, deliverables, and acceptance criteria?|8.0 intentionally left blank?|.0 intentionally left blank)', i)))) for i in c_docs]
section9_charges = [' '.join(map(str, (re.findall('(?:charges, expenses and payment terms|fees, expenses and payment terms?|expenses and payment terms?)(.*?)(?:specific service levels)', i)))) for i in c_docs]
section12_assumptions=[' '.join(map(str, (re.findall('(?:assumptions and additional provisions?)(.*?)(?:addresses for administration and invoicing)', i)))) for i in c_docs]
section14_agreement = [' '.join(map(str, (re.findall('(?:.0 agreement?)(.*?)(?:agreed and accepted?)', i)))) for i in c_docs]

# Apply extra_cleaner to every extracted section to take out brackets,
# subsection numbers and most of the punctuation.
c_section3_services = list(map(extra_cleaner, section3_services))
c_section4_schedule = list(map(extra_cleaner, section4_schedule))
c_section5_PPH = list(map(extra_cleaner, section5_PPH))
c_section6_roles = list(map(extra_cleaner, section6_roles))
c_section7_responsibilities = list(map(extra_cleaner, section7_responsibilities))
c_section9_charges = list(map(extra_cleaner, section9_charges))
c_section12_assumptions = list(map(extra_cleaner, section12_assumptions))
c_section14_agreement = list(map(extra_cleaner, section14_agreement))
# corpus holds, for every document, only the sections we chose to keep:
# the cleaned sections are concatenated in section order with a single
# newline between consecutive sections. There is one entry per document.
corpus = [
  "\n".join(kept_sections)
  for kept_sections in zip(
    c_section3_services,
    c_section4_schedule,
    c_section5_PPH,
    c_section6_roles,
    c_section7_responsibilities,
    c_section9_charges,
    c_section12_assumptions,
    c_section14_agreement,
  )
]

# We are using corpus[0]
(i.e. TI_SOW_58_2019_TM_MITS_Stratus_mock.docx) as our sample document to display our summarization results.
Note that we tested our model on all documents, but for ease of presentation we decided to show only one sample document from the corpus to Mahmadul.

# LSA

Resources: https://scholar.google.com/citations?user=0fTuW_YAAAAJ&hl=en

###FULL DOCUMENT SUMMARIZATION USING LSA SUMMARIZATION:

In [None]:
def lsa_full(doc_num, size):
  """Summarize the whole sectioned document ``corpus[doc_num]`` with LSA.

  Args:
    doc_num: index of the document inside ``corpus``.
    size: passed to the sumy summarizer as the requested summary size.

  Returns:
    The selected sentences formatted into one string by sum_capatalizer.
  """
  summarizer = LsaSummarizer()  # sumy's Latent Semantic Analysis summarizer
  document = PlaintextParser.from_string(corpus[doc_num], Tokenizer("english")).document
  selected_sentences = summarizer(document, size)
  return sum_capatalizer(selected_sentences)

In [None]:
def summaize_full_doc(n_doc , size, func):
  """Summarize one whole document, display it and print length statistics.

  Args:
    n_doc: index of the document in ``corpus`` and ``c_docs``.
    size: passed through to ``func`` as the requested summary size.
    func: callable ``func(n_doc, size)`` returning the summary as a string
      (e.g. ``lsa_full`` or ``trank_full``).

  The summary is rendered with displaCy entity highlighting when spaCy finds
  named entities in it, otherwise it is displayed as plain text. Word counts
  are then printed for the summary, the sectioned document (``corpus``) and
  the full cleaned document (``c_docs``), together with the summary's size
  as a percentage of each. A percentage is reported as 0.0 when the text it
  is measured against contains no words.
  """
  sum_full = func(n_doc, size)

  # Run the spaCy pipeline once and reuse the parsed result for both the
  # entity check and the rendering.
  parsed_summary = nlp(sum_full)
  if parsed_summary.ents:
    # Highlight the named entities to assist the reader.
    displacy.render(parsed_summary, style="ent", jupyter=True)
  else:
    # No named entities: show the summary as is.
    display(sum_full)

  summary_words = len(sum_full.split())
  sectioned_words = len(corpus[n_doc].split())
  original_words = len(c_docs[n_doc].split())

  # Guard against empty texts (e.g. no section matched) to avoid dividing by zero.
  sectioned_ratio = summary_words / sectioned_words * 100 if sectioned_words else 0.0
  original_ratio = summary_words / original_words * 100 if original_words else 0.0

  print("\033[95m" + "Overal total words from the sectioned document After Summarization:"+ "\033[0m", summary_words)
  print("\033[95m" + "Overal total words from the sectioned document before Summarization:"+ "\033[0m", sectioned_words)
  # This ratio is measured against the sectioned document, so it is labelled
  # accordingly (matching the label used by summaize_by_section).
  print("\033[95m" + "Ratio to the sectioned document: %"+ "\033[0m", sectioned_ratio)
  print("\033[95m" + "Overal Orignal document words before Summarization:"+ "\033[0m", original_words)
  print("\033[95m" + "Ratio to the Original document: %"+ "\033[0m", original_ratio)


In [None]:
summaize_full_doc(
  n_doc=0,        # index of the document to summarize
  size=10,        # number of sentences requested for the summary
  func=lsa_full,  # summarization method to use (trank_full / lsa_full)
)

[95mOveral total words from the sectioned document After Summarization:[0m 805
[95mOveral total words from the sectioned document before Summarization:[0m 2452
[95mRatio to the Original document: %[0m 32.83034257748776
[95mOveral Orignal document words before Summarization:[0m 4246
[95mRatio to the Original document: %[0m 18.959020254357043


###SECTION BY SECTION USING LSA SUMMARIZATION:

In [None]:
def lsa(doc_num , size):
  """Summarize each selected section of one document with LSA.

  Args:
    doc_num: index of the document to summarize.
    size: summary size passed to the summarizer for every section
      (number of sentences per section).

  Returns:
    A tuple of eight summary strings, in this order: services, schedule,
    place of performance and hours, roles, responsibilities, charges,
    assumptions, agreement.
  """
  lsa_summarizer = LsaSummarizer()  # sumy's Latent Semantic Analysis summarizer

  # Cleaned section lists, in the order the summaries are returned.
  sections_in_order = (
    c_section3_services,
    c_section4_schedule,
    c_section5_PPH,
    c_section6_roles,
    c_section7_responsibilities,
    c_section9_charges,
    c_section12_assumptions,
    c_section14_agreement,
  )

  # Summarize each section individually.
  summaries = []
  for section_texts in sections_in_order:
    parser = PlaintextParser.from_string(section_texts[doc_num], Tokenizer("english"))
    selected_sentences = lsa_summarizer(parser.document, size)
    summaries.append(sum_capatalizer(selected_sentences))

  return tuple(summaries)


In [None]:
def summaize_by_section(n_doc, sentence_length, func):
  """Summarize a document section by section and print length statistics.

  Args:
    n_doc: index of the document to summarize.
    sentence_length: passed through to ``func`` as the summary size of each
      section (number of sentences per section).
    func: callable ``func(n_doc, sentence_length)`` returning the eight
      section summaries as strings, in section order (e.g. ``lsa`` or
      ``trank``).

  For every section the title is printed, the summary is shown (with displaCy
  entity highlighting when spaCy finds named entities, as plain text
  otherwise) and the word counts of the summary and of the original section
  are printed. Finally the total summary length is compared with the
  sectioned document (``corpus``) and the full cleaned document (``c_docs``).
  A percentage is reported as 0.0 when the text it is measured against
  contains no words.
  """
  # func returns the eight summarized sections of the chosen document.
  (service_sum, schedule_sum, PPH_sum, role_sum,
   resp_sum, charge_sum, assum_sum, agree_sum) = func(n_doc, sentence_length)

  # Original (cleaned) sections, in the same order as the summaries below.
  original_sections = [
    c_section3_services,
    c_section4_schedule,
    c_section5_PPH,
    c_section6_roles,
    c_section7_responsibilities,
    c_section9_charges,
    c_section12_assumptions,
    c_section14_agreement,
  ]
  sum_sections = {
    "Services": service_sum,
    "Schedule": schedule_sum,
    "Place of Performance and Hours": PPH_sum,
    "Role": role_sum,
    "Responsibilities": resp_sum,
    "Charge": charge_sum,
    "Assumptions": assum_sum,
    "Agreement": agree_sum,
  }

  # Pair every (title, summary) with its original section so the lengths
  # before and after summarization can be shown side by side.
  for (title, summary), section_texts in zip(sum_sections.items(), original_sections):
    print("\033[95m" + title + "\033[0m")  # section name
    print("\t")

    # Run the spaCy pipeline once and reuse the parsed result for both the
    # entity check and the rendering.
    parsed_summary = nlp(summary)
    if parsed_summary.ents:
      # Highlight the named entities to assist the reader.
      displacy.render(parsed_summary, style="ent", jupyter=True)
    else:
      # No named entities: show the summarized section as is.
      display(summary)

    print("\t")
    print("\033[33m" + "summarized lenght of section"+ "\033[0m", len(summary.split()))
    print("\033[33m" + "original lenght of section"+ "\033[0m", len(section_texts[n_doc].split()))

  # Overall word count of the summarized document across all sections.
  total_summary_words = sum(len(summary.split()) for summary in sum_sections.values())
  sectioned_words = len(corpus[n_doc].split())
  original_words = len(c_docs[n_doc].split())

  # Guard against empty texts (e.g. no section matched) to avoid dividing by zero.
  sectioned_ratio = (total_summary_words / sectioned_words) * 100 if sectioned_words else 0.0
  original_ratio = (total_summary_words / original_words) * 100 if original_words else 0.0

  print("\t")
  print("\033[95m" + "Overal total words from the sectioned document in the Summarized Version:"+ "\033[0m", total_summary_words)
  print("\033[95m" + "Overal total words from the sectioned document before Summarization:"+ "\033[0m", sectioned_words)
  print("\033[95m" + "Ratio to the sectioned document: %"+ "\033[0m", sectioned_ratio)
  print("\033[95m" + "Overal Orignal document words before Summarization:"+ "\033[0m", original_words)
  print("\033[95m" + "Ratio to the Original document: %"+ "\033[0m", original_ratio)

    

In [None]:
summaize_by_section(
  n_doc=0,            # index of the document to summarize
  sentence_length=1,  # number of sentences wanted in each section summary
  func=lsa,           # summarization method to use (trank / lsa)
)

[95mServices[0m
	


'Subject to the agreement the sowspecific scope of services shall include the following this sow provides a broad set of it services that are all delivered in a time and materials and staff augmentation delivery model. '

	
[33msummarized lenght of section[0m 36
[33moriginal lenght of section[0m 122
[95mSchedule[0m
	


	
[33msummarized lenght of section[0m 137
[33moriginal lenght of section[0m 147
[95mPlace of Performance and Hours[0m
	


	
[33msummarized lenght of section[0m 21
[33moriginal lenght of section[0m 192
[95mRole[0m
	


'The ti manager will procure and manage ti service representatives as required in furtherance of its obligations under this sow and shall be responsible for providing qualified ti representatives with suitable personal development training education experience competence and skill to perform the services in a workmanlike manner. '

	
[33msummarized lenght of section[0m 47
[33moriginal lenght of section[0m 375
[95mResponsibilities[0m
	


	
[33msummarized lenght of section[0m 227
[33moriginal lenght of section[0m 383
[95mCharge[0m
	


'It is understood that these ti service representatives may be shared with other telus programs. '

	
[33msummarized lenght of section[0m 15
[33moriginal lenght of section[0m 585
[95mAssumptions[0m
	


'If applicable under this sow ti will provide at no extra cost to telus any required appropriate and appropriately configured compatible with telus standards network data and or voice connectivity between the telus local area network environment and all applicable ti facilities under this sow as well as within such ti facilities lan environment. '

	
[33msummarized lenght of section[0m 54
[33moriginal lenght of section[0m 577
[95mAgreement[0m
	


'This sow and any change orders issued hereunder may be executed by the exchange of signed counterparts by facsimile transmission or electronically in pdf or similar secure format. '

	
[33msummarized lenght of section[0m 28
[33moriginal lenght of section[0m 71
	
[95mOveral total words from the sectioned document in the Summarized Version:[0m 565
[95mOveral total words from the sectioned document before Summarization:[0m 2452
[95mRatio to the sectioned document: %[0m 23.042414355628058
[95mOveral Orignal document words before Summarization:[0m 4246
[95mRatio to the Original document: %[0m 13.306641544983513


# TEXT RANK

###FULL DOCUMENT SUMMARIZATION USING TEXT RANK SUMMARIZATION:

In [None]:
def trank_full(doc_num, size):
  """Summarize the whole sectioned document ``corpus[doc_num]`` with TextRank.

  Args:
    doc_num: index of the document inside ``corpus``.
    size: passed to the sumy summarizer as the requested summary size.

  Returns:
    The selected sentences formatted into one string by sum_capatalizer.
  """
  summarizer = TextRankSummarizer()  # sumy's TextRank summarizer
  document = PlaintextParser.from_string(corpus[doc_num], Tokenizer("english")).document
  selected_sentences = summarizer(document, size)
  return sum_capatalizer(selected_sentences)

In [None]:
summaize_full_doc(
  n_doc=0,          # index of the document to summarize
  size=5,           # number of sentences requested for the summary
  func=trank_full,  # summarization method to use (trank_full / lsa_full)
)

[95mOveral total words from the sectioned document After Summarization:[0m 1172
[95mOveral total words from the sectioned document before Summarization:[0m 2452
[95mRatio to the Original document: %[0m 47.79771615008156
[95mOveral Orignal document words before Summarization:[0m 4246
[95mRatio to the Original document: %[0m 27.602449364107397


###SECTION BY SECTION USING TEXT RANK SUMMARIZATION:

In [None]:
def trank(doc_num , size):
  """Summarize each selected section of one document with TextRank.

  Args:
    doc_num: index of the document to summarize.
    size: summary size passed to the summarizer for every section
      (number of sentences per section).

  Returns:
    A tuple of eight summary strings, in this order: services, schedule,
    place of performance and hours, roles, responsibilities, charges,
    assumptions, agreement.
  """
  trank_summarizer = TextRankSummarizer()  # sumy's TextRank summarizer

  # Cleaned section lists, in the order the summaries are returned.
  sections_in_order = (
    c_section3_services,
    c_section4_schedule,
    c_section5_PPH,
    c_section6_roles,
    c_section7_responsibilities,
    c_section9_charges,
    c_section12_assumptions,
    c_section14_agreement,
  )

  # Summarize each section individually.
  summaries = []
  for section_texts in sections_in_order:
    parser = PlaintextParser.from_string(section_texts[doc_num], Tokenizer("english"))
    selected_sentences = trank_summarizer(parser.document, size)
    summaries.append(sum_capatalizer(selected_sentences))

  return tuple(summaries)


def sum_capatalizer(sum):
  """Format summarizer output as one capitalized, full-stop-terminated string.

  This re-defines the helper of the same name from the data-cleaning cell
  with the same behavior. ``sum`` is an iterable of sentence objects exposing
  a ``words`` sequence (the parameter name shadows the builtin and is kept
  for compatibility). For each sentence the words are joined with spaces,
  capitalized, and followed by ". "; the pieces are concatenated in order.
  """
  formatted_sentences = (
    ' '.join(sentence.words).capitalize() + ". " for sentence in sum
  )
  return ''.join(formatted_sentences)



In [None]:
summaize_by_section(
  n_doc=0,            # index of the document to summarize
  sentence_length=1,  # number of sentences wanted in each section summary
  func=trank,         # summarization method to use (trank / lsa)
)

[95mServices[0m
	


'The resources provided by ti service representatives scope of duties are directed and managed by telus manager and their scope of duties is therefore open to change and is dependent on the needs and priorities of telus requirements. '

	
[33msummarized lenght of section[0m 38
[33moriginal lenght of section[0m 122
[95mSchedule[0m
	


	
[33msummarized lenght of section[0m 137
[33moriginal lenght of section[0m 147
[95mPlace of Performance and Hours[0m
	


	
[33msummarized lenght of section[0m 114
[33moriginal lenght of section[0m 192
[95mRole[0m
	


'The telus manager shall be regularly available to meet with the ti telus manager shall be responsible for providing qualified ti representatives with function or project specific training coaching education and skill parties shall appoint the following key personnel for the sow termfor telus as telus rep under the agreement for purposes of this sow mock super for ti as ti csm under the agreement for purposes of this sow mock super as ti manager mock super or delegates as agreed by the parties ti shall be responsible for supplying the below resource plan to following table summarizes the project scope and scale that are currently identified to provide the services under this sow. '

	
[33msummarized lenght of section[0m 114
[33moriginal lenght of section[0m 375
[95mResponsibilities[0m
	


	
[33msummarized lenght of section[0m 227
[33moriginal lenght of section[0m 383
[95mCharge[0m
	


	
[33msummarized lenght of section[0m 201
[33moriginal lenght of section[0m 585
[95mAssumptions[0m
	


	
[33msummarized lenght of section[0m 143
[33moriginal lenght of section[0m 577
[95mAgreement[0m
	


'This sow and any change orders issued hereunder may be executed by the exchange of signed counterparts by facsimile transmission or electronically in pdf or similar secure format. '

	
[33msummarized lenght of section[0m 28
[33moriginal lenght of section[0m 71
	
[95mOveral total words from the sectioned document in the Summarized Version:[0m 1002
[95mOveral total words from the sectioned document before Summarization:[0m 2452
[95mRatio to the sectioned document: %[0m 40.864600326264274
[95mOveral Orignal document words before Summarization:[0m 4246
[95mRatio to the Original document: %[0m 23.598681111634477
