In [5]:
!pip install pymupdf
!pip install spacy
!pip install scikit-learn
!pip install nltk
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py): started
  Building wheel for fpdf (setup.py): finished with status 'done'
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40713 sha256=83e215440e6a53168c0693f01286921109a4a5e681a13d6f2e58dd92a20d8bc5
  Stored in directory: c:\users\sundeepyalamanchili\appdata\local\pip\cache\wheels\65\4f\66\bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [4]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text()`` output in page
        order. A document with no pages yields an empty string.
    """
    # The context manager closes the document even when a page fails to
    # extract; previously the opened document was never closed.
    with fitz.open(pdf_path) as document:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in document)

# Smoke-test the extractor on the sample agreement. The path is relative, so
# the PDF must sit in the notebook's current working directory.
pdf_text = extract_text_from_pdf("Simple_Service_Agreement.pdf")
print(pdf_text[:1000])  # Print first 1000 characters


Simple Service Agreement
Simple Service Agreement
This Service Agreement (the "Agreement") is made and entered into on this 1st day of June, 2024,
by and between:
Party A: ABC Corporation, located at 123 Business St., City, State, ZIP.
Party B: XYZ Services, located at 456 Service Ave., City, State, ZIP.
Services Provided: Party B agrees to provide software development services to Party A.
Term: The term of this Agreement shall commence on June 1, 2024, and shall continue for a period
of one year, ending on May 31, 2025.
Payment: Party A agrees to pay Party B $10,000 per month for the services provided.
Confidentiality: Both parties agree to maintain the confidentiality of any proprietary information
shared during the term of this Agreement.
Termination: Either party may terminate this Agreement with 30 days written notice to the other
party.
Governing Law: This Agreement shall be governed by and construed in accordance with the laws of
the State of [State].
Simple Service Agreement
Si

In [10]:
import fitz  # PyMuPDF
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Load the small English spaCy model, downloading it only when it is missing.
# This avoids re-downloading the model on every run, and the download goes
# through spaCy's own CLI helper in this process rather than through whichever
# `python` executable a shell escape would find on PATH.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    from spacy.cli import download as _download_spacy_model

    _download_spacy_model("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
     - -------------------------------------- 0.5/12.8 MB 5.1 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 10.3 MB/s eta 0:00:02
     ------- -------------------------------- 2.3/12.8 MB 12.1 MB/s eta 0:00:01
     --------- ------------------------------ 3.1/12.8 MB 12.5 MB/s eta 0:00:01
     ------------ --------------------------- 3.9/12.8 MB 13.1 MB/s eta 0:00:01
     -------------- ------------------------- 4.8/12.8 MB 14.5 MB/s eta 0:00:01
     ----------------- ---------------------- 5.7/12.8 MB 14.5 MB/s eta 0:00:01
     -------------------- ------------------- 6.5/12.8 MB 14.9 MB/s eta 0:00:01
     -------------------- -------

In [12]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text()`` output in page
        order. A document with no pages yields an empty string.
    """
    # The context manager closes the document even when a page fails to
    # extract; previously the opened document was never closed.
    with fitz.open(pdf_path) as document:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in document)

def extract_parties(text):
    """Find every "Party X" style reference in the contract text.

    A match is the literal word ``Party``, one whitespace character, then a
    word starting with an uppercase ASCII letter. Matches are returned in
    order of appearance and repeated mentions are kept.
    """
    party_pattern = re.compile(r'Party\s[A-Z]\w*')
    return party_pattern.findall(text)

def extract_dates(text):
    """Collect date strings from the contract text.

    Two shapes are recognised: numeric dates separated by ``/`` or ``-``
    (for example ``12/31/2024``) and "Month D, YYYY" with a fully spelled
    English month name. Other phrasings, such as "1st day of June, 2024",
    are not matched. Results keep their order of appearance.
    """
    date_pattern = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
    return date_pattern.findall(text)

def extract_payment_terms(text):
    """Return every "$<amount> per month" phrase found in the text.

    The amount is a dollar figure of one to three digits followed by optional
    comma-separated thousands groups and optional cents, then the literal
    words ``per month``. Matches are returned in order of appearance.
    """
    monthly_fee_pattern = re.compile(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*per month')
    return monthly_fee_pattern.findall(text)

# Toy training set for the clause classifier: one example sentence per label.
X_train = [
    "The term of this agreement shall be one year.",
    "Party A agrees to provide services to Party B.",
    "Party A agrees to pay Party B $10,000 per month for the services provided.",
]
y_train = ["Term", "Obligation", "Payment"]

# TF-IDF features feed a logistic-regression classifier; fit() returns the
# fitted pipeline itself, so it can be bound in the same statement.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]
).fit(X_train, y_train)

def classify_clause(clause):
    """Return the label the fitted module-level ``pipeline`` predicts for one clause.

    The clause is wrapped in a single-element list for ``predict`` and the
    single resulting label is returned.
    """
    predicted_labels = pipeline.predict([clause])
    return predicted_labels[0]

def extract_information_from_contract(pdf_path):
    """Extract parties, dates, payment terms and labelled clauses from a PDF contract.

    Args:
        pdf_path: Path of the contract PDF, forwarded to ``extract_text_from_pdf``.

    Returns:
        A dict with four keys:
        ``"Parties"``, ``"Dates"`` and ``"Payment Terms"`` hold the lists
        returned by the matching ``extract_*`` helpers, and ``"Clauses"`` maps
        each sentence's text to the label returned by ``classify_clause``.
        Because clauses are keyed by sentence text, a sentence that occurs
        more than once appears only once.
    """
    contract_text = extract_text_from_pdf(pdf_path)
    doc = nlp(contract_text)

    accepted_labels = ("Term", "Obligation", "Payment")
    clauses = {}
    for sent in doc.sents:
        sentence = sent.text
        # Whitespace-only segments carry no clause; skip them rather than
        # storing a meaningless label for an empty sentence.
        if not sentence.strip():
            continue
        # Classify once per sentence; the previous comprehension ran the
        # model twice for every sentence (once to filter, once to store).
        label = classify_clause(sentence)
        if label in accepted_labels:
            clauses[sentence] = label

    return {
        "Parties": extract_parties(contract_text),
        "Dates": extract_dates(contract_text),
        "Payment Terms": extract_payment_terms(contract_text),
        "Clauses": clauses,
    }

def print_contract_info(contract_info):
    """Print a human-readable report of the extracted contract information.

    Args:
        contract_info: Dict with the keys ``"Parties"``, ``"Dates"``,
            ``"Payment Terms"`` (iterables of strings) and ``"Clauses"``
            (mapping of clause text to clause label), as produced by
            ``extract_information_from_contract``.
    """
    print("Extracted Contract Information:")

    print("\nParties Involved:")
    # De-duplicate, then sort: iterating a bare set gives an order that can
    # change between runs, which made the report non-reproducible.
    for party in sorted(set(contract_info['Parties'])):
        print(f" - {party}")

    print("\nDates Mentioned:")
    for date in contract_info['Dates']:
        print(f" - {date}")

    print("\nPayment Terms:")
    for payment in contract_info['Payment Terms']:
        print(f" - {payment}")

    print("\nClauses:")
    for clause, clause_type in contract_info['Clauses'].items():
        # Clause text comes from the sentence splitter and may carry
        # surrounding whitespace or newlines.
        print(f" - {clause_type}: {clause.strip()}")

# Example usage: run the full extraction on the sample agreement and print the report.
# NOTE(review): this relative path differs from the "Simple_Service_Agreement.pdf"
# path used by the earlier extraction cell — confirm which location is correct.
pdf_path = "Downloads/Simple_Service_Agreement.pdf"  # Ensure this is the correct path
contract_info = extract_information_from_contract(pdf_path)
print_contract_info(contract_info)

Extracted Contract Information:

Parties Involved:
 - Party B
 - Party A

Dates Mentioned:
 - June 1, 2024
 - May 31, 2025

Payment Terms:
 - $10,000 per month

Clauses:
 - Term: Simple Service Agreement
Simple Service Agreement
This Service Agreement (the "Agreement") is made and entered into on this 1st day of June, 2024,
by and between:
Party A: ABC Corporation, located at 123 Business St., City, State, ZIP.
 - Obligation: Party B: XYZ Services, located at 456 Service Ave., City, State, ZIP.
 - Term: Services Provided: Party B agrees to provide software development services to Party A.
Term: The term of this Agreement shall commence on June 1, 2024, and shall continue for a period
of one year, ending on May 31, 2025.
 - Payment: Payment: Party A agrees to pay Party B $10,000 per month for the services provided.
 - Term: Confidentiality: Both parties agree to maintain the confidentiality of any proprietary information
shared during the term of this Agreement.
 - Term: Termination:
 -