In [1]:
import fitz

In [3]:
from collections import deque  
  
def find_fastest_path_with_conveyors(matrix, start, end):  
    def get_neighbors(node):  
        directions = [(1, 0), (-1, 0), (0, 1), (0, -1)]  
        neighbors = []  
        for direction in directions:  
            neighbor = (node[0] + direction[0], node[1] + direction[1])  
            if 0 <= neighbor[0] < len(matrix) and 0 <= neighbor[1] < len(matrix[0]):  
                if matrix[neighbor[0]][neighbor[1]] == ' ':  
                    neighbors.append(neighbor)  
        return neighbors  
  
    # Mapping conveyor points  
    conveyor_map = {}  
    for i in range(len(matrix)):  
        for j in range(len(matrix[0])):  
            if matrix[i][j] == 'c':  
                if 'start' not in conveyor_map:  
                    conveyor_map['start'] = (i, j)  
                else:  
                    conveyor_map['end'] = (i, j)  
                    conveyor_map[conveyor_map['start']] = conveyor_map['end']  
                    conveyor_map[conveyor_map['end']] = conveyor_map['start']  
                    del conveyor_map['start']  
                    del conveyor_map['end']  
  
    queue = deque([(start, 0)])  
    visited = set()  
    visited.add(start)  
  
    while queue:  
        current, distance = queue.popleft()  
        if current == end:  
            return distance  
  
        for neighbor in get_neighbors(current):  
            if neighbor not in visited:  
                visited.add(neighbor)  
                queue.append((neighbor, distance + 1))  
  
        # Check if the current cell is a conveyor start point  
        if current in conveyor_map:  
            conveyor_destination = conveyor_map[current]  
            if conveyor_destination not in visited:  
                visited.add(conveyor_destination)  
                queue.append((conveyor_destination, distance + 1))  
  
    return float('inf')  
  
matrix = [  
    ['x', 'c', ' ', ' ', 'x'],  
    [' ', ' ', ' ', ' ', ' '],  
    ['x', 'x', 'c', ' ', ' '],  
    [' ', ' ', 'x', 'x', ' '],  
    ['c', ' ', ' ', ' ', ' ']  
]  
  
start = (1, 0)  
end = (3, 4)  
shortest_distance = find_fastest_path_with_conveyors(matrix, start, end)  
print("Fastest Path Distance with Conveyors:", shortest_distance)  


Fastest Path Distance with Conveyors: 6


In [8]:
from collections import deque  
  
def find_fastest_path_with_conveyors(matrix, start, end):  
    def get_neighbors(node):  
        directions = [(1, 0), (-1, 0), (0, 1), (0, -1)]  
        neighbors = []  
        for direction in directions:  
            neighbor = (node[0] + direction[0], node[1] + direction[1])  
            if 0 <= neighbor[0] < len(matrix) and 0 <= neighbor[1] < len(matrix[0]):  
                if matrix[neighbor[0]][neighbor[1]] == ' ' or matrix[neighbor[0]][neighbor[1]].startswith('c'):  
                    neighbors.append(neighbor)  
        return neighbors  
  
    # Mapping conveyor points  
    conveyor_map = {}  
    for i in range(len(matrix)):  
        for j in range(len(matrix[0])):  
            if matrix[i][j].startswith('c'):  
                if matrix[i][j] not in conveyor_map:  
                    conveyor_map[matrix[i][j]] = [(i, j)]  
                else:  
                    conveyor_map[matrix[i][j]].append((i, j))  
  
    # Create a dictionary that maps each conveyor point to its corresponding endpoint  
    conveyor_endpoints = {}  
    for points in conveyor_map.values():  
        if len(points) == 2:  
            conveyor_endpoints[points[0]] = points[1]  
            conveyor_endpoints[points[1]] = points[0]  
  
    queue = deque([(start, 0)])  
    visited = set()  
    visited.add(start)  
  
    while queue:  
        current, distance = queue.popleft()  
        if current == end:  
            return distance  
  
        for neighbor in get_neighbors(current):  
            if neighbor not in visited:  
                visited.add(neighbor)  
                queue.append((neighbor, distance + 1))  
  
        # Check if the current cell is a conveyor start point  
        if current in conveyor_endpoints:  
            conveyor_destination = conveyor_endpoints[current]  
            if conveyor_destination not in visited:  
                visited.add(conveyor_destination)  
                queue.append((conveyor_destination, distance + 1))  
  
    return float('inf')  
  
matrix = [  
    ['x', 'c1', ' ', ' ', 'x'],  
    [' ', ' ', ' ', ' ', 'c2'],  
    ['x', 'x', 'c1', ' ', ' '],  
    [' ', ' ', 'x', 'x', ' '],  
    ['c2', ' ', ' ', ' ', ' ']  
]  
  
start = (0, 2)  
end = (4, 1)  
shortest_distance = find_fastest_path_with_conveyors(matrix, start, end)  
print("Fastest Path Distance with Multiple Conveyors:", shortest_distance)  


Fastest Path Distance with Multiple Conveyors: 5


In [11]:
from collections import deque  
  
def find_fastest_path_with_conveyors(matrix, start, end):  
    def get_neighbors(node):  
        directions = [(1, 0), (-1, 0), (0, 1), (0, -1)]  
        neighbors = []  
        for direction in directions:  
            neighbor = (node[0] + direction[0], node[1] + direction[1])  
            if 0 <= neighbor[0] < len(matrix) and 0 <= neighbor[1] < len(matrix[0]):  
                if matrix[neighbor[0]][neighbor[1]] == ' ' or matrix[neighbor[0]][neighbor[1]].startswith('c'):  
                    neighbors.append(neighbor)  
        return neighbors  
  
    # Mapping conveyor points  
    conveyor_map = {}  
    for i in range(len(matrix)):  
        for j in range(len(matrix[0])):  
            if matrix[i][j].startswith('c'):  
                if matrix[i][j] not in conveyor_map:  
                    conveyor_map[matrix[i][j]] = [(i, j)]  
                else:  
                    conveyor_map[matrix[i][j]].append((i, j))  
  
    # Create a dictionary that maps each conveyor point to its corresponding endpoint  
    conveyor_endpoints = {}  
    for points in conveyor_map.values():  
        if len(points) == 2:  
            conveyor_endpoints[points[0]] = points[1]  
            conveyor_endpoints[points[1]] = points[0]  
  
    queue = deque([(start, 0)])  
    visited = set()  
    visited.add(start)  
  
    while queue:  
        current, distance = queue.popleft()  
        if current == end:  
            return distance  
  
        for neighbor in get_neighbors(current):  
            if neighbor not in visited:  
                visited.add(neighbor)  
                queue.append((neighbor, distance + 1))  
  
        # Check if the current cell is a conveyor start point  
        if current in conveyor_endpoints:  
            conveyor_destination = conveyor_endpoints[current]  
            if conveyor_destination not in visited:  
                visited.add(conveyor_destination)  
                queue.append((conveyor_destination, distance + 1))  
  
    return float('inf')  
  
matrix = [  
    ['x', 'c1', ' ', ' ', 'x'],  
    [' ', ' ', ' ', ' ', ' '],  
    ['x', 'x', 'c2', ' ', ' '],  
    [' ', ' ', 'x', 'x', ' '],  
    ['c1', ' ', ' ', 'c2', ' ']  
]  
  
start = (1,0)  
end = (4, 4)  
shortest_distance = find_fastest_path_with_conveyors(matrix, start, end)  
print("Fastest Path Distance with Multiple Conveyors:", shortest_distance)  


Fastest Path Distance with Multiple Conveyors: 5


In [12]:
from collections import deque  
  
def find_fastest_path_with_conveyors(matrix, start, end):  
    def get_neighbors(node):  
        directions = [(1, 0), (-1, 0), (0, 1), (0, -1)]  
        neighbors = []  
        for direction in directions:  
            neighbor = (node[0] + direction[0], node[1] + direction[1])  
            if 0 <= neighbor[0] < len(matrix) and 0 <= neighbor[1] < len(matrix[0]):  
                if matrix[neighbor[0]][neighbor[1]] == ' ' or matrix[neighbor[0]][neighbor[1]].startswith('c'):  
                    neighbors.append(neighbor)  
        return neighbors  
  
    # Mapping conveyor points  
    conveyor_map = {}  
    for i in range(len(matrix)):  
        for j in range(len(matrix[0])):  
            if matrix[i][j].startswith('c'):  
                if matrix[i][j] not in conveyor_map:  
                    conveyor_map[matrix[i][j]] = [(i, j)]  
                else:  
                    conveyor_map[matrix[i][j]].append((i, j))  
  
    # Create a dictionary that maps each conveyor point to its corresponding endpoint  
    conveyor_endpoints = {}  
    for points in conveyor_map.values():  
        if len(points) == 2:  
            conveyor_endpoints[points[0]] = points[1]  
            conveyor_endpoints[points[1]] = points[0]  
  
    queue = deque([(start, 0)])  
    visited = set()  
    visited.add(start)  
    parent = {start: None}  
  
    while queue:  
        current, distance = queue.popleft()  
        if current == end:  
            # Reconstruct the path  
            path = []  
            while current is not None:  
                path.append(current)  
                current = parent[current]  
            path.reverse()  
            return distance, path  
  
        for neighbor in get_neighbors(current):  
            if neighbor not in visited:  
                visited.add(neighbor)  
                parent[neighbor] = current  
                queue.append((neighbor, distance + 1))  
  
        # Check if the current cell is a conveyor start point  
        if current in conveyor_endpoints:  
            conveyor_destination = conveyor_endpoints[current]  
            if conveyor_destination not in visited:  
                visited.add(conveyor_destination)  
                parent[conveyor_destination] = current  
                queue.append((conveyor_destination, distance + 1))  
  
    return float('inf'), []  
  
matrix = [  
    ['x', 'c1', ' ', ' ', 'x'],  
    [' ', ' ', ' ', ' ', ' '],  
    ['x', 'x', 'c2', ' ', ' '],  
    [' ', ' ', 'x', 'x', ' '],  
    ['c1', ' ', ' ', 'c2', ' ']  
]  
  
start = (1, 0)  
end = (4, 4)  
shortest_distance, path = find_fastest_path_with_conveyors(matrix, start, end)  
print("Fastest Path Distance with Multiple Conveyors:", shortest_distance)  
print("Path Taken:", path)  


Fastest Path Distance with Multiple Conveyors: 5
Path Taken: [(1, 0), (1, 1), (1, 2), (2, 2), (4, 3), (4, 4)]


In [1]:
import os

def remove_highlighted_files(pdf_dir):
    # Walk through the directory and subdirectories
    for root, dirs, files in os.walk(pdf_dir):
        for file in files:
            if file.startswith("highlighted_"):
                file_path = os.path.join(root, file)
                os.remove(file_path)
                print(f"Removed file: {file_path}")

pdf_dir = 'C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf'
remove_highlighted_files(pdf_dir)


Removed file: C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf\Part_I\Affiliate_Agreements\highlighted_CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf
Removed file: C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf\Part_I\Affiliate_Agreements\highlighted_CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf
Removed file: C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf\Part_I\Affiliate_Agreements\highlighted_DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf
Removed file: C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf\Part_I\Affiliate_Agreements\highlighted_LinkPlusCorp_20050802_8-K_EX-10_3240252_EX-10_Affiliate Agreement.pdf
Removed file: C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf\Part_I\Affiliate_Agreements\highlighted

: 

In [20]:
import fitz  # PyMuPDF  
import os  
  
def parse_pdfs(pdf_dir):  
    parsed_texts = {}  
      
    # Walk through the directory and subdirectories  
    for root, dirs, files in os.walk(pdf_dir):  
        for pdf_file in files:  
            if pdf_file.endswith('.pdf'):  
                pdf_path = os.path.join(root, pdf_file)  
                doc = fitz.open(pdf_path)  
                text = ""  
                for page in doc:  
                    text += page.get_text()  
                parsed_texts[pdf_path] = text  # Store the full path as the key  
  
    return parsed_texts  
  
pdf_dir = 'C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf'  
parsed_texts = parse_pdfs(pdf_dir)  


In [15]:
import spacy  
  
# Load pre-trained model  
nlp = spacy.load("en_core_web_sm")  
  
def extract_entities(text):  
    doc = nlp(text)  
    entities = {}  
    for ent in doc.ents:  
        entities[ent.label_] = ent.text  
    return entities  
  
# Extract entities from the parsed texts  
entities = {file: extract_entities(text) for file, text in parsed_texts.items()}  
entities

In [16]:
entities

{'CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf': {'DATE': '8/10/2007',
  'ORG': 'Continental',
  'GPE': 'S-1',
  'CARDINAL': '23',
  'WORK_OF_ART': 'Universal\xa0Movies',
  'PERSON': 'CREDITCARDS.COM',
  'ORDINAL': 'first',
  'MONEY': '55.00',
  'NORP': 'Continental',
  'PRODUCT': 'Marathon\xa0Marathon',
  'FAC': 'Park\xa0Avenue',
  'LOC': 'Marina',
  'TIME': 'last\xa0minute'},
 'CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf': {'CARDINAL': '10',
  'ORG': 'Technology',
  'DATE': '5/20/2014',
  'GPE': 'Company',
  'WORK_OF_ART': 'Service\xa0Fee',
  'PERSON': 'MA',
  'ORDINAL': 'first',
  'PRODUCT': 'Clients',
  'NORP': 'Company',
  'LAW': 'Section 1',
  'MONEY': '15,000.00',
  'PERCENT': '15%',
  'LANGUAGE': 'English'},
 'DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf': {'ORG': 'DIGITAL CINEMA DESTINATIONS CORP.',
  'DATE': 'January 1, 2011',
  'GPE': 'Bloomfield',
  'WOR

In [17]:
def classify_clauses(text, entities):  
    clauses = {}  
    for entity, value in entities.items():  
        if entity in text:  
            clauses[entity] = text.split(entity)[1].split('.')[0]  # simple clause extraction  
    return clauses  
  
# Classify clauses for each document  
classified_clauses = {file: classify_clauses(text, entities[file]) for file, text in parsed_texts.items()}  
classified_clauses

{'CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf': {'ORG': 'AN\n•\xa0CHASE\n•\xa0MANHATTAN\n•\xa0AARP\n•\xa0AMAZON',
  'FAC': 'T\xa0THAT\xa0THIS\xa0AGREEMENT\xa0HAS\xa0BEEN\xa0DRAFTED\xa0BY\xa0CHASE,\xa0AND\xa0IT\xa0SHALL\nNOT\xa0ASSERT\xa0THAT\xa0THIS\xa0AGREEMENT\xa0IS\xa0UNENFORCEABLE\xa0OR\xa0INVALID\xa0ON\xa0THE\xa0GROUNDS\xa0THAT\xa0IT\xa0IS\xa0A\xa0CONTRACT\xa0OF\xa0ADHESION,\nTHAT\xa0IT\xa0IS\xa0UNCONSCIONABLE,\xa0OR\xa0ANY\xa0SIMILAR\xa0THEORY',
  'TIME': '\xa0(DIRECTLY\xa0OR\nINDIRECTLY)\xa0SOLICIT\xa0CUSTOMER\xa0REFERRALS\xa0ON\xa0TERMS\xa0THAT\xa0MAY\xa0DIFFER\xa0FROM\xa0THOSE\xa0CONTAINED\xa0IN\xa0THIS\xa0AGREEMENT'},
 'CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf': {'DATE': ' OF SHIPMENT',
  'PERSON': 'S\xa0–\xa0MA\xa0shall\xa0at\xa0all\xa0times\xa0have\xa0certified\xa0sales\xa0persons\xa0trained\xa0by\nCompany on staff in accordance with certain minimums defined by each Purchase Level'},
 'DigitalCine

In [22]:
def compare_clauses(template, clauses):  
    differences = {}  
    for clause, content in template.items():  
        if clause in clauses and clauses[clause] != content:  
            differences[clause] = {'template': content, 'document': clauses[clause]}  
    return differences  
  
# Define a simple template for demonstration purposes  
template = {  
    'EFFECTIVE_DATE': 'January 1, 2020',  
    'PARTIES': 'Company A and Company B',  
    # Add more template clauses as needed  
}  
  
# Compare each document's clauses with the template  
differences = {file: compare_clauses(template, classified_clauses[file]) for file in classified_clauses}  


In [25]:
def highlight_differences(differences):  
    for pdf_path, diff in differences.items():  
        doc = fitz.open(pdf_path)  
        for page in doc:  
            for clause, contents in diff.items():  
                text_instances = page.search_for(contents['document'])  
                for inst in text_instances:  
                    highlight = page.add_highlight_annot(inst)  
                    highlight.update()  
        output_path = os.path.join(os.path.dirname(pdf_path), 'highlighted_' + os.path.basename(pdf_path))  
        doc.save(output_path)  
  
# highlight_differences(differences)  


In [24]:
pdf_dir = 'C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf'  
parsed_texts = parse_pdfs(pdf_dir)  
  
# Proceed with NER, classification, comparison, and highlighting as outlined  
entities = {file: extract_entities(text) for file, text in parsed_texts.items()}  
classified_clauses = {file: classify_clauses(text, entities[file]) for file, text in parsed_texts.items()}  
differences = {file: compare_clauses(template, classified_clauses[file]) for file in classified_clauses}  
  
highlight_differences(differences)  


In [26]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Extract text from both PDFs
original_pdf_path = 'C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf/Part_I/Affiliate_Agreements/DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf'
highlighted_pdf_path = 'C:/Users/akash/Desktop/IntelUnnaati-BusinessValidation/CUAD_v1/full_contract_pdf/Part_I/Affiliate_Agreements/highlighted_DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf'

original_text = extract_text_from_pdf(original_pdf_path)
highlighted_text = extract_text_from_pdf(highlighted_pdf_path)

# Compare texts to find differences
import difflib

differ = difflib.Differ()
diff = differ.compare(original_text.splitlines(), highlighted_text.splitlines())
diff_result = '\n'.join(diff)

diff_result


"  \xa0 \n  DIGITAL CINEMA DESTINATIONS CORP. \n  \xa0 \n  NETWORK AFFILIATE AGREEMENT \n  \xa0 \n  THIS NETWORK AFFILIATE AGREEMENT (this “Agreement”) is made as of this 14th day of March, 2011 by and between National CineMedia, \n  LLC, a Delaware limited liability company (“NCM”), and Digital Cinema Destinations Corp., a Delaware corporation (“Network Affiliate”\xa0and with \n  NCM, each a “Party”\xa0and collectively, the “Parties”). \n  \xa0 \n  BACKGROUND \n  \xa0 \n  WHEREAS, NCM operates a “Digital Content Network”\xa0of proprietary and third-party hardware and software pursuant to which the \n  Service may be digitally transmitted to equipment and facilities installed in, and displayed on movie screens, video display terminals and similar \n  equipment located in, movie theatres or other high traffic retail establishments, as further described herein; \n  \xa0 \n  WHEREAS, Network Affiliate owns and operates a theatre circuit with a patron base in excess of 400,000 patrons; and