In [None]:
# ! pip install python-doctr mplcursors matplotlib

In [None]:
# Colab-only setup: mount Google Drive at /content/drive/ so the cells below
# can read the OCR output and write their CSV results there.
# NOTE(review): later cells use relative 'drive/MyDrive/...' paths, which
# presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


#### Task 7: Determine the length of the agreement

In [None]:
import os
import json
import pandas as pd

# Task 7: measure the length of every agreement (pages and OCR'd words) and
# export one row per document.
# Expected layout: <root_dir>/<state folder>/<agreement>.json, where each JSON
# file is an OCR export shaped as pages -> blocks -> lines -> words.
root_dir = 'drive/MyDrive/NLP/OCR_output'

# One record per agreement; converted to a DataFrame once at the end.
data_entries = []

for state_folder in os.listdir(root_dir):
    state_path = os.path.join(root_dir, state_folder)

    # Skip stray files sitting next to the state folders: calling
    # os.listdir() on a non-directory raises NotADirectoryError.
    if not os.path.isdir(state_path):
        continue

    for filename in os.listdir(state_path):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(state_path, filename)

        with open(file_path, 'r', encoding='utf-8') as f:
            json_output = json.load(f)

        num_pages = len(json_output['pages'])

        # A document's length in words is the number of OCR word entries
        # summed over every line of every block of every page.
        total_words = sum(
            len(line['words'])
            for page in json_output['pages']
            for block in page['blocks']
            for line in block['lines']
        )

        data_entries.append({
            'State': state_folder,
            'Filename': filename,
            'Num_Pages': num_pages,
            'Total_Words': total_words
        })

# Explicit columns keep the CSV header intact even if no agreement was found.
df = pd.DataFrame(
    data_entries,
    columns=['State', 'Filename', 'Num_Pages', 'Total_Words']
)

df.to_csv('drive/MyDrive/NLP/Agreements_metadata.csv', index=False)

#### Task 9: Analysis of Frequency of Recurring Clauses

In [None]:
# !pip install sentence-transformers hdbscan

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import hdbscan

# Task 9: find recurring clauses by embedding every OCR block (treated as one
# clause/paragraph) and clustering the embeddings.
ROOT_DIR = 'drive/MyDrive/NLP/OCR_output'
MIN_WORDS_IN_CLAUSE = 10  # ignoring short lines (headers, page numbers)
MIN_CLUSTER_SIZE = 2      # at least 2 similar clauses are needed to form a cluster

documents_data = []    # stores metadata for every clause found
corpus_sentences = []  # stores the actual text for embedding (same order as documents_data)

for state_folder in os.listdir(ROOT_DIR):
    state_path = os.path.join(ROOT_DIR, state_folder)

    # Skip stray files next to the state folders; os.listdir() on a
    # non-directory raises NotADirectoryError.
    if not os.path.isdir(state_path):
        continue

    for filename in os.listdir(state_path):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(state_path, filename)

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # iterate through blocks (paragraphs); pages are reported 1-based
        for page_num, page in enumerate(data['pages'], start=1):
            for block in page['blocks']:

                # reconstruct text from the block: words joined into lines,
                # lines joined into a single paragraph string
                block_text = " ".join(
                    " ".join(w['value'] for w in line['words'])
                    for line in block['lines']
                ).strip()

                if len(block_text.split()) >= MIN_WORDS_IN_CLAUSE:
                    corpus_sentences.append(block_text)
                    documents_data.append({
                        'State': state_folder,
                        'Filename': filename,
                        'Page': page_num,
                        'Text': block_text
                    })

# Fail early with a clear message instead of an opaque error from the
# clusterer when nothing qualified as a clause.
if not corpus_sentences:
    raise ValueError(
        f"No clauses with at least {MIN_WORDS_IN_CLAUSE} words found under {ROOT_DIR}"
    )

model = SentenceTransformer('all-MiniLM-L6-v2') # fast, lightweight model
embeddings = model.encode(corpus_sentences, show_progress_bar=True)

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,
    min_samples=1,
    metric='euclidean'
)
cluster_labels = clusterer.fit_predict(embeddings)

df = pd.DataFrame(documents_data)
df['Cluster_ID'] = cluster_labels

# A document is identified by state + file name, because the same file name
# may occur under more than one state folder.
document_keys = df['State'] + '/' + df['Filename']

# Number of DISTINCT documents each cluster appears in (indexed by Cluster_ID).
# Counting rows instead would let several similar blocks from one agreement
# inflate the cluster and make it look like boilerplate when it is compared
# against the document total below.
cluster_counts = document_keys.groupby(df['Cluster_ID']).nunique()
total_docs = document_keys.nunique()

def classify_frequency(cluster_id, count):
    """Map a clause cluster to a human-readable frequency label.

    Args:
        cluster_id: Cluster label of the clause; -1 is the label HDBSCAN
            assigns to points that belong to no cluster.
        count: Occurrence count of the cluster, compared against 80% of the
            module-level ``total_docs``.

    Returns:
        "Rarely (Unique/Custom)" for unclustered clauses,
        "Always (Boilerplate)" when ``count`` is at least 80% of
        ``total_docs``, otherwise "Often (Standard Clause)".
    """
    # Unclustered clauses have no counterpart elsewhere in the corpus.
    if cluster_id == -1:
        return "Rarely (Unique/Custom)"

    # The cutoff is inclusive: exactly 80% already counts as boilerplate.
    boilerplate_cutoff = total_docs * 0.8
    return (
        "Always (Boilerplate)"
        if count >= boilerplate_cutoff
        else "Often (Standard Clause)"
    )

# Label every clause row from its cluster id and that cluster's count.
df['Frequency_Label'] = [
    classify_frequency(cluster_id, cluster_counts[cluster_id])
    for cluster_id in df['Cluster_ID']
]

# Group clauses of the same cluster together in the exported file.
df = df.sort_values(by='Cluster_ID')

df.to_csv('drive/MyDrive/NLP/clause_analysis.csv', index=False)


#### Task 1: Identification of Areas of Cooperation

In [None]:
# !pip install transformers torch

In [None]:
import os
import json
import pandas as pd
from transformers import pipeline

# Task 1: tag each agreement with its areas of cooperation using zero-shot
# classification over the beginning of its OCR text.
ROOT_DIR = 'drive/MyDrive/NLP/OCR_output'
output_file = 'drive/MyDrive/NLP/cooperation_areas.csv'

MAX_CHARS = 3000        # only the first MAX_CHARS characters of a document are classified
SCORE_THRESHOLD = 0.5   # a topic is "active" when its score exceeds this value

CANDIDATE_LABELS = [
    "Culture & Arts",
    "Education & Students",
    "Trade & Economic Development",
    "Environment & Green Energy",
    "Tourism",
    "Infrastructure & Transport",
    "Technology & Innovation",
    "Public Health",
    "Friendship & Goodwill"
]

RESULT_COLUMNS = ['State', 'Filename', 'Detected_Topics', 'Top_Topic', 'Top_Score']


def _extract_full_text(ocr_data):
    """Return all OCR words of a document as one space-separated string.

    Args:
        ocr_data: Parsed OCR JSON shaped as pages -> blocks -> lines -> words,
            where every word is a dict with a 'value' key.

    Returns:
        The words of each line joined by spaces, and the lines joined by
        spaces in document order; an empty string if there are no lines.
    """
    return " ".join(
        " ".join(w['value'] for w in line['words'])
        for page in ocr_data['pages']
        for block in page['blocks']
        for line in block['lines']
    )


classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

results = []

for state_folder in os.listdir(ROOT_DIR):
    state_path = os.path.join(ROOT_DIR, state_folder)
    if not os.path.isdir(state_path):
        continue

    for filename in os.listdir(state_path):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(state_path, filename)

        # Best-effort: a failure on one document is reported and the loop
        # moves on to the next one.
        try:
            # Load OCR Data
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            full_text = _extract_full_text(data)

            # A document without any OCR text cannot be classified
            # meaningfully; skip it rather than record an arbitrary topic.
            if not full_text.strip():
                print(f"Skipping {filename}: no OCR text found")
                continue

            truncated_text = full_text[:MAX_CHARS]

            # multi_label=True scores every label independently, so several
            # topics can be active for the same agreement.
            prediction = classifier(truncated_text, CANDIDATE_LABELS, multi_label=True)

            scores = dict(zip(prediction['labels'], prediction['scores']))

            # Only keep topics scoring above the confidence threshold
            active_topics = [
                label for label, score in scores.items() if score > SCORE_THRESHOLD
            ]

            # If nothing scored high, grab the top 1 (the pipeline returns
            # labels ordered from highest to lowest score)
            if not active_topics:
                active_topics = [prediction['labels'][0]]

            results.append({
                'State': state_folder,
                'Filename': filename,
                'Detected_Topics': ", ".join(active_topics),
                'Top_Topic': prediction['labels'][0],
                'Top_Score': round(prediction['scores'][0], 3)
            })

            print(f"Processed {filename}: {active_topics}")

            # Checkpoint after every document so partial results are already
            # on disk if the run is interrupted.
            pd.DataFrame(results, columns=RESULT_COLUMNS).to_csv(output_file, index=False)

        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Final export; explicit columns keep the header even when nothing was processed.
df = pd.DataFrame(results, columns=RESULT_COLUMNS)
df.to_csv(output_file, index=False)

