### Import libraries

In [1]:
# Import required libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from docx import Document
from difflib import SequenceMatcher

### Download data file (one time)

In [2]:
# Download the NLTK resources used by this notebook. nltk.download() skips
# packages that are already up to date, so this cell is safe to re-run.
nltk.download('punkt')
# Recent NLTK releases (the ones that ship the *_tab / *_eng resources, such as
# 'maxent_ne_chunker_tab' below) load the tokenizer from 'punkt_tab'.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
# pos_tag() needs the perceptron tagger model; without it the NER helper
# raises LookupError on a fresh environment.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\veena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\veena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\veena\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\veena\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\veena\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Customize parameters

In [3]:
# Customize parameters
# Edit the values in this cell to adapt the notebook to other policies,
# regulations, or report names; the later cells only read these names.

# A policy clause is reported as matching a regulation when their similarity
# ratio is strictly greater than this value.
my_similarity_threshold = 0.6

# File name of the generated Word report.
my_report = "0906 NLP_Compliance_Report.docx"

# Internal policy clauses to be reviewed.
my_policy_clauses = [
    "The institution shall maintain client data for a minimum of 3 years, unless otherwise mandated by the jurisdiction.",
    "All transactions above $10,000 must be reported to the compliance officer within 24 hours.",
    "Sensitive employee information may be disclosed only upon executive approval.",
    "No monitoring is required for accounts inactive for more than 12 months.",
    "Customer complaints will not be logged if received verbally without supporting documentation.",
]

# Regulatory requirements the policy clauses are compared against.
my_regulatory_clauses = [
    "Client data must be retained for at least 5 years.",
    "All cash transactions above $10,000 must be reported immediately.",
    "Disclosure of employee information must be governed by consent and legal necessity.",
    "Accounts inactive for more than 6 months must undergo periodic review.",
    "Customer complaints must be logged and responded to regardless of format.",
]

# Lower-case phrases searched for (as substrings) in each lower-cased clause.
my_red_flags = [
    "unless",
    "not required",
    "only upon",
    "no monitoring",
    "not be logged",
]

### Load policy and regulatory clauses

In [4]:
# Bind the customizable inputs to the working names used by the analysis
# cells: internal policy clauses first, regulatory requirements second.
policy_clauses, regulatory_requirements = my_policy_clauses, my_regulatory_clauses

### Define helper functions and thresholds

In [5]:
# Helper Functions

def preprocess_text(text):
    """Reduce a sentence to a list of lemmatized content words.

    The text is tokenized with NLTK, tokens that are not purely alphabetic
    are discarded, the rest are lower-cased, English stopwords are removed,
    and each remaining word is lemmatized with the WordNet lemmatizer.

    Args:
        text: The sentence or clause to process.

    Returns:
        A list of lemmatized, lower-case words in their original order.
    """
    alphabetic_words = [tok.lower() for tok in word_tokenize(text) if tok.isalpha()]
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()
    return [
        wordnet.lemmatize(word)
        for word in alphabetic_words
        if word not in english_stopwords
    ]

def sentence_similarity(sent1, sent2):
    """Compute how similar two sentences are as character sequences.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        The difflib.SequenceMatcher ratio for the pair, a float in [0, 1]
        where 1.0 means the two sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=sent1, b=sent2)
    return matcher.ratio()

def extract_named_entities(text):
    """List the named entities NLTK finds in a piece of text.

    The text is tokenized, part-of-speech tagged, and passed through NLTK's
    named-entity chunker with binary=False so each chunk keeps its own label.

    Args:
        text: The sentence or clause to analyse.

    Returns:
        A list of strings formatted as "<entity words> (<label>)", one per
        top-level chunk in the parse, in order of appearance. The list is
        empty when no chunk is found.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)), binary=False)

    # Only Tree nodes are entity chunks; plain (token, tag) tuples are skipped.
    return [
        "{} ({})".format(" ".join(word for word, _tag in node.leaves()), node.label())
        for node in chunked
        if isinstance(node, Tree)
    ]


# Working names for the customizable similarity threshold and the list of
# red-flag phrases used by the analysis cell.
SIMILARITY_THRESHOLD = my_similarity_threshold
red_flags = my_red_flags

### Run analysis and prepare and save report

In [6]:
# Prepare Word Document and Console Output
doc = Document()
doc.add_heading('Compliance Monitoring NLP Report', 0)


def _report(document, message):
    """Write one report line to both the console and the Word document."""
    print(message)
    document.add_paragraph(message)


print("\n===========================\nCompliance Monitoring NLP Report\n===========================\n")

for i, clause in enumerate(policy_clauses):
    header = f"Clause {i+1}:"
    print(header)
    doc.add_heading(header, level=1)

    _report(doc, "Text: " + clause)

    # Preprocessing
    tokens = preprocess_text(clause)
    _report(doc, "Preprocessed Tokens: " + str(tokens))

    # Named Entity Recognition
    ents = extract_named_entities(clause)
    _report(doc, "Named Entities: " + (", ".join(ents) if ents else "None"))

    # Lower-case once; both the red-flag search and the similarity comparison
    # are case-insensitive.
    clause_lower = clause.lower()

    # Red flag detection: report every phrase found in the clause, not just
    # the first, so a clause with several risky phrases is fully flagged.
    detected_flags = [flag for flag in red_flags if flag in clause_lower]
    for flag in detected_flags:
        _report(doc, f"Red Flag Phrase Detected: '{flag}'")
    if not detected_flags:
        _report(doc, "No red flag phrases detected.")

    # Compare with regulatory requirements; every regulation whose similarity
    # is strictly above the threshold is listed.
    matched = False
    for reg in regulatory_requirements:
        similarity = sentence_similarity(clause_lower, reg.lower())
        if similarity > SIMILARITY_THRESHOLD:
            _report(doc, f"Matches Regulation (Similarity: {similarity:.2f}): {reg}")
            matched = True
    if not matched:
        _report(doc, "No strong match found in regulatory requirements. Review recommended.")

    # Console-only separator between clauses (not written to the document).
    print("-" * 60)

# Save Word Document
doc.save(my_report)
print(f"\n Report saved as {my_report}")


Compliance Monitoring NLP Report

Clause 1:
Text: The institution shall maintain client data for a minimum of 3 years, unless otherwise mandated by the jurisdiction.
Preprocessed Tokens: ['institution', 'shall', 'maintain', 'client', 'data', 'minimum', 'year', 'unless', 'otherwise', 'mandated', 'jurisdiction']
Named Entities: None
Red Flag Phrase Detected: 'unless'
No strong match found in regulatory requirements. Review recommended.
------------------------------------------------------------
Clause 2:
Text: All transactions above $10,000 must be reported to the compliance officer within 24 hours.
Preprocessed Tokens: ['transaction', 'must', 'reported', 'compliance', 'officer', 'within', 'hour']
Named Entities: None
No red flag phrases detected.
Matches Regulation (Similarity: 0.68): All cash transactions above $10,000 must be reported immediately.
------------------------------------------------------------
Clause 3:
Text: Sensitive employee information may be disclosed only upon ex