### Import libraries

In [1]:
# Load required libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.tree import Tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from docx import Document

### Download NLTK resources (One time)

In [2]:
# One-time setup: uncomment the lines below and run this cell once to download the NLTK resources
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt_tab')
#nltk.download('averaged_perceptron_tagger_eng')
#nltk.download('maxent_ne_chunker_tab')

### Customize

In [3]:
# --- User-tunable settings -------------------------------------------------

# File name of the Word report written at the end of the notebook
my_output_report = "0907 Compliance_Automation_Report.docx"

# Labeled internal messages used as training data.
# Each entry is (message text, label) where 1 = non-compliant, 0 = compliant.
my_labeled_messages = [
    ("The transaction was approved by the manager after the cut-off time.", 1),
    ("We reported all client data transfers to the compliance desk.", 0),
    ("Funds were moved without notifying compliance.", 1),
    ("Customer complaints were logged and forwarded.", 0),
    ("Executive approval was bypassed for this urgent request.", 1),
    ("Quarterly audit data submitted as per guidelines.", 0),
    ("There was no report submitted for the flagged transaction.", 1),
    ("New compliance rules were implemented by the team.", 0),
    ("The request was executed even though review was pending.", 1),
    ("The report was checked and signed off.", 0),
]

# Phrases whose presence in a message is treated as a red flag
my_flags = [
    "without notifying",
    "bypassed",
    "no report",
    "review was pending",
    "unreported",
    "not informed",
]

# Unlabeled messages that the notebook analyzes and reports on
my_text_messages = [
    "Transaction was completed without notifying compliance team.",
    "Audit report submitted to the risk committee.",
    "The review was pending, but funds were already moved.",
    "Customer concerns were documented and handled.",
    "Policy update approved by Director Smith on April 3rd.",
]



### Load labeled training data

In [4]:
# Sample labeled communications: (message text, label) pairs from the Customize cell
messages = my_labeled_messages

# Split text and labels for ML training
# zip(*...) transposes the pairs into two parallel tuples: texts and labels
texts, labels = zip(*messages)

### Preprocess function 

In [5]:
# Preprocessing Function to clean, tokenize, remove stopwords, and lemmatize

# Lazily-built resources shared by every preprocess() call
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stopword set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both objects before storing either, so a failed corpus lookup
        # leaves the cache empty and the next call retries cleanly.
        stops = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_CACHE["stops"] = stops
        _PREPROCESS_CACHE["lemmatizer"] = lemmatizer
    return _PREPROCESS_CACHE["stops"], _PREPROCESS_CACHE["lemmatizer"]


def preprocess(text):
    """Normalize a message into a space-separated string of lemmas.

    Steps: lowercase, tokenize, keep purely alphabetic tokens, drop English
    stopwords, lemmatize each remaining token.

    The stopword list and lemmatizer are created once and reused, instead of
    being rebuilt on every call.

    Args:
        text: Raw message string.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    stops, lemmatizer = _preprocess_resources()
    tokens = word_tokenize(text.lower())
    # isalpha() discards punctuation, numbers and mixed tokens such as "3rd"
    kept = (w for w in tokens if w.isalpha() and w not in stops)
    return ' '.join(lemmatizer.lemmatize(w) for w in kept)

# Run every training message through the cleaning pipeline, keeping input order
preprocessed_texts = list(map(preprocess, texts))

### Train a Logistic Regression Model

In [6]:
# Convert text to TF-IDF vectors, then train classifier
# The fitted vectorizer is reused later to transform the messages under analysis,
# so its vocabulary comes only from the preprocessed training texts.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_texts)
# Labels as configured: 1 = non-compliant, 0 = compliant
y = labels

# Logistic regression with library-default settings, fit on the labeled examples
model = LogisticRegression()
model.fit(X, y)

### Defining rules for detection

In [7]:
# Rule-Based Phrase Detection

# Load list of flagged phrases
# (alias of my_flags from the Customize cell; rule_based_check reads this name)
rule_based_flags = my_flags

# Function to match known red flag phrases
def rule_based_check(text, phrases=None):
    """Scan a message for known red-flag phrases (case-insensitive).

    Args:
        text: Message to scan.
        phrases: Optional iterable of phrases to look for. When omitted, the
            module-level ``rule_based_flags`` list is used.

    Returns:
        ``(True, phrase)`` for the first phrase, in list order, that occurs in
        the message; the phrase is returned exactly as configured.
        ``(False, None)`` when no phrase matches.
    """
    if phrases is None:
        phrases = rule_based_flags

    lowered = text.lower()
    for phrase in phrases:
        # Lowercase the phrase too: the text is lowercased, so a configured
        # phrase containing capitals could otherwise never match.
        needle = phrase.lower()
        # An empty phrase is a substring of every string and a blank one
        # matches any text with a space; both would flag every message.
        if needle.strip() and needle in lowered:
            return True, phrase
    return False, None


### Entity Extraction

In [8]:
# Extract named entities (e.g. PERSON, ORGANIZATION, GPE) using NLTK's ne_chunk

def extract_named_entities(text):
    """Return the named entities that NLTK's chunker finds in a message.

    The text is tokenized, part-of-speech tagged, and passed to ``ne_chunk``
    with ``binary=False``. Every direct child of the result that is a ``Tree``
    is reported as one entity.

    Args:
        text: Message to analyze (original casing is passed through as-is).

    Returns:
        A list of ``(entity_name, entity_type)`` tuples in order of appearance,
        where ``entity_name`` is the entity's tokens joined by spaces and
        ``entity_type`` is the subtree label. Empty when nothing is found.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    chunked = ne_chunk(tagged_tokens, binary=False)

    # Plain (token, tag) tuples are ordinary words; only Tree nodes are entities
    return [
        (" ".join(word for word, _tag in node.leaves()), node.label())
        for node in chunked
        if isinstance(node, Tree)
    ]

### Analyze, write, and save report

In [9]:
# Messages the system will analyze (set in the Customize cell)
test_messages = my_text_messages

# Run analysis and write report

doc = Document()
doc.add_heading("Automated Compliance Breach Detection Report", 0)


def _report(line):
    """Echo a finding to the console and append it to the Word report."""
    print(line)
    doc.add_paragraph(line)


# predict_proba columns follow the order of model.classes_, so map each label
# to its column explicitly instead of assuming the label value is the index.
class_column = {label: col for col, label in enumerate(model.classes_)}

print("\n Compliance Analysis:\n")

for i, msg in enumerate(test_messages):
    print(f"🔹 Message {i+1}: {msg}")
    doc.add_heading(f"Message {i+1}", level=1)
    doc.add_paragraph(f"Text: {msg}")

    # Predict using the trained ML model; confidence is the probability the
    # model assigns to the class it predicted
    vec = vectorizer.transform([preprocess(msg)])
    prediction = model.predict(vec)[0]
    prob = model.predict_proba(vec)[0][class_column[prediction]]
    ml_result = "Non-Compliant" if prediction == 1 else "Compliant"

    _report(f"ML Prediction: {ml_result} (Confidence: {prob:.2f})")

    # Apply rule-based check (independent of the ML prediction)
    flagged, phrase = rule_based_check(msg)
    if flagged:
        _report(f"Rule-Based Alert: Phrase matched → '{phrase}'")
    else:
        # Negative results are recorded in the report only, not on the console
        doc.add_paragraph("No rule-based red flag detected.")

    # Extract entities using NLTK
    ents = extract_named_entities(msg)

    if ents:
        print("Entities found:", ents)
        ent_text = ", ".join(f"{name} ({kind})" for name, kind in ents)
        doc.add_paragraph("Named Entities: " + ent_text)
    else:
        doc.add_paragraph("No named entities found.")

    print("-" * 50)
    # Empty paragraph as a visual separator between messages in the report
    doc.add_paragraph("")

# Save Report to Word Document

doc.save(my_output_report)
print(f"\n📄 Report saved as {my_output_report}")



 Compliance Analysis:

🔹 Message 1: Transaction was completed without notifying compliance team.
ML Prediction: Non-Compliant (Confidence: 0.55)
Rule-Based Alert: Phrase matched → 'without notifying'
Entities found: [('Transaction', 'GPE')]
--------------------------------------------------
🔹 Message 2: Audit report submitted to the risk committee.
ML Prediction: Compliant (Confidence: 0.52)
Entities found: [('Audit', 'GPE')]
--------------------------------------------------
🔹 Message 3: The review was pending, but funds were already moved.
ML Prediction: Non-Compliant (Confidence: 0.59)
Rule-Based Alert: Phrase matched → 'review was pending'
--------------------------------------------------
🔹 Message 4: Customer concerns were documented and handled.
ML Prediction: Compliant (Confidence: 0.55)
Entities found: [('Customer', 'GPE')]
--------------------------------------------------
🔹 Message 5: Policy update approved by Director Smith on April 3rd.
ML Prediction: Non-Compliant (Confi