# Multi-Channel Embedding + BiLSTM (Semantic Diversity)

## Setup  

In [9]:
# ==== Setup ====
import os, random, re, json, math
import numpy as np
import pandas as pd
from pathlib import Path

# Reproducibility
SEED = 42
random.seed(SEED); np.random.seed(SEED)
import tensorflow as tf
tf.random.set_seed(SEED)

# Paths
DATA_PATH = Path(r"F:\semester 9\Legal_clause_NLP\legal_clauses_clean.csv")
ARTIFACT_DIR = Path(r"F:\semester 9\Legal_clause_NLP\artifacts_multichannel_bilstm")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

print("TF version:", tf.__version__)
print("Data file exists:", DATA_PATH.exists())


TF version: 2.20.0
Data file exists: True


## Load Cleaned Legal Clause Dataset

In [10]:
# ==== Load cleaned dataset (2 columns: clause_text, clause_type) ====
df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
print(df.columns.tolist())
display(df.head(5))

# Basic checks
assert set(df.columns)=={"clause_text","clause_type"}, "Dataset must have exactly clause_text, clause_type"
df = df.dropna(subset=["clause_text","clause_type"]).copy()
df["clause_text"] = df["clause_text"].astype(str).str.strip()
df["clause_type"] = df["clause_type"].astype(str).str.strip()
df = df[(df["clause_text"]!="") & (df["clause_type"]!="")]
print("After cleaning empties:", df.shape)

# Quick class distribution
counts = df["clause_type"].value_counts()
print("Unique clause types:", counts.shape[0])
print(counts.head(10))


Shape: (150865, 2)
['clause_text', 'clause_type']


Unnamed: 0,clause_text,clause_type
0,Absence of Certain Changes or Events. Except a...,absence_of_certain_changes_or_events
1,Absence of Certain Changes or Events. Since Ma...,absence_of_certain_changes_or_events
2,Absence of Certain Changes or Events. (a) Sinc...,absence_of_certain_changes_or_events
3,Absence of Certain Changes or Events. Since th...,absence_of_certain_changes_or_events
4,Absence of Certain Changes or Events. Except a...,absence_of_certain_changes_or_events


After cleaning empties: (150865, 2)
Unique clause types: 395
clause_type
time_of_essence                   630
time_of_the_essence               620
capitalized_terms                 590
definitions_and_interpretation    590
captions                          580
now_therefore                     570
headings                          570
exhibits                          570
section_headings                  570
definitions                       560
Name: count, dtype: int64


## Pair generation functions

In [11]:
# ==== Pair generation functions ====
def clean_text_minimal(t:str)->str:
    t = t.lower()
    t = re.sub(r"[^a-z0-9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

# group clauses by type
grouped = df.groupby("clause_type")["clause_text"].apply(list).to_dict()
types = list(grouped.keys())

def build_pairs(grouped_dict, max_pos_per_type=300, neg_per_type=300, seed=SEED):
    rng = random.Random(seed)
    pairs, labels = [], []
    all_types = list(grouped_dict.keys())
    for t in all_types:
        texts = grouped_dict[t]
        if len(texts) < 2:
            continue
        # Positive pairs (same type)
        pos_iters = min(max_pos_per_type, len(texts)//2)
        for _ in range(pos_iters):
            a, b = rng.sample(texts, 2)
            pairs.append((a, b)); labels.append(1)

        # Negative pairs (different types)
        for _ in range(neg_per_type):
            other = rng.choice(all_types)
            if other == t or len(grouped_dict[other])==0:
                continue
            a = rng.choice(texts)
            b = rng.choice(grouped_dict[other])
            pairs.append((a, b)); labels.append(0)

    pairs_df = pd.DataFrame(pairs, columns=["text1","text2"])
    pairs_df["label"] = labels
    # Drop exact duplicate rows if any
    pairs_df = pairs_df.drop_duplicates().reset_index(drop=True)

    return pairs_df

pairs_df = build_pairs(grouped, max_pos_per_type=300, neg_per_type=300)
print("Pairs shape:", pairs_df.shape)
display(pairs_df.sample(min(5, len(pairs_df)), random_state=SEED))
print(pairs_df["label"].value_counts(normalize=True))


Pairs shape: (193537, 3)


Unnamed: 0,text1,text2,label
137052,Publicity. Neither party will make any press r...,"Intellectual Property. (a) WWNI owns, or posse...",0
47180,Delegation of Duties. Notwithstanding anything...,Title. Each of the Credit Parties and each of ...,0
4568,AFFIRMATIVE COVENANTS. Until all obligations o...,Independent Contractor. While engaged in carry...,0
20787,Cancellation. 12.1 Either party hereto has the...,"Cancellation. After all Principal, accrued Int...",1
156200,Salary. The Company shall pay the Employee a b...,Section Headings. The article and section head...,0


label
0    0.610622
1    0.389378
Name: proportion, dtype: float64


## Tokenization 

In [12]:
# ==== Tokenization ====
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def basic_clean_series(s):
    return s.astype(str).apply(clean_text_minimal)

texts_all = pd.concat([pairs_df["text1"], pairs_df["text2"]], axis=0)
texts_all = basic_clean_series(texts_all)

NUM_WORDS = 30000
OOV_TOKEN = "<OOV>"

tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(texts_all.tolist())

def to_seqs(series):
    return tokenizer.texts_to_sequences(basic_clean_series(series))

seq1 = to_seqs(pairs_df["text1"])
seq2 = to_seqs(pairs_df["text2"])

lengths = [len(s) for s in seq1] + [len(s) for s in seq2]
p95 = int(np.percentile(lengths, 95))
MAX_LEN = max(120, min(300, p95))  # cap for stability
print("Tokenized vocab size (capped):", min(NUM_WORDS, len(tokenizer.word_index)+1))
print("Len stats: mean=%.1f, p95=%d, max=%d" % (np.mean(lengths), p95, max(lengths)))
print("Using MAX_LEN:", MAX_LEN)

X1 = pad_sequences(seq1, maxlen=MAX_LEN, padding="post", truncating="post")
X2 = pad_sequences(seq2, maxlen=MAX_LEN, padding="post", truncating="post")
y  = pairs_df["label"].values.astype(np.int32)

print("X shapes:", X1.shape, X2.shape, " y:", y.shape)

# Train/val split (stratified)
from sklearn.model_selection import train_test_split
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    X1, X2, y, test_size=0.2, random_state=SEED, stratify=y
)
X1_train.shape, X1_val.shape


Tokenized vocab size (capped): 30000
Len stats: mean=100.9, p95=311, max=691
Using MAX_LEN: 300
X shapes: (193537, 300) (193537, 300)  y: (193537,)


((154829, 300), (38708, 300))

## Building Siamese Encoder: Multi-Channel Embeddings + BiLSTM

In [13]:
# ==== Build Siamese Encoder: Multi-Channel Embeddings + BiLSTM ====
from tensorflow.keras.layers import (
    Input, Embedding, SpatialDropout1D, Bidirectional, LSTM,
    Dropout, Dense, Lambda, Concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D,
    LayerNormalization, BatchNormalization
)
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
import tensorflow.keras.backend as K

VOCAB_SIZE = min(NUM_WORDS, len(tokenizer.word_index)+1)
EMB_DIM = 128
LSTM_UNITS = 64
DENSE_UNITS = 64
DROPOUT_RATE = 0.5
L2 = 1e-5

# Shared encoder definition
def build_shared_encoder():
    inp = Input(shape=(MAX_LEN,), name="tokens")

    # Multi-channel: one static (non-trainable), one trainable
    emb_static = Embedding(VOCAB_SIZE, EMB_DIM, name="emb_static", trainable=False)(inp)
    emb_train  = Embedding(VOCAB_SIZE, EMB_DIM, name="emb_train",  trainable=True)(inp)

    # Optional: SpatialDropout to regularize embeddings
    emb_static = SpatialDropout1D(0.2)(emb_static)
    emb_train  = SpatialDropout1D(0.2)(emb_train)

    # Concatenate channels along features
    emb_cat = Concatenate(name="emb_concat")([emb_static, emb_train])  # shape: (T, 2*EMB_DIM)

    # BiLSTM with return_sequences=True, then pooling to capture both global max & mean
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(emb_cat)

    # LayerNorm to stabilize
    x = LayerNormalization()(x)

    # Feature pooling
    max_p = GlobalMaxPooling1D()(x)
    avg_p = GlobalAveragePooling1D()(x)
    x = Concatenate()([max_p, avg_p])  # shape: (2*LSTM_UNITS*2) = 256

    # MLP head inside encoder (small)
    x = Dense(DENSE_UNITS, activation="relu",
              kernel_regularizer=regularizers.l2(L2))(x)
    x = BatchNormalization()(x)
    x = Dropout(DROPOUT_RATE)(x)

    return Model(inp, x, name="shared_encoder")

encoder = build_shared_encoder()
encoder.summary()

# Siamese inputs
inp_a = Input(shape=(MAX_LEN,), name="input_a")
inp_b = Input(shape=(MAX_LEN,), name="input_b")
enc_a = encoder(inp_a)
enc_b = encoder(inp_b)

# Similarity features: |diff|, elementwise * , cosine similarity
abs_diff = Lambda(lambda t: K.abs(t[0] - t[1]))([enc_a, enc_b])
elem_mul = Lambda(lambda t: t[0] * t[1])([enc_a, enc_b])
# cosine similarity as a scalar feature
def cosine_sim(tensors):
    a, b = tensors
    a = K.l2_normalize(a, axis=-1)
    b = K.l2_normalize(b, axis=-1)
    return K.sum(a*b, axis=-1, keepdims=True)
cos_sim = Lambda(cosine_sim)([enc_a, enc_b])

merged = Concatenate()([enc_a, enc_b, abs_diff, elem_mul, cos_sim])

# Classifier head
z = Dense(128, activation="relu", kernel_regularizer=regularizers.l2(L2))(merged)
z = Dropout(0.5)(z)
z = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(L2))(z)
z = Dropout(0.3)(z)
out = Dense(1, activation="sigmoid")(z)

model = Model([inp_a, inp_b], out, name="Siamese_MultiChannel_BiLSTM")
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()


In [14]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

ckpt_path = str(ARTIFACT_DIR / "mc_bilstm_best.keras")
callbacks = [
    EarlyStopping(monitor="val_loss", patience=4, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-5, verbose=1),
    ModelCheckpoint(ckpt_path, monitor="val_loss", save_best_only=True, verbose=1)
]

history = model.fit(
    [X1_train, X2_train], y_train,
    validation_data=([X1_val, X2_val], y_val),
    epochs=20,
    batch_size=64,
    callbacks=callbacks,
    verbose=1
)

# Save final model + tokenizer
final_path = ARTIFACT_DIR / "mc_bilstm_final.keras"
model.save(str(final_path))
import joblib
joblib.dump(tokenizer, ARTIFACT_DIR / "tokenizer.pkl")
print("Saved:", final_path)


Epoch 1/20
[1m2420/2420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.7047 - loss: 0.5420
Epoch 1: val_loss improved from None to 0.20069, saving model to F:\semester 9\Legal_clause_NLP\artifacts_multichannel_bilstm\mc_bilstm_best.keras
[1m2420/2420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15166s[0m 6s/step - accuracy: 0.8204 - loss: 0.3643 - val_accuracy: 0.9016 - val_loss: 0.2007 - learning_rate: 0.0010
Epoch 2/20
[1m2420/2420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9187 - loss: 0.2024
Epoch 2: val_loss improved from 0.20069 to 0.15994, saving model to F:\semester 9\Legal_clause_NLP\artifacts_multichannel_bilstm\mc_bilstm_best.keras
[1m2420/2420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4583s[0m 1s/step - accuracy: 0.9219 - loss: 0.1944 - val_accuracy: 0.9294 - val_loss: 0.1599 - learning_rate: 0.0010
Epoch 3/20
[1m2420/2420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.9

KeyboardInterrupt: 

## Evaluation report , Confusion Matrix and Graphs (Training curves , ROC curve)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Predictions
y_prob = model.predict([X1_val, X2_val]).ravel()
y_pred = (y_prob >= 0.5).astype(int)

acc  = accuracy_score(y_val, y_pred)
prec = precision_score(y_val, y_pred)
rec  = recall_score(y_val, y_pred)
f1   = f1_score(y_val, y_pred)
auc  = roc_auc_score(y_val, y_prob)

print("Accuracy :", f"{acc:.4f}")
print("Precision:", f"{prec:.4f}")
print("Recall   :", f"{rec:.4f}")
print("F1-score :", f"{f1:.4f}")
print("ROC-AUC  :", f"{auc:.4f}")

print("\nClassification Report:\n", classification_report(y_val, y_pred, digits=4))

# Confusion matrix
cm = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:\n", cm)

# Training curves
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.plot(history.history["loss"], label="train loss")
plt.plot(history.history["val_loss"], label="val loss")
plt.title("Loss"); plt.legend()

plt.subplot(1,2,2)
plt.plot(history.history["accuracy"], label="train acc")
plt.plot(history.history["val_accuracy"], label="val acc")
plt.title("Accuracy"); plt.legend()
plt.show()

# ROC curve
fpr, tpr, thr = roc_curve(y_val, y_prob)
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
plt.plot([0,1], [0,1], linestyle="--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC Curve"); plt.legend()
plt.show()

# Save evaluation report
report = {
    "accuracy": float(acc),
    "precision": float(prec),
    "recall": float(rec),
    "f1": float(f1),
    "roc_auc": float(auc),
    "confusion_matrix": cm.tolist()
}
with open(ARTIFACT_DIR / "evaluation_multichannel_bilstm.json", "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)
print("Saved report to:", ARTIFACT_DIR / "evaluation_multichannel_bilstm.json")


[1m 398/1210[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:47[0m 132ms/step