In [1]:
import PyPDF2

In [2]:
def pdf_to_txt(pdf_file_path, txt_file_path):
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text().rstrip()

    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

In [None]:
pdf_to_txt('/Users/Desktop/ACM CONFERENCE WORK/FINAL_LIST_DOCS/Supreme-Court-of-India-Judgment-WPC-No.118-of-2016-Triple-Talaq.pdf', 'talaq.txt')


In [None]:
import pandas as pd
import nltk
nltk.download('punkt')  # Download the required NLTK data

In [5]:
def find_related_sentences(sentences, keyword):
    related_sentences = []

    for i, sentence in enumerate(sentences):
        if keyword in sentence:
            prev_sentence = sentences[i - 1].strip() if i > 0 else None
            next_sentence = sentences[i + 1].strip() if i < len(sentences) - 1 else None
            related_sentences.append({
                "Sentence": sentence.strip(),
                "Previous": prev_sentence,
                "Next": next_sentence
            })

    return related_sentences

In [None]:
with open('/Users/Desktop/ACM CONFERENCE WORK/codes/Talaq Case/talaq.txt', "r") as file:
    text = file.read()

# Tokenize into sentences using nltk.sent_tokenize()
sentences = nltk.sent_tokenize(text)

keyword = "Law"
result = []

related_sentences = find_related_sentences(sentences, keyword)
result.extend(related_sentences)

# Create a DataFrame with columns "Sentence," "Previous," and "Next"
output_df = pd.DataFrame(result, columns=["Sentence", "Previous", "Next"])

# Save results to a new CSV
output_df.to_csv("output_sentences_talaq.csv", index=False)



In [7]:
encodings_to_try = ['utf-8', 'latin1', 'utf-16', 'ISO-8859-1']

# Try reading the CSV file with different encodings
for encoding in encodings_to_try:
    try:
        output_df = pd.read_csv("output_sentences_talaq.csv", encoding=encoding)
        print("CSV file read successfully with encoding:", encoding)
        break  # Exit loop if successful
    except UnicodeDecodeError:
        print("Failed to read CSV file with encoding:", encoding)
        continue  # Try next encoding

CSV file read successfully with encoding: utf-8


In [8]:
output_df

Unnamed: 0,Sentence,Previous,Next
0,Laws of Arab States (i) – (xiii) \nB.,Part -5 Abrogation of the practice of ‘talaq -...,Laws of Southeast Asian States (i) – (iii) \n...
1,Laws of Southeast Asian States (i) – (iii) \n...,Laws of Arab States (i) – (xiii) \nB.,"Part -6 Judicial pronouncements, on the subjec..."
2,140 -14 5 \nV. Did the Muslim Personal Law (S...,"If yes, whether it is a \nconstituent of their...",14 6-15 7 \nVI.
3,"It is also her contention, that such a divorce...","The petitioner has sought a declaration , that...","During the course of hearing, it was submitted..."
4,"It is \ntherefore, that the Muslim Personal La...","Muslim women \nclaimed, that the Muslim ‘perso...","It is essential to \nunderstand, the backgroun..."
...,...,...,...
157,(See Dicey — “Law of the \nConstitution” — 10t...,If a 340 decision is taken without any princip...,"“Law has reached its finest moments”, \nstated..."
158,"“Law has reached its finest moments”, \nstated...",(See Dicey — “Law of the \nConstitution” — 10t...,"Where discretion, is \nabsolute, man has alway..."
159,"Dicey, the great \nexpounder of the rule of la...",And the \nConstitutional law of 1975 has under...,"But \nso much, I suppose, can be said with \nr..."
160,[See Wade and Phillips: \nConstitutional Law (...,We \nmay not therefore rely wholly on Dicey’s ...,70-73)] \nThe International Commission of Juri...


In [9]:
concatenated_sen = pd.DataFrame(output_df['Previous'].astype(str) + ' ' + output_df['Sentence'].astype(str) + ' ' + output_df['Next'].astype(str), columns=['concat'])


In [10]:
print(concatenated_sen['concat'].iloc[0])


Part -5 Abrogation of the practice of ‘talaq -e-biddat’ by 
legislation, the world over, in Islamic, as well 
as, non-Islamic States  28 - 29  
A. Laws of Arab States  (i) – (xiii) 
B. Laws of Southeast Asian States  (i) – (iii) 
C.  Laws of Sub -continental States  (i) – (ii) 
6.


In [11]:
concatenated_sen.to_csv('sentences_final_concatenated_talaq.csv', index=False)

In [12]:
import csv

In [13]:
def create_person_id_map(file_path):
    person_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            person_id_list = row[0]
            person = person_id_list.strip()
            person_id = row[1].strip()
            person_id_map[person] = person_id

    return person_id_map

def create_location_id_map(file_path):
    location_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            location_id_list = row[0]
            location = location_id_list.strip()
            location_id = row[1].strip()
            location_id_map[location] = location_id

    return location_id_map

def create_time_id_map(file_path):
    time_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            time_id_list = row[0].split(',')
            time = time_id_list.strip()
            time_id = row[1].strip()
            time_id_map[time] = time_id

    return time_id_map

def create_event_id_map(file_path):
    event_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            event_id_list = row[0].split(',')
            event = event_id_list.strip()
            event_id = row[1].strip()
            event_id_map[event] = event_id

    return event_id_map

def create_other_id_map(file_path):
    other_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            other_id_list = row[0].split(',')
            other = other_id_list.strip()
            other_id = row[1].strip()
            other_id_map[other] = other_id

    return other_id_map

In [None]:
file_path_person = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Talaq Case/person.csv'
file_path_location = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Talaq Case/location.csv'
file_path_time = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Talaq Case/time.csv'
file_path_event = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Talaq Case/event.csv'
file_path_other = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Talaq Case/activity.csv'
person_id_map = create_person_id_map(file_path_person)
location_id_map = create_location_id_map(file_path_location)
time_id_map = create_location_id_map(file_path_time)
event_id_map = create_location_id_map(file_path_event)
other_id_map = create_location_id_map(file_path_other)

In [15]:
def generate_ngrams(text, n=6):
    words = text.split()
    ngrams = []
    for j in range(n, 0, -1):
        for i in range(len(words)):
            if i + j <= len(words):
                ngrams.append(' '.join(words[i:i+j]))
    return ngrams

In [16]:
def preprocess_text(text):
    text = text.replace("'s", "")
    text = text.replace(".", "")
    text = text.replace(";", "")
#     text = text.replace("-", "")
    text = text.replace("!", "")
    text = text.replace("?", "")
#     text = text.replace("/", "")
    text = text.replace("@", "")
    text = text.replace("#", "")
    text = text.replace(",", "")
    return text

In [17]:
def replace_entities(text):
    text = text.lower()
    ngrams = generate_ngrams(text)
    replaced_text = text

    for ngram in ngrams:
        original = ngram.lower()
        
        if original in person_id_map:
            entity_id = person_id_map[original]
            replaced_text = replaced_text.replace(ngram, entity_id)
        else:
            preprocessed_ngram = preprocess_text(ngram.lower())
            if preprocessed_ngram in person_id_map:
                entity_id = person_id_map[preprocessed_ngram]
                replaced_text = replaced_text.replace(ngram, entity_id)
            else:
                if original in location_id_map:
                    entity_id = location_id_map[original]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif preprocessed_ngram in location_id_map:
                    entity_id = location_id_map[preprocessed_ngram]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif original in time_id_map:
                    entity_id = time_id_map[original]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif preprocessed_ngram in time_id_map:
                    entity_id = time_id_map[preprocessed_ngram]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif original in event_id_map:
                    entity_id = event_id_map[original]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif preprocessed_ngram in event_id_map:
                    entity_id = event_id_map[preprocessed_ngram]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif original in other_id_map:
                    entity_id = other_id_map[original]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif preprocessed_ngram in other_id_map:
                    entity_id = other_id_map[preprocessed_ngram]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                    
    return replaced_text

In [18]:
with open('sentences_final_concatenated_talaq.csv', 'r', encoding='latin-1') as csvfile:
     reader = csv.reader(csvfile)
     next(reader)
     ref_sen = []
     for row in reader:
         sentence = row[0]
         ref_sen.append(sentence)

In [59]:
from IPython.display import HTML, display
import html

CSS = """
<style>
.annotated { font-family: Inter, Arial; line-height:1.6; }
.entity {
  display: inline-block;
  position: relative;
  padding: 2px 6px;
  margin: 0 2px;
  border-radius: 6px;
  font-weight: 600;
}
.entity .label {
  display: block;
  font-size: 10px;
  text-transform: uppercase;
  letter-spacing: .6px;
  margin-top: 2px;
  opacity: 0.9;
}
.badge {
  opacity: 0.9;
  padding: 1px 5px;
  border-radius: 6px;
  font-size: 10px;
  font-weight: 700;
  display:inline-block;
}
</style>
"""

# A palette of colors to cycle through for labels
PALETTE = {
    "PERSON":    ("#ffe8e6", "#c23b3b"),
    "LOCATION":  ("#e8f7ff", "#1e90ff"),
    "TIME":      ("#fff8e6", "#d98e00"),
    "EVENT":     ("#f0e9ff", "#7a4bb2"),
    "OTHER":     ("#e6fff2", "#0f9d58"),
    # fallback
    "DEFAULT":   ("#f2f2f2", "#333")
}

def highlight_html_from_spans(original_text, spans):
    """
    Given original_text (string) and spans (list of dicts with start,end,label,text),
    produce HTML with colored chips and small label below each entity.
    Safe when some spans have start=None (not located).
    """
    parts = []
    last = 0

    # Only consider spans that actually have start offsets for inline placement
    inline_spans = [s for s in spans if s.get('start') is not None]
    inline_spans = sorted(inline_spans, key=lambda x: int(x['start']))  # safe: all starts are ints

    for s in inline_spans:
        # If there's text between last and this entity, add it escaped
        if s['start'] > last:
            parts.append(html.escape(original_text[last:s['start']]))

        label = s.get('label', 'DEFAULT').upper()
        bg, fg = PALETTE.get(label, PALETTE['DEFAULT'])
        ent_html = (
            f"<span class='entity' style='background:{bg}; color:{fg};'>"
            f"{html.escape(s['text'])}"
            f"<span class='label' style='color:{fg};'>{html.escape(label)}</span>"
            f"</span>"
        )
        parts.append(ent_html)
        last = s['end'] if s.get('end') is not None else last

    # Add any remaining tail text
    if last < len(original_text):
        parts.append(html.escape(original_text[last:]))

    # Spans that could not be located in the original (start is None)
    not_located = [s for s in spans if s.get('start') is None]

    html_sentence = "<div class='annotated normal-text'>" + "".join(parts) + "</div>"

    if not_located:
        chips = " ".join(
            f"<span class='entity' style='background:{PALETTE.get(s['label'],PALETTE['DEFAULT'])[0]}; color:{PALETTE.get(s['label'],PALETTE['DEFAULT'])[1]}; margin-right:6px'>"
            f"{html.escape(s['text'])}"
            f"<span class='label' style='color:inherit'>{html.escape(s['label'])}</span>"
            f"</span>"
            for s in not_located
        )
        html_sentence += "<div style='margin-top:6px;'>" + chips + "</div>"

    return CSS + html_sentence


In [60]:
# ---------------------------
# Intermediate visualization cell
# ---------------------------

from IPython.display import HTML, display
import re

# Reuse your generate_ngrams function and the person/location/time/event/other id maps:
# - generate_ngrams(text)
# - person_id_map, location_id_map, time_id_map, event_id_map, other_id_map
# These are from your notebook already.

def detect_entities_in_sentence(text):
    """
    Return list of detected entities as dicts:
      {'id': 'P12', 'label': 'PERSON', 'text': 'John Doe'}
    Detection is done by checking n-grams against your *_id_map dictionaries.
    """
    detected = []
    seen_ids = set()
    ngrams = generate_ngrams(text, n=6)  # reuse your ngram generator

    # lower-case keys for safer matching (your maps may already be lowercase)
    person_keys = {k.lower(): v for k, v in person_id_map.items()}
    location_keys = {k.lower(): v for k, v in location_id_map.items()}
    time_keys = {k.lower(): v for k, v in time_id_map.items()}
    event_keys = {k.lower(): v for k, v in event_id_map.items()}
    other_keys = {k.lower(): v for k, v in other_id_map.items()}

    for ng in ngrams:
        candidate = ng.strip()
        lower_cand = candidate.lower()
        if not lower_cand:
            continue

        if lower_cand in person_keys:
            idval = person_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'PERSON', 'text': candidate})
                seen_ids.add(idval)
        elif lower_cand in location_keys:
            idval = location_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'LOCATION', 'text': candidate})
                seen_ids.add(idval)
        elif lower_cand in time_keys:
            idval = time_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'TIME', 'text': candidate})
                seen_ids.add(idval)
        elif lower_cand in event_keys:
            idval = event_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'EVENT', 'text': candidate})
                seen_ids.add(idval)
        elif lower_cand in other_keys:
            idval = other_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'OTHER', 'text': candidate})
                seen_ids.add(idval)

    # Return in order found (shorter ngrams appear later due to your generator ordering — that's fine)
    return detected

def build_spans_from_detected(original_text, detected_entities):
    """
    For each detected entity dict {'id','label','text'}, attempt to find its first occurrence
    in the original_text (case-insensitive). Return list of spans {'start','end','label','text'}.
    If not found, span will have start=None (so renderer will put a chip below sentence).
    """
    spans = []
    cursor = 0
    for ent in detected_entities:
        ent_text = ent['text']
        label = ent['label']
        # try find from cursor
        if ent_text and ent_text.strip():
            pat = re.escape(ent_text)
            m = re.search(pat, original_text[cursor:], flags=re.IGNORECASE)
            if m:
                start = cursor + m.start()
                end = cursor + m.end()
                spans.append({'start': start, 'end': end, 'label': label, 'text': original_text[start:end]})
                cursor = end
                continue

        # fallback if not found
        spans.append({'start': None, 'end': None, 'label': label, 'text': ent['id']})  # show ID as fallback
    return spans

# Iterate through ref_sen, find one sentence with >1 detected entity, visualize it
chosen_index = None
chosen_sentence = None
chosen_detected = None

for i, sent in enumerate(ref_sen):
    detected = detect_entities_in_sentence(sent)
    if len(detected) > 1:
        chosen_index = i
        chosen_sentence = sent
        chosen_detected = detected
        break

if chosen_index is None:
    print("No sentence found with more than one detected entity (as per current ID maps).")
else:
    print(f"Visualizing sentence index {chosen_index} (shows {len(chosen_detected)} entities):")
    print(chosen_sentence)
    # Build spans and render using the renderer you already have
    spans = build_spans_from_detected(chosen_sentence, chosen_detected)

    # The renderer cell you ran earlier defined `highlight_html_from_spans(original_text, spans)`
    # If you've run that cell, call it:
    try:
        display(HTML(highlight_html_from_spans(chosen_sentence, spans)))
    except NameError:
        # If renderer not defined, show a simple fallback HTML
        print("Renderer functions not found in this session. Showing simple fallback output:")
        chips = " ".join([f"[{e['label']}:{e['text']}]" for e in chosen_detected])
        display(HTML(f"<div><p>{html.escape(chosen_sentence)}</p><div style='margin-top:8px'>{chips}</div></div>"))


Visualizing sentence index 0 (shows 2 entities):
Part -5 Abrogation of the practice of âtalaq -e-biddatâ by 
legislation, the world over, in Islamic, as well 
as, non-Islamic States  28 - 29  
A. Laws of Arab States  (i) â (xiii) 
B. Laws of Southeast Asian States  (i) â (iii) 
C.  Laws of Sub -continental States  (i) â (ii) 
6.


In [61]:
# ---------------------------
# Show top-5 sentences with >1 detected entity (distinct color per entity)
# ---------------------------

from IPython.display import HTML, display
import re
import hashlib
import html

# deterministic color generator per entity id/text
def color_for_entity(key):
    """
    Return (bg_color, fg_color) for a given entity key (string).
    Deterministic via md5 hash -> hue.
    bg: light; fg: darker for contrast.
    """
    h = int(hashlib.md5(key.encode('utf-8')).hexdigest()[:8], 16) % 360
    # use HSL for pleasant distinct colors
    bg = f"hsl({h} 80% 92%)"
    fg = f"hsl({h} 55% 20%)"
    return bg, fg

# Reuse detect_entities_in_sentence from your previous cell (or paste again here if not present)
def detect_entities_in_sentence(text):
    detected = []
    seen_ids = set()
    ngrams = generate_ngrams(text, n=6)

    person_keys = {k.lower(): v for k, v in person_id_map.items()}
    location_keys = {k.lower(): v for k, v in location_id_map.items()}
    time_keys = {k.lower(): v for k, v in time_id_map.items()}
    event_keys = {k.lower(): v for k, v in event_id_map.items()}
    other_keys = {k.lower(): v for k, v in other_id_map.items()}

    for ng in ngrams:
        candidate = ng.strip()
        lower_cand = candidate.lower()
        if not lower_cand:
            continue

        if lower_cand in person_keys:
            idval = person_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'PERSON', 'text': candidate})
                seen_ids.add(idval)
        elif lower_cand in location_keys:
            idval = location_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'LOCATION', 'text': candidate})
                seen_ids.add(idval)
        elif lower_cand in time_keys:
            idval = time_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'TIME', 'text': candidate})
                seen_ids.add(idval)
        elif lower_cand in event_keys:
            idval = event_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'EVENT', 'text': candidate})
                seen_ids.add(idval)
        elif lower_cand in other_keys:
            idval = other_keys[lower_cand]
            if idval not in seen_ids:
                detected.append({'id': idval, 'label': 'OTHER', 'text': candidate})
                seen_ids.add(idval)

    return detected

def build_spans_from_detected_with_colors(original_text, detected_entities):
    """
    Like build_spans_from_detected, but attaches bg/fg colors for each span using color_for_entity.
    Returns spans list with entries:
      {'start': int|None, 'end': int|None, 'label': str, 'text': str, 'bg':str, 'fg':str}
    """
    spans = []
    cursor = 0
    for ent in detected_entities:
        ent_text = ent['text']
        label = ent['label']
        ent_key = ent['id'] if 'id' in ent else ent_text  # prefer canonical id for color hashing
        bg, fg = color_for_entity(ent_key)

        # try to find ent_text in original from cursor
        if ent_text and ent_text.strip():
            pat = re.escape(ent_text)
            m = re.search(pat, original_text[cursor:], flags=re.IGNORECASE)
            if m:
                start = cursor + m.start()
                end = cursor + m.end()
                spans.append({'start': start, 'end': end, 'label': label, 'text': original_text[start:end], 'bg': bg, 'fg': fg})
                cursor = end
                continue

        # fallback: not found inline
        spans.append({'start': None, 'end': None, 'label': label, 'text': ent['id'], 'bg': bg, 'fg': fg})
    return spans

# Replace the renderer to respect per-span colors (if you already have a renderer, keep it; below is a small inline renderer)
def highlight_html_from_spans_with_colors(original_text, spans):
    """
    Build HTML using per-span bg/fg when available. Falls back to label-based palette if not provided.
    This returns a full HTML string (with CSS header included).
    """
    CSS_local = """
    <style>
    .annotated { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial; line-height:1.6; }
    .entity { display: inline-block; position: relative; padding: 3px 6px; margin: 0 3px; border-radius: 8px; font-weight: 600; vertical-align: middle; }
    .entity .label { display:block; font-size:10px; text-transform:uppercase; letter-spacing:.5px; margin-top:3px; opacity:0.95; }
    .normal-text { white-space: pre-wrap; }
    </style>
    """
    parts = []
    last = 0
    inline_spans = [s for s in spans if s.get('start') is not None]
    inline_spans = sorted(inline_spans, key=lambda x: int(x['start']))

    for s in inline_spans:
        if s['start'] > last:
            parts.append(html.escape(original_text[last:s['start']]))
        bg = s.get('bg')
        fg = s.get('fg')
        label = s.get('label', 'DEFAULT').upper()
        text = s.get('text', '')
        # Build entity html using the provided colors
        ent_html = (
            f"<span class='entity' style='background:{bg}; color:{fg};'>"
            f"{html.escape(text)}"
            f"<span class='label' style='color:{fg};'>{html.escape(label)}</span>"
            f"</span>"
        )
        parts.append(ent_html)
        last = s['end'] if s.get('end') is not None else last

    if last < len(original_text):
        parts.append(html.escape(original_text[last:]))

    not_located = [s for s in spans if s.get('start') is None]
    html_sentence = "<div class='annotated normal-text'>" + "".join(parts) + "</div>"

    if not_located:
        chips = " ".join(
            f"<span class='entity' style='background:{s.get('bg')}; color:{s.get('fg')}; margin-right:6px'>"
            f"{html.escape(s.get('text'))}<span class='label' style='color:inherit'>{html.escape(s.get('label'))}</span></span>"
            for s in not_located
        )
        html_sentence += "<div style='margin-top:6px;'>" + chips + "</div>"

    return CSS_local + html_sentence

# collect all sentences with >1 detected entity
candidates = []
for i, sent in enumerate(ref_sen):
    detected = detect_entities_in_sentence(sent)
    if len(detected) > 1:
        candidates.append({'index': i, 'sentence': sent, 'detected': detected, 'count': len(detected)})

if not candidates:
    print("No sentence found with more than one detected entity.")
else:
    # sort by descending entity count, then by original order
    candidates = sorted(candidates, key=lambda x: (-x['count'], x['index']))
    topk = candidates[:5]  # top 5

    for item in topk:
        idx = item['index']
        sent = item['sentence']
        detected = item['detected']
        print(f"\n--- Sentence index {idx} — detected {len(detected)} entities ---\n{sent}\n")
        spans = build_spans_from_detected_with_colors(sent, detected)
        html_out = highlight_html_from_spans_with_colors(sent, spans)
        display(HTML(html_out))



--- Sentence index 136 — detected 4 entities ---
In A. S. Parveen Akthar  v. The Union of India 60 , the 
High Court of Madras was posed with the question on  the 
validity and constitutionality of Section 2 of the 1937 Act in so 
far as it recognises triple talaq as a valid form o f divorce. The 
Court referred to the provisions of the Quran, opin ions of 
various eminent scholars of Islamic Law and previou s judicial 
pronouncements including Shamim Ara  and came to the 
following conclusion:  
 
â45.Thus, the law with regard to talaq, as declared  
by the apex Court, is that talaq must be for a 
reasonable cause and must be preceded by 
attempt at reconciliation between the husband 
and the wife by two arbiters one chosen by wife's 
family and the other from husband's family and it 
is only if their attempts fail, talaq may be effect ed. XXXX 
48.Having regard to the law now declared by the 
apex Court in the case of Shamim Ara , 2002 AIR 
SCW 4162, talaq, in whatever form, must 


--- Sentence index 133 — detected 3 entities ---
The above view has been endorsed by various High Co urts, 
finally culminating in Shamim Ara  by this Court which has since 
                                                 
50  Verses from the Holy Quran as extracted above are taken from âThe Holy Qurâanâ 
translated by Abdullah Yusuf Ali which was agreed t o be a fair translation by all parties. 51 Similar observations were made by the High Court of  Gauhati through Baharul Islam, J. in 
Jiauddin Ahmed v. Anwara Begum (1981) 1 Gau LR 358 wherein he noted that âthough 
marriage under Muslim Law is only a civil contract yet the rights and responsibilities 
consequent upon it are of such importance to the we lfare of humanity, that a high degree 
of sanctity is attached to it. But in spite of the sacredness of the character of the marriage-
tie, Islam recognizes the necessity, in exceptional  circumstances, of keeping the way open 
for its dissolution â.




--- Sentence index 0 — detected 2 entities ---
Part -5 Abrogation of the practice of âtalaq -e-biddatâ by 
legislation, the world over, in Islamic, as well 
as, non-Islamic States  28 - 29  
A. Laws of Arab States  (i) â (xiii) 
B. Laws of Southeast Asian States  (i) â (iii) 
C.  Laws of Sub -continental States  (i) â (ii) 
6.




--- Sentence index 1 — detected 2 entities ---
Laws of Arab States  (i) â (xiii) 
B. Laws of Southeast Asian States  (i) â (iii) 
C.  Laws of Sub -continental States  (i) â (ii) 
6. Part -6 Judicial pronouncements, on the subject of 
âtalaq-e-biddatâ  30 - 34  
7.




--- Sentence index 3 — detected 2 entities ---
The petitioner has sought a declaration , that the âtalaq-e-
biddatâ pronounced by her husband on 10.10.2015 be declared as void ab 
initio . It is also her contention, that such a divorce which abruptly, 
unilaterally and irrevocably terminates the ties of matr imony, purportedly 
under Section 2 of the Muslim Personal Law (Shariat) A pplication Act, 1937 
(hereinafter referred to as, the Shariat Act), be decl ared unconstitutional. During the course of hearing, it was submitted, that the  âtalaq-e-biddatâ      
(-triple talaq), pronounced by her husband is not vali d, as it is not a part of 
âShariatâ (Muslim âpersonal lawâ).



In [None]:
# final_sentences = []

# for sen in ref_sen:
#      s = replace_entities(sen)
#      final_sentences.append(s)

In [20]:
with open('cleaned_sentences_talaq.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Sentences'])
#      for sen in final_sentences:
    for sen in final_sentences:
        writer.writerow([sen])

## Matrix Creation

In [21]:
# with the sentences
import pandas as pd
import csv
import re

# Load the CSV files
time_df = pd.read_csv(file_path_time)
location_df = pd.read_csv(file_path_location)
event_df = pd.read_csv(file_path_event)
person_df = pd.read_csv(file_path_person)
activity_df = pd.read_csv(file_path_other)

location_ID = location_df['Id'].tolist()
location_df = location_df['Location'].tolist()

time_ID = time_df['Id'].tolist()
time_df = time_df['Time'].tolist()

# event_ID = event_df['ID'].tolist()
# event_df = event_df['Event'].tolist()

# person_ID = person_df['ID'].tolist()
# person_df = person_df['Person'].tolist()

# activity_ID = activity_df['ID'].tolist()
# activity_df = activity_df['Others'].tolist()

event_ID_map = dict(zip(event_df['Id'], event_df['Event']))
person_ID_map = dict(zip(person_df['Id'], person_df['Person']))
activity_ID_map = dict(zip(activity_df['Id'], activity_df['Activity']))

matrix = {}

In [22]:
sentences = []
with open('cleaned_sentences_talaq.csv', 'r', encoding='latin-1') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    for row in reader:
        sentence = row[0]
        sentences.append(sentence)

In [23]:
for s in sentences:
    pattern = r'([PTLEA]\d+)'
    matches = re.findall(pattern, s)

    persons = []
    time = []
    location = []
    event = []
    activity = []

    for match in matches:
        if match[0] == 'P':
            persons.append(person_ID_map.get(match, 'Unknown Person'))
        elif match[0] == 'L':
             location.append(location_ID.index(match))
        elif match[0] == 'E':
            event.append(event_ID_map.get(match, 'Unknown Event'))
        elif match[0] == 'A':
            activity.append(activity_ID_map.get(match, 'Unknown Activity'))
        elif match[0] == 'T':
            time.append(time_ID.index(match))

    if location and time:
        for loc_index in location:
            loc_key = location_df[loc_index]
            for time_index in time:
                time_key = time_df[time_index]
                if time_key not in matrix:
                    matrix[time_key] = {}
                if loc_key not in matrix[time_key]:
                    matrix[time_key][loc_key] = []
                matrix[time_key][loc_key].extend([(entity, s) for entity in (persons + event + activity)])
    elif location:
        for loc_index in location:
            loc_key = location_df[loc_index]
            if loc_key not in matrix:
                matrix[loc_key] = {}
            if 'NULL' not in matrix:
                matrix['NULL'] = {}
            if loc_key not in matrix["NULL"]:
                matrix["NULL"][loc_key] = []
            matrix["NULL"][loc_key].extend([(entity, s) for entity in (persons + event + activity)])
    elif time:
        for time_index in time:
            time_key = time_df[time_index]
            if time_key not in matrix:
                matrix[time_key] = {}
            if 'NULL' not in matrix[time_key]:
                matrix[time_key]['NULL'] = []
            matrix[time_key]["NULL"].extend([(entity, s) for entity in (persons + event + activity)])
    else:
        if 'NULL' not in matrix:
            matrix['NULL'] = {}
        if 'NULL' not in matrix['NULL']:
            matrix['NULL']['NULL'] = []
        matrix["NULL"]["NULL"].extend([(entity, s) for entity in (persons + event + activity)])

matrix_df = pd.DataFrame.from_dict(matrix, orient='index')

In [24]:
matrix_df.to_csv('matrix_with_sen_talaq.csv')

In [25]:
import networkx as nx

In [26]:
df = pd.read_csv('matrix_with_sen_talaq.csv')

In [27]:
df.rename(columns={'Unnamed: 0': 'Time'}, inplace=True)

In [62]:
import ast
from tabulate import tabulate
import pandas as pd

def show_10x10_entity_matrix(df, max_entities=6):
    # Take EXACT first 10 rows and first 10 columns
    sub = df.iloc[:10, :10].copy()

    # Convert each cell into entity-only listing
    def extract_entities(cell):
        if pd.isnull(cell) or cell == "":
            return ""
        try:
            items = ast.literal_eval(cell)
            ents = [e for e, _ in items]   # keep only entity IDs
            if len(ents) > max_entities:
                ents = ents[:max_entities] + ["..."]
            return ", ".join(ents)
        except:
            return str(cell)

    # Apply to all non-Time columns
    for col in sub.columns:
        if col != "Time":
            sub[col] = sub[col].apply(extract_entities)

    print(tabulate(sub, headers="keys", tablefmt="github", showindex=False))
    return sub


# RUN IT
show_10x10_entity_matrix(df)


| Time       | NULL                                                                                                   |
|------------|--------------------------------------------------------------------------------------------------------|
| nan        | dissolution of marriage, khula, mubaraat, dissolution of marriage, dissolution of marriage, khula, ... |
| 10.10.2015 | divorce                                                                                                |
| one year   | divorce, divorce                                                                                       |
| 1400 years |                                                                                                        |


Unnamed: 0,Time,NULL
0,,"dissolution of marriage, khula, mubaraat, diss..."
1,10.10.2015,divorce
2,one year,"divorce, divorce"
3,1400 years,


In [28]:
G = nx.Graph()

for index, row in df.iterrows():
    time = row['Time']
    for location in df.columns:
        if location == 'Time':
            continue
        # Convert the tuple to a string and add it as a node
        G.add_node(f"{time}, {location}")

In [29]:
import ast

In [30]:
final = set()

# Update the final set with only the entities
for value in pd.unique(df.drop(columns='Time').values.ravel()):
    if pd.notnull(value):
        value_list = ast.literal_eval(value)
        entity_list = [entity for entity, sentence in value_list]
        final.update(entity_list)

In [31]:
G.add_nodes_from(final)

In [32]:
for index, row in df.iterrows():
    time = row['Time']
    for location in df.columns:
        if location == 'Time':
            continue
        if pd.notnull(row[location]):
            value_list = ast.literal_eval(row[location])
            entity_list = [entity for entity, sentence in value_list]
            for entity in entity_list:
                if entity in final:
                    # Convert the tuples to strings and add them as an edge
                    G.add_edge(f"{time}, {location}", entity)

In [33]:
for node in list(G.nodes):
    if not list(G.neighbors(node)):
        G.remove_node(node)
        

In [34]:
for node1 in G.nodes:
    # Check if node1 can be split into a pair
    if ', ' in node1:
        parts1 = node1.split(", ")
        if len(parts1) == 2:  # Ensure it splits into exactly two parts
            time1, location1 = parts1
            for node2 in G.nodes:
                # Check if node2 can be split into a pair
                if ', ' in node2:
                    parts2 = node2.split(", ")
                    if len(parts2) == 2:  # Ensure it splits into exactly two parts
                        time2, location2 = parts2
                        # Add edge if the time or location matches
                        if time1 == time2 or location1 == location2:
                            G.add_edge(node1, node2)

In [35]:
degree_dict = dict(G.degree(G.nodes()))
nx.set_node_attributes(G, degree_dict, 'degree')


In [36]:
# Create a dictionary to store the sentences for each node
sentences_dict = {}

# Create a dictionary to store the sentences for each edge i.e. between two nodes
edge_sentences_dict = {}

# Update the sentences_dict and edge_sentences_dict
for index, row in df.iterrows():
    time = row['Time']
    for location in df.columns:
        if location == 'Time':
            continue
        if pd.notnull(row[location]):
            value_list = ast.literal_eval(row[location])
            for entity, sentence in value_list:
                if entity in final:
                    # Add the sentence to the sentences_dict for the entity
                    if entity not in sentences_dict:
                        sentences_dict[entity] = set()
                    sentences_dict[entity].add(sentence)

                    # Add the sentence to the sentences_dict for the (time, location) node
                    time_location_node = f"{time}, {location}"
                    if time_location_node not in sentences_dict:
                        sentences_dict[time_location_node] = set()
                    sentences_dict[time_location_node].add(sentence)

                    # Add the sentence to the edge_sentences_dict for the edge
                    edge = (time_location_node, entity)
                    if edge not in edge_sentences_dict:
                        edge_sentences_dict[edge] = set()
                    edge_sentences_dict[edge].add(sentence)

# Convert the sets back to lists if needed
for node, sentences in sentences_dict.items():
    sentences_dict[node] = list(sentences)

for edge, sentences in edge_sentences_dict.items():
    edge_sentences_dict[edge] = list(sentences)


In [37]:
pip install node2vec


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [38]:
from node2vec import Node2Vec

# Create a Node2Vec instance
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)

# Generate embeddings
model = node2vec.fit(window=10, min_count=1, batch_words=4)


Computing transition probabilities:   0%|          | 0/8 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 1332.29it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 1332.69it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 1330.33it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 1313.96it/s]


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the node embeddings
node_embeddings = {node: model.wv[node] for node in G.nodes}

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(list(node_embeddings.values()))


In [40]:
from sklearn.cluster import DBSCAN

# Initialize DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=3)  # Adjust parameters as needed

# Fit and predict clusters
labels = dbscan.fit_predict(similarity_matrix)

In [41]:
cluster_colors = [
    'rgb(31, 119, 180)',  # Blue
    'rgb(255, 127, 14)',  # Orange
    'rgb(44, 160, 44)',   # Green
    'rgb(214, 39, 40)',   # Red
    'rgb(148, 103, 189)', # Purple
    'rgb(140, 86, 75)',   # Brown
    'rgb(227, 119, 194)', # Pink
    'rgb(127, 127, 127)', # Gray
    'rgb(188, 189, 34)',  # Olive
    'rgb(23, 190, 207)',  # Teal
    'rgb(255, 187, 120)', # Peach
    'rgb(214, 39, 40)',   # Maroon
    'rgb(77, 175, 74)',   # Light Green
    'rgb(152, 78, 163)',  # Plum
    'rgb(255, 152, 150)'  # Salmon
]


In [42]:
import networkx as nx
import plotly.graph_objs as go

# Assuming 'G' is your graph and 'labels' is the list of labels
# Renumber clusters from 1 to 7 if needed, otherwise ensure you have clusters numbered 1 to 7
unique_labels = list(set(labels))
label_mapping = {label: idx+1 for idx, label in enumerate(unique_labels)}

def draw_subgraph_plotly(subgraph, cluster_id, cluster_color):
    pos = nx.spring_layout(subgraph, seed=42)  # Fixed seed for reproducibility
    
    # Extract the node positions
    x_nodes = [pos[node][0] for node in subgraph.nodes()]
    y_nodes = [pos[node][1] for node in subgraph.nodes()]

    # Extract the edges
    edge_x = []
    edge_y = []
    for edge in subgraph.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Edge trace
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='gray'),
        hoverinfo='none',
        mode='lines')

    # Node trace
    node_trace = go.Scatter(
        x=x_nodes, y=y_nodes,
        mode='markers+text',
        text=[f'{node}' for node in subgraph.nodes()],
        textposition="top center",
        marker=dict(
            showscale=False,
            color=cluster_color,  # Assign the cluster color
            size=20,
            line=dict(width=2, color='black')
        ),
        hoverinfo='text'
    )

    # Create figure
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=f'Cluster {cluster_id}',
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        height=600,
                        width=600,
                        paper_bgcolor='white',
                        plot_bgcolor='white'
                    ))

    fig.show()

# Iterate over each cluster and draw the subgraph
for original_cluster_id in set(labels):
    cluster_id = label_mapping[original_cluster_id]
    cluster_color = cluster_colors[cluster_id - 1]  # Assign a unique color to each cluster
    cluster_nodes = [node for node, label in zip(G.nodes, labels) if label == original_cluster_id]
    subgraph = G.subgraph(cluster_nodes)
    draw_subgraph_plotly(subgraph, cluster_id, cluster_color)

In [43]:
# # Initialize lists for nodes and links
# nodes = []
# links = []

# # Add nodes to the list
# for node, sentences in sentences_dict.items():
#     nodes.append({
#         "id": node,
#         "group": 1,  # Update this as needed
#         "size": len(sentences)  # The size could be based on the number of sentences
#     })

# # Add links to the list
# for edge, sentences in edge_sentences_dict.items():
#     source, target = edge
#     links.append({
#         "source": source,
#         "target": target,
#         "value": len(sentences)  # The value could be based on the number of sentences
#     })

# # Combine nodes and links into a single dictionary
# graph_data = {
#     "nodes": nodes,
#     "links": links
# }


In [None]:
# import os

# os.environ['METIS_DLL'] = '/Users/.local/lib/libmetis.dylib'

In [45]:
# import metis

In [46]:
# edgecuts, parts = metis.part_graph(G)

In [47]:
import json

# Create a mapping from nodes to integers
node_to_int = {node: i for i, node in enumerate(G.nodes)}


In [48]:
degree_dict = dict(G.degree(G.nodes()))
nx.set_node_attributes(G, degree_dict, 'degree')

data = nx.node_link_data(G)




The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.



In [49]:
import numpy as np

In [50]:
labels = labels.astype(np.int32)

In [51]:
graph_data = {
    "nodes": [{"name": str(node), "n": degree_dict[node], "grp": int(labels[i]), "id": str(node)} for i, node in enumerate(G.nodes())],
    "links": [{"source": str(link_data['source']), "target": str(link_data['target']), "value": 1} for link_data in data['links']]
}

In [52]:
json_data = json.dumps(graph_data)
with open('data_talaq.json', 'w') as f:
    f.write(json_data)

In [53]:
# from collections import defaultdict

# # Create a dictionary where keys are part numbers and values are lists of nodes
# part_dict = defaultdict(list)
# for node, part in enumerate(parts):
#     part_dict[part].append(node)

# # Print the number of parts
# print("Number of parts:", len(part_dict))

# # Print the nodes in each part
# for part, nodes in part_dict.items():
#     print("Part", part, ":", nodes)

In [54]:
# Assuming you have already created clusters using DBSCAN and have 'labels' and 'G' available

# Initialize an empty dictionary to store sentences for each cluster
cluster_sentences_dict = {}

# Iterate over each cluster
for original_cluster_id in set(labels):
    cluster_id = label_mapping[original_cluster_id]
    cluster_nodes = [node for node, label in zip(G.nodes, labels) if label == original_cluster_id]
    
    # Initialize a set to store sentences for this cluster
    cluster_sentences = set()
    
    # Iterate over nodes in the cluster
    for node in cluster_nodes:
        # Assuming 'node' contains the relevant information (e.g., entity, time, location)
        # Extract sentences associated with this node and add them to the cluster_sentences set
        if node in sentences_dict:
            cluster_sentences.update(sentences_dict[node])
    
    # Store the cluster_sentences set in the cluster_sentences_dict
    cluster_sentences_dict[cluster_id] = cluster_sentences

In [55]:
sentences_dict

{'dissolution of marriage': ['xxx  xxx  xxx \n5. E2 by court in certain circu mstances.-the \ndistrict judge may, on petition made by a muslim marrie d woman, \ndissolve a marriage on any ground recognized by mus lim personal law \n(shariat).Ã¢Â\x80Â\x9d \n \na close examination of section 2, extracted above, lea ves no room for any \ndoubt, that custom and usage, as it existed amongst musli ms, were sought 30 to be expressly done away with, to the extent the same w ere contrary to \nmuslim Ã¢Â\x80Â\x98personal lawÃ¢Â\x80Â\x99. section 2 also mandated, that m uslim Ã¢Â\x80Â\x98personal lawÃ¢Â\x80Â\x99 \n(shariat) would be exclusively adopted as Ã¢Â\x80Â\x9cÃ¢Â\x80Â¦ the rul e of decision Ã¢Â\x80Â¦Ã¢Â\x80Â\x9d in \nmatters of intestate succession, special property of fe males, including all \nquestions pertaining to Ã¢Â\x80Â\x9cÃ¢Â\x80Â¦ personal property inherite d or obtained under \ncontract or gift or any other provision of Ã¢Â\x80Â\x98personal la wÃ¢Â\x80Â\x99, marriage, \nE2, incl

In [56]:
with open("output.txt", "w") as f:
    f.write(str(sentences_dict))