In [1]:
# !pip install transformers torch scikit-learn pandas IProgress
# !pip install pdfplumber
# !pip install python-docx

In [1]:
import torch
import numpy as np
import pandas as pd
import pdfplumber
from docx import Document
import re
import pickle
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, DistilBertModel

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
distilbert = DistilBertModel.from_pretrained(model_name)
distilbert.to(device)
distilbert.eval()



DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

# Convert PDF to text and extract sections

In [3]:
pdf_path = "RSM.pdf"

pages = []

with pdfplumber.open(pdf_path) as pdf:
    for page_num, page in enumerate(pdf.pages):
        text = page.extract_text()
        if text is None:
            text = ""
        pages.append(text)

full_text = "\n\n".join(pages)
table_of_contents = full_text[3250:20000].split('\n')

In [4]:
def isSection(line):
    elems = line.split(' ')
    if not line or len(elems) == 0:
        return False
    if elems[0][0].isnumeric():
        return True
    elif elems[0] == 'Appendix':
        return True
    return False

def getSectionMeta(section):
    multi_line_section_map = {
        '2.2.3': 17,
        '2.3.4.1': 22,
        '2.3.6': 22,
        '2.4.6': 35,
        '2.4.14.4': 54,
        'Appendix 4': 71,
    }
    
    removed_page = section[:-2]
    elems = removed_page.split(' ')[:-1]
    is_appendix = elems[0] == 'Appendix'
    section_number = " ".join(elems[:2]) if is_appendix else elems[0]
    title = " ".join(elems[3:]) if is_appendix else " ".join(elems[1:])
    page = multi_line_section_map[section_number]-1 if section_number in multi_line_section_map else int(section[-2:].strip(' '))
    return {
        "section_number": section_number.upper(),
        "title": title,
        "page": page,
    }


def getSections(section_metas, pages):
    full_sections = []
    n_sections = len(section_metas)
    n_pages = len(pages)

    for i, meta in enumerate(section_metas):
        start_page_idx = meta["page"] - 1
        if i + 1 < n_sections:
            next_meta = section_metas[i + 1]
            end_page_idx = next_meta["page"] - 1
        else:
            next_meta = None
            end_page_idx = n_pages - 1

        start_page_text = pages[start_page_idx]
        start_idx = start_page_text.find(meta['section_number'])
        
        if next_meta is not None:
            end_page_text = pages[end_page_idx]
            end_idx = end_page_text.find(next_meta['section_number'])

        else:
            # Last section goes to end of last page
            end_idx = len(pages[n_pages-1])

        if start_page_idx == end_page_idx:
            text_chunks = [pages[start_page_idx][start_idx:end_idx]]
        else:
            text_chunks = []
            text_chunks.append(pages[start_page_idx][start_idx:])
            for p in range(start_page_idx + 1, end_page_idx):
                text_chunks.append(pages[p])

            text_chunks.append(pages[end_page_idx][:end_idx])

        if len(text_chunks) == 1 and not text_chunks[0]:
            continue

        section_text = "\n\n".join(text_chunks)

        # Skip sections that are just the title (+space +newline char)
        if len(section_text) <= len(meta['section_number'])+len(meta['title'])+2:
            continue
        
        full_sections.append({
            **meta,
            "text": section_text,
        })

    return full_sections

def printSection(section_number):
    section = sections.loc[sections.section_number == section_number].to_dict('records')[0]
    print(f"{section['section_number']} {section['title']} (page {section['page']+1})\n\n{section['text']}")

section_metas = [getSectionMeta(s) for s in table_of_contents if isSection(s)]
sections = pd.DataFrame(getSections(section_metas, pages))

In [5]:
chars = sections.text.apply(lambda x: len(x))
print(f"Characters per section:\t\tmin {chars.min()}\tmax {chars.max()}\tmean {chars.mean():.2f}\tstd {chars.std():.2f}")

words = sections.text.apply(lambda x: len(x.replace('\n',' ').split(' ')))
print(f"Characters per section:\t\tmin {words.min()}\tmax {words.max()}\tmean {words.mean():.2f}\tstd {words.std():.2f}")

Characters per section:		min 196	max 15603	mean 1551.63	std 1928.66
Characters per section:		min 31	max 2475	mean 245.23	std 307.98


# Convert Questions to pandas df

In [6]:
doc = Document("RSM_Questions.docx")

questions = pd.DataFrame(columns=['question','section'])
text = [p.text.strip(' ') for p in doc.paragraphs if len(p.text) > 1]
t = 0

cur_question = ''
cur_labels = []
while t < len(text):
    components = text[t].split(' ')

    # labels
    if components[0].lower() == 'section':
        cur_labels.append(components[-1])
    elif components[0].lower() == 'appendix':
        cur_labels.append(text[t])

    # question
    else:
        # Store prev question
        if cur_question:
            questions.loc[len(questions),:] = [cur_question, cur_labels]

        # Reset 
        cur_question = text[t]
        cur_labels = []

    t += 1

questions

Unnamed: 0,question,section
0,What is the non-fixed radiation contamination ...,"[2.4.6.2, 2.4.6.4]"
1,What procedures should be followed if the non-...,"[2.4.6.2, 2.4.6.4]"
2,When should the wipe tests be done after using...,"[2.4.6.1, 2.3.6, 2.4.14.1]"
3,When is leak testing of sealed sources or devi...,"[2.4.8, 2.3.8, 2.4.11]"
4,What is the schedule for leak testing the seal...,"[2.4.8, 2.3.8]"
5,What is the leakage limit for sealed sources o...,"[2.4.8, 2.3.8]"
6,What happens if the sealed source leakage limi...,[2.4.8]
7,What is the annual whole body dose limit for N...,[2.4.4.1]
8,What is the annual dose limit in mSv for pregn...,[2.4.4.1]
9,What are the dosimetry requirements at the Uni...,[2.4.4.2]


# TF_IDF

In [7]:
all_texts = pd.concat(
    [sections.text, questions.question],
    ignore_index=True
)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(all_texts)

n_sections = len(sections.text)
section_tfidf = tfidf_matrix[:n_sections]
question_tfidf = tfidf_matrix[n_sections:] 

def get_scores_tfidf(q_idx):
    q_vec = question_tfidf[q_idx]
    scores = cosine_similarity(q_vec, section_tfidf)[0]
    return scores

def rank_sections_tfidf(q_idx):
    q_vec = question_tfidf[q_idx]
    sims = cosine_similarity(q_vec, section_tfidf)[0]
    ranked = np.argsort(-sims)
    return ranked

def top_n_sections_tfidf(q_idx, n_sections):
    ranks = rank_sections_tfidf(q_idx)
    return ranks[:n_sections]

# DistilBERT embedding baseline

In [8]:
@torch.no_grad()
def encode_texts(texts, max_length=256):
    all_embs = []
    batch_size = 8
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        tokens = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        outputs = distilbert(**tokens)
        hidden = outputs.last_hidden_state

        # Compute mean embedding
        mask = tokens["attention_mask"].unsqueeze(-1)
        emb = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
        all_embs.append(emb.cpu().numpy())

    return np.vstack(all_embs)


In [9]:
def get_scores_embedding(q_idx):
    section_texts = sections.text.tolist()
    question_texts = questions.question.tolist()
    section_embeddings = encode_texts(section_texts)
    question_embeddings = encode_texts(question_texts)
    q_vec = question_embeddings[q_idx:q_idx+1]
    scores = cosine_similarity(q_vec, section_embeddings)[0]    
    return scores

def rank_sections_embedding(q_idx):
    section_texts = sections.text.tolist()
    question_texts = questions.question.tolist()
    section_embeddings = encode_texts(section_texts)
    question_embeddings = encode_texts(question_texts)
    q_vec = question_embeddings[q_idx:q_idx+1]
    sims = cosine_similarity(q_vec, section_embeddings)[0]
    ranked = np.argsort(-sims)
    return ranked

def top_n_sections_embedding(q_idx, n_sections):
    ranks = rank_sections_embedding(q_idx)
    return ranks[:n_sections]

# Attention-Based Method

In [10]:
@torch.no_grad()
def attention_scores_for_question(question_text, section_texts, max_length=256, batch_size=8, return_attention=False):
    num_sections = len(section_texts)
    num_layers = distilbert.config.num_hidden_layers
    num_heads = distilbert.config.n_heads 

    if not return_attention:
        scores = np.zeros((num_layers, num_heads, num_sections), dtype=np.float32)
    else:
        q2s_blocks = [
            [[None for _ in range(num_sections)] for _ in range(num_heads)]
            for _ in range(num_layers)
        ]
        s2q_blocks = [
            [[None for _ in range(num_sections)] for _ in range(num_heads)]
            for _ in range(num_layers)
        ]
        q2q_blocks = [
            [[None for _ in range(num_sections)] for _ in range(num_heads)]
            for _ in range(num_layers)
        ]
        s2s_blocks = [
            [[None for _ in range(num_sections)] for _ in range(num_heads)]
            for _ in range(num_layers)
        ]
        
    for start in range(0, num_sections, batch_size):
        end = min(start + batch_size, num_sections)
        num_sections_in_batch = end - start
        batch_sections = section_texts[start:end]
        batch_question = [question_text] * num_sections_in_batch

        tokens = tokenizer(
            batch_question,
            batch_sections,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        outputs = distilbert(**tokens, output_attentions=True)
        attentions = outputs.attentions
        input_ids = tokens["input_ids"]

        for i in range(num_sections_in_batch):
            section_idx = start + i
            
            ids = input_ids[i]
            sep_id = tokenizer.sep_token_id
            sep_positions = (ids == sep_id).nonzero(as_tuple=False).flatten().tolist()

            # should be of the form [CLS] question [SEP] section [SEP]
            first_sep, second_sep = sep_positions[0], sep_positions[1]
            q_idx = list(range(1, first_sep))
            s_idx = list(range(first_sep + 1, second_sep))

            for layer_idx, batch_layer_attentions in enumerate(attentions):
                layer_attentions = batch_layer_attentions[i]
                for head_idx in range(num_heads):
                    head_attention = layer_attentions[head_idx]

                    q2s = head_attention[np.ix_(q_idx, s_idx)]
                    s2q = head_attention[np.ix_(s_idx, q_idx)]

                    if not return_attention:
                        val = q2s.mean().item() + s2q.mean().item()
                        scores[layer_idx, head_idx, section_idx] = val
                    else:
                        q2q = head_attention[np.ix_(q_idx, q_idx)]
                        s2s = head_attention[np.ix_(s_idx, s_idx)]

                        q2s_blocks[layer_idx][head_idx][section_idx] = q2s.cpu().numpy()
                        s2q_blocks[layer_idx][head_idx][section_idx] = s2q.cpu().numpy()
                        q2q_blocks[layer_idx][head_idx][section_idx] = q2q.cpu().numpy()
                        s2s_blocks[layer_idx][head_idx][section_idx] = s2s.cpu().numpy()

    if not return_attention:
        return scores
    else:
        return q2s_blocks, s2q_blocks, q2q_blocks, s2s_blocks

In [11]:
def collect_attention_blocks_stream(questions, sections, path="attention_blocks_stream.pkl",
                                    max_length=256, batch_size=8):
    sec_texts = sections["text"].tolist()
    with open(path, "wb") as f:
        for q_idx, row in questions.reset_index().iterrows():
            print(f"Question {q_idx+1}/{len(questions)}", end="\r")
            q_text = row["question"]
            true_label = row["section"]

            q2s_blocks, s2q_blocks, q2q_blocks, s2s_blocks = attention_scores_for_question(
                q_text,
                sec_texts,
                max_length=max_length,
                batch_size=batch_size,
                return_attention=True
            )

            obj = {
                "question_ind": q_idx,
                "question": q_text,
                "true_sections_label": true_label,
                "q2s": q2s_blocks,
                "s2q": s2q_blocks,
                "q2q": q2q_blocks,
                "s2s": s2s_blocks,
            }

            pickle.dump(obj, f)

        f.flush()
        os.fsync(f.fileno())

    print("\nDone saving.")

def load_attention_blocks_stream(path="attention_blocks_stream.pkl"):
    blocks = []
    with open(path, "rb") as f:
        while True:
            try:
                obj = pickle.load(f)
                blocks.append(obj)
            except EOFError:
                break
    return blocks    

# collect_attention_blocks_stream(questions, sections, path="attention_blocks_stream.pkl")
all_attn_blocks = load_attention_blocks_stream("attention_blocks_stream.pkl")

Question 54/54
Done.


In [129]:
def score_fn(q2s, s2q, q2q=None, s2s=None):
    # q2s_rm = q2s.max(axis=1).mean()
    # s2q_rm = s2q.max(axis=1).mean()
    # row_mean = float((q2s_rm + s2q_rm) / 2.0)
    
    cross_attention = (q2s.mean() + s2q.mean())
    self_attention = (q2q.mean() + s2s.mean())
    return float(cross_attention - self_attention)

def get_scores_attention_from_blocks(q_idx):
    entry = all_attn_blocks[q_idx]
    q2s_blocks = entry["q2s"]
    s2q_blocks = entry["s2q"]
    q2q_blocks = entry['q2q']
    s2s_blocks = entry['s2s']
    
    num_layers = len(q2s_blocks)
    num_heads = len(q2s_blocks[0])
    num_sections = len(q2s_blocks[0][0])

    scores = np.zeros((num_layers, num_heads, num_sections), dtype=np.float32)

    for l in range(num_layers):
        for h in range(num_heads):
            for s in range(num_sections):
                q2s_block = q2s_blocks[l][h][s]
                s2q_block = s2q_blocks[l][h][s]
                q2q_block = q2q_blocks[l][h][s]
                s2s_block = s2s_blocks[l][h][s]
                scores[l, h, s] = score_fn(q2s_block, s2q_block, q2q_block, s2s_block)

    return scores

In [130]:
def get_scores_attention(q_idx):
    q_text = questions.question.tolist()[q_idx]
    sec_texts = sections.text.tolist()

    scores = attention_scores_for_question(
        q_text,
        sec_texts,
        max_length=256,
        batch_size=8
    )  
    return scores

def rank_sections_attention(q_idx):
    q_text = questions.question.tolist()[q_idx]
    sec_texts = sections.text.tolist()

    scores = attention_scores_for_question(
        q_text,
        sec_texts,
        max_length=256,
        batch_size=8
    )

    ranked = np.argsort(-scores) # (layers, heads, ranks)
    return ranked

def top_n_sections_attention(q_idx, n_sections):
    ranks = rank_sections_attention(q_idx)
    return ranks[:n_sections]

# Create And Save Scores

In [131]:
def createScoresDf(save=True):
    num_questions = len(questions)
    all_dfs = []
    for q in range(num_questions):
        attention_scores = get_scores_attention_from_blocks(q)
        
        tfidf_scores = get_scores_tfidf(q)

        results = {'question_ind':q, 'section': sections['section_number'].values,'tfidf': tfidf_scores}
        for l,layer in enumerate(attention_scores):
            for h,head_scores in enumerate(layer):
                results[f"attention_{l}_{h}"] = head_scores
        all_dfs.append(pd.DataFrame(results))
    
    scores = pd.concat(all_dfs)
    if save:
        scores.to_csv('scores.csv', index=False)
    return scores

In [132]:
results = createScoresDf()
results

Unnamed: 0,question_ind,section,tfidf,attention_0_0,attention_0_1,attention_0_2,attention_0_3,attention_0_4,attention_0_5,attention_0_6,...,attention_5_2,attention_5_3,attention_5_4,attention_5_5,attention_5_6,attention_5_7,attention_5_8,attention_5_9,attention_5_10,attention_5_11
0,0,1.1,0.141959,-0.001850,-0.003470,-0.030337,-0.032375,-0.000245,-0.010635,-0.005902,...,-0.010182,-0.005750,-0.016651,-0.012952,-0.002663,-0.004979,-0.013655,-0.005167,-0.004561,-0.008556
1,0,1.1.1.1,0.068223,-0.003319,-0.008449,-0.034703,-0.040875,-0.001435,-0.016360,-0.010439,...,-0.009765,-0.004754,-0.015211,-0.011014,-0.002073,-0.003183,-0.013406,-0.006263,-0.003729,-0.005264
2,0,1.1.2,0.064733,-0.001663,-0.003967,-0.030635,-0.032820,-0.000059,-0.010789,-0.006206,...,-0.010911,-0.006617,-0.016391,-0.012922,-0.002704,-0.005267,-0.016693,-0.005858,-0.004498,-0.008793
3,0,1.1.3,0.133600,-0.001984,-0.004018,-0.030571,-0.032302,-0.000135,-0.010756,-0.006233,...,-0.007984,-0.004231,-0.014007,-0.009952,-0.001873,-0.002845,-0.008911,-0.004988,-0.003308,-0.005151
4,0,1.2,0.200803,-0.002420,-0.012149,-0.047518,-0.055708,-0.003121,-0.024479,-0.017048,...,-0.009816,-0.003648,-0.013043,-0.007568,-0.001790,-0.002649,-0.008857,-0.008659,-0.003660,-0.003903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,53,APPENDIX 11,0.028444,-0.004507,-0.016231,-0.063943,-0.070457,-0.001955,-0.029509,-0.014686,...,-0.018455,-0.009574,-0.021246,-0.013489,-0.001125,-0.004132,-0.038567,-0.016195,-0.008294,-0.006900
111,53,APPENDIX 12,0.016162,-0.004033,-0.005404,-0.054783,-0.054713,-0.000499,-0.020555,-0.011267,...,-0.024386,-0.010943,-0.020207,-0.014436,-0.002883,-0.008998,-0.036962,-0.011540,-0.008568,-0.007757
112,53,APPENDIX 13,0.060854,-0.005041,-0.012235,-0.059614,-0.065577,-0.002125,-0.027671,-0.013460,...,-0.018101,-0.009892,-0.017657,-0.011195,-0.001658,-0.003830,-0.031453,-0.017844,-0.006732,-0.005284
113,53,APPENDIX 14,0.163219,-0.001838,-0.004411,-0.054257,-0.052804,0.000085,-0.018213,-0.008228,...,-0.017061,-0.008809,-0.019528,-0.012614,-0.002043,-0.003192,-0.024365,-0.011895,-0.006417,-0.004770


# Testing & Accuracies

In [133]:
def add_layer_mean_attention_scores(scores):
    attention_cols = [c for c in scores.columns if c.startswith("attention_") and "layer" not in c]
    layer_to_cols = {}

    for col in attention_cols:
        parts = col.split("_")
        if len(parts) != 3:
            continue
        
        _, layer_str, head_str = parts
        layer = int(layer_str)

        if layer not in layer_to_cols:
            layer_to_cols[layer] = []
        layer_to_cols[layer].append(col)

    for layer, cols in layer_to_cols.items():
        layer_col = f"attention_{layer}"
        scores[layer_col] = scores[cols].mean(axis=1)

    return scores

In [134]:
# add_layer_mean_attention_scores(results)

In [135]:
def top_n_accuracy_for_method(scores, questions, method_col, top_n=1):
    num_questions = len(questions)
    correct = 0

    for q_idx in range(num_questions):
        df_q = scores[scores["question_ind"] == q_idx]
        top_rows = df_q[method_col].nlargest(top_n).index
        pred_secs = df_q.loc[top_rows, "section"].tolist()
        true_secs = questions.loc[q_idx, "section"]
        if any(p in true_secs for p in pred_secs):
            correct += 1

    return correct / num_questions


def compute_top_n_accuracy(scores, questions, method_cols=None, top_n = 1):
    if method_cols is None:
        method_cols = [
            c for c in scores.columns
            if c not in ["question_ind", "section"]
        ]

    rows = []
    for method in method_cols:
        acc = top_n_accuracy_for_method(scores, questions, method, top_n)
        rows.append({"method": method, f"top_{top_n}_accuracy": acc})

    return pd.DataFrame(rows).sort_values(f"top_{top_n}_accuracy", ascending=False)


In [143]:
acc_1_df = compute_top_n_accuracy(results, questions, top_n=1)
acc_3_df = compute_top_n_accuracy(results, questions, top_n=3)
acc_5_df = compute_top_n_accuracy(results, questions, top_n=5)

acc_df = (
    acc_1_df
    .merge(acc_3_df[["method", "top_3_accuracy"]], on="method", how="outer")
    .merge(acc_5_df[["method", "top_5_accuracy"]], on="method", how="outer")
).sort_values("top_1_accuracy", ascending=False)

In [144]:
acc_df

Unnamed: 0,method,top_1_accuracy,top_3_accuracy,top_5_accuracy
72,tfidf,0.500000,0.814815,0.851852
20,attention_1_6,0.444444,0.685185,0.703704
66,attention_5_4,0.370370,0.518519,0.574074
11,attention_0_9,0.370370,0.592593,0.685185
57,attention_4_7,0.333333,0.555556,0.703704
...,...,...,...,...
17,attention_1_3,0.000000,0.000000,0.000000
19,attention_1_5,0.000000,0.055556,0.092593
21,attention_1_7,0.000000,0.074074,0.092593
23,attention_1_9,0.000000,0.037037,0.092593
