In [35]:
from encoder_layer import Encoder_block
from positional_encoding import Positional_Encoding
from transformers import BertTokenizerFast, DistilBertModel, DistilBertTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- TOKENIZER (MUST BE SAME AS TRAINING) ---
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
    model_max_length=256
)

# --- DISTILBERT EMBEDDINGS ---
bert_emb = DistilBertModel.from_pretrained("distilbert-base-uncased").get_input_embeddings()

# --- PROJECTION (YOU USED THIS IN TRAINING) ---
projection = nn.Linear(768, 512)

d_model = 512
num_heads = 8
d_ff = 2048

encoder = Encoder_block(d_model=d_model, num_heads=num_heads, d_ff=d_ff)
classifier = nn.Linear(d_model, 3)

# --- LOAD CHECKPOINT ---
ckpt = torch.load("restored_model.pt", map_location="cpu")
encoder.load_state_dict(ckpt["encoder_state_dict"])
classifier.load_state_dict(ckpt["classifier_state_dict"])
projection.load_state_dict(ckpt["projection_state_dict"])

device = "cuda" if torch.cuda.is_available() else "cpu"
encoder.to(device)
classifier.to(device)
projection.to(device)
bert_emb.to(device)

encoder.eval()
classifier.eval()

def extract_keyphrases(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        x = bert_emb(input_ids)
        x = projection(x)
        encoder_out, attn_weights = encoder(x, mask=attention_mask)
        logits = classifier(encoder_out)
        preds = torch.argmax(logits, dim=-1)[0].cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    phrases = []
    current = []

    for tok, tag in zip(tokens, preds):
        tok = tok.replace("##", "")
        if tok in ["[CLS]","[SEP]","[PAD]"]:
            continue

        if tag == 1:  # B
            if current:
                phrases.append(" ".join(current))
            current = [tok]
        elif tag == 2:  # I
            if current:
                current.append(tok)
        else:
            if current:
                phrases.append(" ".join(current))
                current = []

    if current:
        phrases.append(" ".join(current))

    return phrases

In [6]:
text = "Machine learning models are widely used for prediction tasks."
print(extract_keyphrases(text))

['machine', 'learning models', 'are', 'widely', 'used', 'for prediction', 'tasks', '.']


In [7]:
phrases = extract_keyphrases("Transformers use attention to process sequences efficiently.")
print(phrases)

['transformers', 'use', 'attention', 'to', 'efficiently', '.']


In [8]:
texts = [
    "Neural networks are powerful models.",
    "Self-attention changed NLP forever.",
    "Transformers replaced RNN-based architectures."
]

for t in texts:
    print(t, "=>", extract_keyphrases(t))

Neural networks are powerful models. => ['neural', 'networks', 'are', '.']
Self-attention changed NLP forever. => ['attention changed', 'p forever', '.']
Transformers replaced RNN-based architectures. => ['transformers', 'rn n', 'based', 'architecture', '.']


In [9]:
def debug_token_predictions(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        x = bert_emb(input_ids)
        x = projection(x)
        encoder_out, _ = encoder(x, mask=attention_mask)
        logits = classifier(encoder_out)
        preds = torch.argmax(logits, dim=-1)[0].cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    print("\n--- DEBUG TOKEN PREDICTIONS ---")
    for tok, pid, pred in zip(tokens, input_ids[0].tolist(), preds):
        print(f"{tok:15}  ID={pid:5}  →  Pred={pred}")
    print("--------------------------------\n")

In [10]:
debug_token_predictions("Neural networks are powerful models.")


--- DEBUG TOKEN PREDICTIONS ---
[CLS]            ID=  101  →  Pred=0
neural           ID=15756  →  Pred=1
networks         ID= 6125  →  Pred=1
are              ID= 2024  →  Pred=1
powerful         ID= 3928  →  Pred=0
models           ID= 4275  →  Pred=2
.                ID= 1012  →  Pred=1
[SEP]            ID=  102  →  Pred=1
--------------------------------



In [14]:
id2tag = {0: "O", 1: "B", 2: "I"}

def predict_sentence(tokens):
    text = " ".join(tokens)
    encoded = tokenizer(text, return_tensors="pt", is_split_into_words=False)

    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        x = bert_emb(input_ids)
        x = projection(x)
        enc_out, _ = encoder(x, mask=attention_mask)
        logits = classifier(enc_out)
        preds = torch.argmax(logits, dim=-1)[0].tolist()

    tokens_decoded = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    clean_output = []
    for tok, pred in zip(tokens_decoded, preds):
        if tok in ["[CLS]", "[SEP]"]:
            continue
        clean_output.append((tok, pred, id2tag[pred]))

    return clean_output

In [15]:
test_tokens = ["the", "double", "helix", "structure", "of", "dna"]
print(predict_sentence(test_tokens))

[('the', 1, 'B'), ('double', 1, 'B'), ('helix', 2, 'I'), ('structure', 1, 'B'), ('of', 0, 'O'), ('dna', 1, 'B')]


In [16]:
sentence = "The double helix structure of DNA was published in 1953"
tokens = sentence.lower().split()
predict_sentence(tokens)

[('the', 1, 'B'),
 ('double', 1, 'B'),
 ('helix', 2, 'I'),
 ('structure', 1, 'B'),
 ('of', 0, 'O'),
 ('dna', 1, 'B'),
 ('was', 1, 'B'),
 ('published', 0, 'O'),
 ('in', 0, 'O'),
 ('1953', 1, 'B')]

In [17]:
def align_labels(batch):
    new_batch_labels = []

    for tags, word_ids in zip(batch["doc_bio_tags"], batch["word_ids"]):
        tags = [tag2id[t] for t in tags]
        aligned = []

        prev_wid = None
        for wid in word_ids:
            if wid is None:
                aligned.append(-100)
            else:
                if wid != prev_wid:
                    aligned.append(tags[wid])   # use true tag
                else:
                    if tags[wid] == 0:
                        aligned.append(0)
                    else:
                        aligned.append(2)
            prev_wid = wid
        new_batch_labels.append(aligned)

    batch["labels"] = new_batch_labels
    return batch

In [20]:
from transformers import DistilBertTokenizerFast, DistilBertModel
import torch
import torch.nn as nn
from encoder_layer import Encoder_block

device = "cuda" if torch.cuda.is_available() else "cpu"

# --- 1) Correct Tokenizer ---
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# --- 2) Correct Embedding ---
bert_emb = DistilBertModel.from_pretrained("distilbert-base-uncased").get_input_embeddings().to(device)

# --- 3) Load your modules ---
d_model = 512
num_heads = 8
d_ff = 2048

projection = nn.Linear(768, 512)
encoder = Encoder_block(d_model=d_model, num_heads=num_heads, d_ff=d_ff)
classifier = nn.Linear(512, 3)

ckpt = torch.load("restored_model.pt", map_location=device)

projection.load_state_dict(ckpt["projection_state_dict"])
encoder.load_state_dict(ckpt["encoder_state_dict"])
classifier.load_state_dict(ckpt["classifier_state_dict"])

projection.to(device)
encoder.to(device)
classifier.to(device)

encoder.eval()
classifier.eval()

STOPWORDS = {"the", "a", "an", "is", "am", "are", "was", "were", "be", "to", "in", "of"}

def clean_phrase(p):
    words = p.split()
    words = [w for w in words if w.lower() not in STOPWORDS]
    return " ".join(words).strip()

def extract_keyphrases(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        x = bert_emb(input_ids)
        x = projection(x)
        enc_out, attn = encoder(x, mask=attention_mask)
        logits = classifier(enc_out)
        preds = torch.argmax(logits, dim=-1)[0].cpu().tolist()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    phrases = []
    curr = []

    for tok, p in zip(tokens, preds):
        if tok in ["[CLS]", "[SEP]"]:
            continue

        tok = tok.replace("##", "")

        if p == 1:  # B
            if curr:
                phrases.append(clean_phrase(" ".join(curr)))
            curr = [tok]

        elif p == 2:  # I
            curr.append(tok)

        else:  # O
            if curr:
                phrases.append(clean_phrase(" ".join(curr)))
                curr = []

    if curr:
        phrases.append(clean_phrase(" ".join(curr)))

    # remove empty
    phrases = [p for p in phrases if p]

    return phrases

In [21]:
print(extract_keyphrases("The double helix structure of DNA was published in 1953"))

['double helix', 'structure', 'dna', '1953']


In [23]:
print(extract_keyphrases("Artificial intelligence is transforming modern healthcare through advanced innovative models."))

['intelligence', 'transforming modern', 'healthcare through', 'advanced', 'models', '.']


In [24]:
texts = [
    "Neural networks are powerful models.",
    "Self-attention changed NLP forever.",
    "Transformers replaced RNN-based architectures."
]

for t in texts:
    print(t, "=>", extract_keyphrases(t))

Neural networks are powerful models. => ['neural', 'networks', 'models', '.']
Self-attention changed NLP forever. => ['attention changed', 'p forever', '.']
Transformers replaced RNN-based architectures. => ['transformers', 'rn n', 'based', 'architecture', '.']


In [32]:
###############################

In [47]:
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    with torch.no_grad():
        x = bert_emb(input_ids)
        x = projection(x)
        enc_out, _ = encoder(x, mask=attention_mask)

        # Mean pooling (mask-aware)
        mask_expanded = attention_mask.unsqueeze(-1).expand(enc_out.size())
        sum_emb = torch.sum(enc_out * mask_expanded, dim=1)
        length = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
        sentence_embedding = sum_emb / length

    return sentence_embedding

mlp = nn.Sequential(
    nn.Linear(512*3, 256),
    nn.ReLU(),
    nn.Linear(256, 1),
    nn.Sigmoid()
).to(device)

mlp.eval()

def similarity(sent1, sent2):
    e1 = get_sentence_embedding(s1)
    e2 = get_sentence_embedding(s2)
    
    cos = torch.nn.functional.cosine_similarity(e1, e2)
    
    abs_diff = torch.abs(e1 - e2)
    
    mlp_input = torch.cat([e1, e2, abs_diff], dim=-1)
    score = mlp(mlp_input)
    return score

In [48]:
s1 = "Artificial intelligence is transforming the world."
s2 = "AI is changing how we live."

score = similarity(s1, s2)
print("Similarity Score:", score)

Similarity Score: tensor([[0.5044]], device='cuda:0', grad_fn=<SigmoidBackward0>)


In [49]:
s1 = "The moon orbits around the Earth and reflects sunlight."
s2 = "Fresh vegetables should be stored in a cool and dry place."

score = similarity(s1, s2)
print("Similarity Score:", score)

Similarity Score: tensor([[0.4882]], device='cuda:0', grad_fn=<SigmoidBackward0>)


In [50]:
s1 = "Deep learning is revolutionizing computer vision."
s2 = "Neural networks are transforming the field of computer vision."

print("Similarity Score:", similarity(s1, s2))

s1 = "The cat slept peacefully on the sofa."
s2 = "Quantum mechanics describes the behavior of particles."

print("Similarity Score:", similarity(s1, s2))

Similarity Score: tensor([[0.5065]], device='cuda:0', grad_fn=<SigmoidBackward0>)
Similarity Score: tensor([[0.4914]], device='cuda:0', grad_fn=<SigmoidBackward0>)


In [53]:
a1= "Climate change is affecting global weather patterns."
a2= "Environmental changes are impacting weather across the world."

print("Similarity Score:", similarity(a1, a2))

s1= "I am planning to travel to Japan next month."
s2= "The bakery sells fresh bread every morning."

print("Similarity Score:", similarity(s1, s2))

Similarity Score: tensor([[0.4914]], device='cuda:0', grad_fn=<SigmoidBackward0>)
Similarity Score: tensor([[0.4909]], device='cuda:0', grad_fn=<SigmoidBackward0>)


In [54]:
a1= "He bought a new smartphone yesterday."
a2= "Yesterday, he purchased a brand-new phone."

print("Similarity Score:", similarity(a1, a2))

s1= "The ocean waves were calm and soothing."
s2= "He solved a complex algebraic equation in seconds."

print("Similarity Score:", similarity(s1, s2))

Similarity Score: tensor([[0.4909]], device='cuda:0', grad_fn=<SigmoidBackward0>)
Similarity Score: tensor([[0.4933]], device='cuda:0', grad_fn=<SigmoidBackward0>)


In [57]:
def emb(sent):
    inputs = tokenizer(sent, return_tensors="pt", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        x = bert_emb(inputs["input_ids"])
        x = projection(x)
        enc_out, _ = encoder(x, mask=inputs["attention_mask"])
        # sent_emb = enc_out.mean(dim=1)
        sent_emb = enc_out[:, 0, :] 
    return sent_emb

def cosine_sim(a, b):
    return torch.nn.functional.cosine_similarity(a, b).item()

print(cosine_sim(emb(s1), emb(s2)))

0.9991123676300049


In [58]:
print(cosine_sim(emb("The cat is sleeping."), emb("The stock market crashed yesterday.")))

0.9988912343978882
