In [None]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from torch.nn.functional import cosine_similarity

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Evaluation mode (e.g. dropout off) so repeated runs give the same embeddings

# Load dataset
file_path = "/content/KASH TO ENG.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Keep only rows where both the system output and the reference are present.
# An empty spreadsheet cell is read as NaN, and `.astype(str)` would turn it
# into the literal text "nan", which would then be embedded and scored as if
# it were a real sentence. `df` itself is left untouched.
paired_df = df.dropna(subset=["Translated KASH TO ENG", "Ground Truth"])

# Extract translations and references. Both lists come from the same filtered
# frame, so element i of each list belongs to the same spreadsheet row.
predictions = paired_df["Translated KASH TO ENG"].astype(str).tolist()
references = paired_df["Ground Truth"].astype(str).tolist()

def get_bert_embedding(text):
    """Embed ``text`` with BERT by averaging the final-layer token vectors.

    The average is weighted by the tokenizer's attention mask, so only the
    positions the mask marks with a non-zero value contribute to the result.
    Input longer than 512 tokens is truncated by the tokenizer.

    Args:
        text: Input handed straight to the module-level ``tokenizer``
            (callers in this notebook pass one string at a time).

    Returns:
        The pooled embedding tensor with a size-1 leading dimension squeezed
        away, i.e. a single vector when one string is passed.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state

    # Trailing axis lets the mask broadcast across the hidden dimension.
    mask = encoded["attention_mask"].unsqueeze(-1)
    summed_states = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1)

    return (summed_states / token_counts).squeeze(0)

# Embed each sentence on its own, then stack the per-sentence vectors so that
# row i of each tensor corresponds to sentence i.
pred_embeddings = torch.stack(list(map(get_bert_embedding, predictions)))
ref_embeddings = torch.stack(list(map(get_bert_embedding, references)))

# Cosine similarity along the embedding axis: one score per prediction/reference pair.
similarity_scores = cosine_similarity(pred_embeddings, ref_embeddings, dim=1).tolist()

# Unweighted mean over all sentence pairs.
average_similarity = sum(similarity_scores) / len(similarity_scores)

print(f"✅ Average BERT Similarity: {average_similarity:.4f}")


In [None]:


import pandas as pd
import nltk
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction

# Load Excel file
df = pd.read_excel("/content/KASH TO ENG.xlsx")

# Score only rows that have both a reference and a translation. An empty cell
# is read as NaN, and `.astype(str)` would turn it into the literal text "nan",
# which would enter the BLEU statistics as a bogus one-token sentence.
paired_df = df.dropna(subset=["Ground Truth", "Translated KASH TO ENG"])

# Both lists come from the same filtered frame, so they stay row-aligned.
references = paired_df["Ground Truth"].astype(str).tolist()
hypotheses = paired_df["Translated KASH TO ENG"].astype(str).tolist()

# Whitespace tokenization. Each reference is wrapped in its own list because
# corpus_bleu takes a list of references per hypothesis; here there is one.
references_tokenized = [[ref.split()] for ref in references]
hypotheses_tokenized = [hyp.split() for hyp in hypotheses]

# Calculate corpus BLEU score (smoothed with NLTK's SmoothingFunction.method1)
bleu_score = corpus_bleu(references_tokenized, hypotheses_tokenized,
                         smoothing_function=SmoothingFunction().method1)

# corpus_bleu's result is scaled by 100 for display.
print(f"Corpus BLEU score: {bleu_score * 100:.2f}")


In [None]:


import pandas as pd
import sacrebleu

df = pd.read_excel("/content/KASH TO ENG.xlsx")

# Score only rows that have both a reference and a translation. An empty cell
# is read as NaN, and `.astype(str)` would turn it into the literal text "nan",
# which would then be scored as if it were a real sentence.
paired_df = df.dropna(subset=["Ground Truth", "Translated KASH TO ENG"])

# Both lists come from the same filtered frame, so they stay row-aligned.
references = paired_df["Ground Truth"].astype(str).tolist()
hypotheses = paired_df["Translated KASH TO ENG"].astype(str).tolist()

# Raw strings are passed as-is. The references are wrapped in an outer list
# because the second argument is a list of reference sets; here there is one.
score = sacrebleu.corpus_bleu(hypotheses, [references])

print(f"SacreBLEU score: {score.score:.2f}")


In [None]:
import torch
from transformers import BertTokenizer, BertModel
from torch.nn.functional import cosine_similarity


# Tokenizer and encoder come from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# `.eval()` returns the module itself, so the model is switched to evaluation
# mode and bound in one step.
model = BertModel.from_pretrained("bert-base-uncased").eval()

# Reference sentences; entry i is compared against predictions[i] below.
references = [
    "In a film that acts as both a musical homage and a cultural documentation, Songs of Paradise showcases the life and artistic contributions of Raj Begum, a woman renowned by many as the 'Melody Queen' of Kashmir.",
    "The breathtaking beauty of Kashmir has always been a major draw for filmmakers, with the valley featuring in many popular Bollywood movies. Betaab is one such movie that was set in the valley and became immensely popular with audience.",
    "The beginning and the end of cinema halls in Kashmir is linked with the turmoil it has witnessed for about nine decades now.",
]

# Sentences to be scored, in the same order as `references`.
predictions = [
    "In a film that serves as both a musical tribute and a cultural record, Songs of Paradise highlights the life and artistic contributions of Raj Begum, a woman who is considered by many as the 'Melody Queen' of Kashmir.",
    "The remarkable beauty of Kashmir has continually drawn filmmakers, and the region has appeared in numerous popular Bollywood films. Bettab is one such movie set in the valley that has achieved great popularity among viewers.",
    "The history of movie theaters in Kashmir is connected to the turmoil that has persisted for almost ninety years.",
]

def get_bert_embedding(text):
    """Embed ``text`` with BERT by averaging the final-layer token vectors.

    The average is weighted by the tokenizer's attention mask, so only the
    positions the mask marks with a non-zero value contribute to the result.
    Input longer than 512 tokens is truncated by the tokenizer.

    Args:
        text: Input handed straight to the module-level ``tokenizer``
            (callers in this notebook pass one string at a time).

    Returns:
        The pooled embedding tensor with a size-1 leading dimension squeezed
        away, i.e. a single vector when one string is passed.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state

    # Trailing axis lets the mask broadcast across the hidden dimension.
    mask = encoded["attention_mask"].unsqueeze(-1)
    summed_states = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1)

    return (summed_states / token_counts).squeeze(0)

# Embed each sentence on its own, then stack the per-sentence vectors so that
# row i of each tensor corresponds to sentence i.
pred_embeddings = torch.stack(list(map(get_bert_embedding, predictions)))
ref_embeddings = torch.stack(list(map(get_bert_embedding, references)))

# Cosine similarity along the embedding axis: one score per prediction/reference pair.
similarity_scores = cosine_similarity(pred_embeddings, ref_embeddings, dim=1).tolist()

# Per-sentence report, numbered from 1.
for i, score in enumerate(similarity_scores, start=1):
    print(f"Sentence {i} Similarity: {score:.4f}")

# Unweighted mean over all sentence pairs.
average_similarity = sum(similarity_scores) / len(similarity_scores)
print(f"\n✅ Average BERT Similarity: {average_similarity:.4f}")
