In [1]:
import pandas as pd
from sympy import im
import torch
from datasets import Dataset
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import string
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np




In [8]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [2]:
model_path = "./Result/result/checkpoint-36450"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy = False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model.to("cuda")

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [3]:
df_test = pd.read_csv("./Dataset/Processed data/processed_test_data.csv")

In [4]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = ["\n".join(pred.strip().split(". ")) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split(". ")) for label in decoded_labels]

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(pred, ref) for pred, ref in zip(decoded_preds, decoded_labels)]

    rouge1_f1 = sum(s["rouge1"].fmeasure for s in scores) / len(scores)
    rouge2_f1 = sum(s["rouge2"].fmeasure for s in scores) / len(scores)
    rougeL_f1 = sum(s["rougeL"].fmeasure for s in scores) / len(scores)

    return {
        "rouge1": rouge1_f1,
        "rouge2": rouge2_f1,
        "rougeL": rougeL_f1,
    }

In [17]:
def get_title_mmr(input_text, tokenizer, model, lambda_param=0.5, top_k=1):
    # 1. Generate a large pool of candidates
    input_ids = tokenizer(input_text, return_tensors="pt", max_length=512, padding="max_length", truncation=True)
    candidate_ids = model.generate(
        input_ids=input_ids["input_ids"].to('cuda'),
        attention_mask=input_ids["attention_mask"].to('cuda'),
        length_penalty=1.2,
        max_length=48,
        num_return_sequences=20, # Generate 20 to pick from
        do_sample=True,
        top_p=0.95, 
        top_k=50
    )
    
    candidates = [tokenizer.decode(c, skip_special_tokens=True) for c in candidate_ids]
    candidates = list(set(candidates)) # Remove exact duplicates
    
    # 2. Embed input and candidates
    query_embedding = embedder.encode([input_text])
    candidate_embeddings = embedder.encode(candidates)
    
    # 3. MMR Algorithm
    selected_indices = []
    unselected_indices = list(range(len(candidates)))
    
    # Calculate similarities once
    doc_similarities = cosine_similarity(candidate_embeddings, query_embedding).flatten()
    
    for _ in range(min(top_k, len(candidates))):
        mmr_scores = []
        for idx in unselected_indices:
            # Relevance: similarity to document
            relevance = doc_similarities[idx]
            
            # Diversity: similarity to already selected items
            if not selected_indices:
                diversity = 0
            else:
                diversity = max(cosine_similarity([candidate_embeddings[idx]], 
                                                 candidate_embeddings[selected_indices])[0])
            
            # MMR Formula: lambda * Relevance - (1 - lambda) * Diversity
            score = lambda_param * relevance - (1 - lambda_param) * diversity
            mmr_scores.append(score)
            
        best_idx = unselected_indices[np.argmax(mmr_scores)]
        selected_indices.append(best_idx)
        unselected_indices.remove(best_idx)
        
    return [candidates[i] for i in selected_indices]

In [18]:
df_test['input'][0]

"C#: Basically NUnit , xUnit , MbUnit , MsTest and the like have methods similar to the following : However , there are a limited number of such comparison operators built-in ; and they duplicate the languages operators needlessly . When I want anything even slightly complex , such as ... I 'm often either left digging through the manual to find the equivalent of the expression in NUnit-speak , or am forced to fall-back to plain boolean assertions with less helpful error messages.C # , however , integrates well with arbitrary Expressions - so it should be possible to have a method with the following signature : Such a method could be used to both execute the test ( i.e . validate the assertion ) and to also provide less-opaque diagnostics in case of test failure ; after all , an expression can be rendered to pseudo-code to indicate which expression failed ; and with some effort , you could even evaluate failing expressions intelligently to give some clue of the value of subexpressions.

In [22]:
df_test['pred'] = df_test['input'].apply(lambda x: get_title_mmr(x, tokenizer, model))

In [34]:
def get_grouped_rouge_metrics(df, prediction_col, reference_col):
    all_tag_metrics = {}
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    grouped = df.groupby('tag')

    for tag, group_df in grouped:
        predictions = group_df[prediction_col].tolist()
        references = group_df[reference_col].tolist()

        processed_predictions = ["\n".join(pred[0].strip().split(". ")) for pred in predictions]
        processed_references = ["\n".join(label.strip().split(". ")) for label in references]

        scores = [scorer.score(pred, ref) for pred, ref in zip(processed_predictions, processed_references)]

        rouge1_f1 = sum(s["rouge1"].fmeasure for s in scores) / len(scores) if scores else 0
        rouge2_f1 = sum(s["rouge2"].fmeasure for s in scores) / len(scores) if scores else 0
        rougeL_f1 = sum(s["rougeL"].fmeasure for s in scores) / len(scores) if scores else 0

        all_tag_metrics[tag] = {
            "rouge1": rouge1_f1*100,
            "rouge2": rouge2_f1*100,
            "rougeL": rougeL_f1*100,
        }
    return all_tag_metrics



In [35]:
get_grouped_rouge_metrics(df_test, 'pred', 'title')

{'C#': {'rouge1': 33.694448935561326,
  'rouge2': 12.827051861614075,
  'rougeL': 28.340302640282435},
 'JS': {'rouge1': 34.524836357667915,
  'rouge2': 12.567505409623323,
  'rougeL': 29.041498868026665},
 'Java': {'rouge1': 33.265439662715664,
  'rouge2': 11.811701439511568,
  'rougeL': 27.612675718577805},
 'Python': {'rouge1': 37.70961592708645,
  'rouge2': 14.227529200179772,
  'rougeL': 31.298282650549}}

In [11]:
def get_title(input, tokenizer, model):
    input_ids = tokenizer(input,return_tensors="pt", max_length=512, padding="max_length", truncation=True)
    summary_text_ids = model.generate(
        input_ids=input_ids["input_ids"].to('cuda'),
        attention_mask=input_ids["attention_mask"].to('cuda'),
        bos_token_id= model.config.bos_token_id,
        eos_token_id= model.config.eos_token_id,
        length_penalty=1.2,
        top_k=5,
        top_p=0.95,
        max_length=48,
        min_length=2,
        num_beams=1, # Changed to 1 for sampling
        do_sample=True, # Added to enable sampling
        num_return_sequences=20, # Changed to 1 for sampling
    )
    titles = []
    for i in summary_text_ids:
        title = tokenizer.decode(i, skip_special_tokens=True)
        if(title[-1] in string.punctuation):
          title = title[:-1] + " " +title[-1]
        titles.append(title)
    return titles

In [12]:
get_title(df_test['input'][0], tokenizer, model)

The following generation flags are not valid and may be ignored: ['length_penalty']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


['Is there a way to have a method with a boolean signature in C #  ?',
 "Is there any way to have a method with the '' signature of an Expression  ?",
 'Is it possible to use a method that is a generic equivalent of a NUnit expression  ?',
 'Can arbitrary expressions be used to evaluate a method  ?',
 'Is there a way to have a method to compare a string to a boolean expression  ?',
 "Does there exist a method that compares two expressions ' expressions in c #  ?",
 'Using a boolean expression to compare two Expressions  ?',
 'How to find a way to compare expressions in c #  ?',
 'Is there a way to use a method to compare a boolean expression to a boolean expression in C  #',
 'Is there an equivalent of a NUnit expression in C #  ?',
 'Is there a way to use an expression in C # with arbitrary expressions  ?',
 'Does XUnit have a way to have an expression with a boolean signature  ?',
 'Does a method have a generic syntax for comparison of a nullable expression  ?',
 'Does NUnit have a m