# Library

In [8]:
import pandas as pd
import torch
import string

In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Resolve the compute device first so the model and the tensors built in
# later cells always end up on the same device. Falls back to CPU when CUDA
# is unavailable instead of crashing on a hardcoded "cuda".
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and seq2seq checkpoint used to generate Stack Overflow titles.
tokenizer = AutoTokenizer.from_pretrained("NTUYG/SOTitle-Gen-T5")
# Chaining .to(DEVICE) onto the assignment keeps the cell from ending on a
# bare expression, so the notebook does not print the full model repr.
SOTitle_model = AutoModelForSeq2SeqLM.from_pretrained("NTUYG/SOTitle-Gen-T5").to(DEVICE)


In [19]:
#DEVICE = torch.device('cpu')
def get_title(input):
    """Sample candidate titles for a piece of text with the SOTitle model.

    Args:
        input: Text passed to the tokenizer. It is truncated/padded to 512
            tokens before generation.

    Returns:
        list[str]: One decoded string per returned sequence (20 are
        requested). When a decoded title ends with a punctuation character,
        a single space is inserted before that final character. Titles that
        decode to an empty string are returned unchanged.
    """
    encoded = tokenizer(
        input,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    summary_text_ids = SOTitle_model.generate(
        input_ids=encoded["input_ids"].to(DEVICE),
        attention_mask=encoded["attention_mask"].to(DEVICE),
        bos_token_id=SOTitle_model.config.bos_token_id,
        eos_token_id=SOTitle_model.config.eos_token_id,
        # NOTE(review): length_penalty is documented for beam-based
        # generation; with num_beams=1 it presumably has no effect — confirm.
        length_penalty=1.2,
        top_k=5,
        top_p=0.95,
        max_length=48,
        min_length=2,
        num_beams=1,  # single beam: candidates come from sampling, not beam search
        do_sample=True,  # enable top-k / top-p sampling
        num_return_sequences=20,  # number of sampled candidates per input
    )

    titles = []
    for token_ids in summary_text_ids:
        title = tokenizer.decode(token_ids, skip_special_tokens=True)
        # Guard against an empty decode (e.g. only special tokens were
        # produced): indexing title[-1] on "" would raise IndexError.
        if title and title[-1] in string.punctuation:
            # Detach trailing punctuation from the last word.
            title = title[:-1] + " " + title[-1]
        titles.append(title)
    return titles

In [11]:
# Load the two validation CSVs from the local Dataset folder, in order:
# df1 <- valid_based_data.csv, df2 <- processed_valid_data.csv.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Dataset/valid_based_data.csv",
        './Dataset/processed_valid_data.csv',
    )
)

# Data Augmentation

In [20]:
def TextRank_score_titles(original_text, generated_titles, damping_factor=0.23):
    """Rank candidate titles by how much of the source vocabulary they cover.

    Despite the name, no graph ranking is performed: each title is scored by
    the fraction of the source text's distinct words (lower-cased, ASCII
    punctuation removed, whitespace-split) that also occur in the title.

    Args:
        original_text: Source text the titles were generated from.
        generated_titles: Iterable of candidate title strings.
        damping_factor: Weight of the overlap term; the score is
            ``(1 - damping_factor) + damping_factor * overlap``.

    Returns:
        list[tuple[str, float]]: ``(title, score)`` pairs sorted by score,
        highest first. Titles with equal scores keep their input order. If
        the source text contains no words, the overlap is 0 for every title.
    """
    # Build the punctuation-stripping table once and reuse it for every text.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def preprocess_text(text):
        return set(text.lower().translate(strip_punctuation).split())

    source_words = preprocess_text(original_text)
    baseline = 1 - damping_factor

    ranked = []
    for candidate in generated_titles:
        candidate_words = preprocess_text(candidate)
        if source_words:
            coverage = len(source_words & candidate_words) / len(source_words)
        else:
            coverage = 0
        ranked.append((candidate, baseline + damping_factor * coverage))

    # sorted() is stable, so ties stay in generation order even with reverse=True.
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)

In [25]:
def augment_data(input_df, valid_df):
    """Attach the best generated title for each input row to a copy of ``valid_df``.

    For every row of ``input_df`` the text in its ``'input'`` column is passed
    to ``get_title`` to produce candidates, the candidates are scored with
    ``TextRank_score_titles``, and the top-scoring title is kept.

    Args:
        input_df: DataFrame with an ``'input'`` column holding the source text.
        valid_df: DataFrame that receives the new ``'SOTitle title'`` column.
            It must have the same number of rows as ``input_df``; titles are
            assigned positionally. The caller's frame is not modified.

    Returns:
        pandas.DataFrame: A copy of ``valid_df`` with a ``'SOTitle title'``
        column. A row's value is ``None`` if no candidate was scored for it.

    Raises:
        ValueError: If ``input_df`` and ``valid_df`` differ in row count.
    """
    # Fail fast: without this check a length mismatch only surfaces at the
    # final column assignment, after every row has been run through the model.
    if len(input_df) != len(valid_df):
        raise ValueError(
            f"input_df has {len(input_df)} rows but valid_df has {len(valid_df)}; "
            "they must match to assign one title per row"
        )

    # Explicitly create a copy to avoid SettingWithCopyWarning
    valid_df = valid_df.copy()
    good_titles = []

    for index, row in input_df.iterrows():
        print(index)  # progress indicator
        original_text = row['input']

        # Generate candidate titles, then rank them against the source text.
        generated_titles = get_title(original_text)
        scored_titles = TextRank_score_titles(original_text, generated_titles)

        # Keep the highest-scoring title; use None when nothing was generated
        # so the column stays aligned with the rows.
        good_titles.append(scored_titles[0][0] if scored_titles else None)

    valid_df.loc[:, 'SOTitle title'] = good_titles

    return valid_df

In [None]:
# Write the augmented validation frame to CSV, omitting the index column.
# NOTE(review): `augmented_valid` is not assigned in any cell visible here;
# presumably it is the return value of augment_data(...) — confirm the cell
# that produces it exists and runs before this one on a fresh kernel.
# NOTE(review): the inputs are read from ./Dataset/ but this writes under
# /kaggle/working/ — confirm the output directory exists in the target environment.
#augmented_train.to_csv('/kaggle/working/augmented_train.csv', index=False)
augmented_valid.to_csv('/kaggle/working/augmented_valid.csv', index=False)