In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Part 1

In [2]:
df = pd.read_csv('../data/satirical_news_articles_v2.csv')

In [None]:
df.columns

In [None]:
print(df.body[0])

In [6]:
# Drop unnecessary columns
df = df.drop(['url', 'image_link'], axis=1)

# Generate random 8 digit non-repeating numbers for docids
n_docs = len(df)
docids = np.random.choice(range(10000000, 100000000), size=n_docs, replace=False)
df['docid'] = docids

# Reorder columns to have docid first
df = df[['docid', 'headline', 'body', 'website']]

In [None]:
df.info()

In [8]:
df.to_csv('../data/processed_articles.csv', index=False)

## Part 2

In [2]:
df = pd.read_csv("../data/processed_articles_dedup.csv")

In [3]:
df

Unnamed: 0,docid,headline,body,website
0,64471158,Relaxed Marie Kondo Now Says She Perfectly Hap...,LOS ANGELES-Admitting that she'd made some maj...,The Onion
1,84486776,U.S. Officials Call For Correct Amount Of Viol...,WASHINGTON-Addressing the need for swift and m...,The Onion
2,36589089,Kamala Harris Asks Communications Assistant If...,WASHINGTON-In an effort to expand her professi...,The Onion
3,15387752,25 Arrested In Fake Nursing School Diploma Scheme,Federal agents have arrested 25 suspects accus...,The Onion
4,41114025,World's Oldest American Dies At 72,"OXNARD, CA-Living longer than any citizen in t...",The Onion
...,...,...,...,...
35291,42409600,Feel Free To Keep Doing This After I Leave,"As my legendary tenure comes to an end, the ed...",The Every Three Weekly
35292,15458400,Life Hack: Join A Bunch Of Clubs Senior Year F...,If you're a regular reader of this paper or ev...,The Every Three Weekly
35293,54713667,Having An Accent Is The Only Reason That Stran...,"Alright, mate. Sorry, my bad, 'How's it going....",The Every Three Weekly
35294,86169291,I'm Still Using MGuest And I'm Not Ashamed To ...,Like I don't even understand why it's a big de...,The Every Three Weekly


In [4]:
# Set random seed for reproducibility
np.random.seed(42)

# Sample 60 documents randomly
sampled_df = df.sample(n=60)

In [5]:
# Set random seed for reproducibility
torch.manual_seed(42)

tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model = T5ForConditionalGeneration.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')

# Generate one query per document
queries = []
for doc in sampled_df.body:
    # Add comprehensive prefix prompt for better query generation
    prefixed_doc = f"""
    Given this satirical news article, generate a natural search query that someone might use to find this content.
    Make the query conversational and focused on the main topic or claim.
    Article: {doc}
    Query:
    """
    input_ids = tokenizer.encode(prefixed_doc, return_tensors='pt')
    outputs = model.generate(
        input_ids=input_ids,
        max_length=64,
        do_sample=True,
        top_p=0.95,
        num_return_sequences=1
    )
    query = tokenizer.decode(outputs[0], skip_special_tokens=True)
    queries.append(query)

# Add queries to dataframe
sampled_df['generated_query'] = queries

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Token indices sequence length is longer than the specified maximum sequence length for this model (765 > 512). Running this sequence through the model will result in indexing errors


In [6]:
sampled_df.head()

Unnamed: 0,docid,headline,body,website,generated_query
1590,91156407,Study Finds Orlando Most Vacant Major U.S. City,A new study has ranked Orlando as the number o...,The Onion,what is the vacancy rate for orlando florida
18507,61253138,"NBC, Netflix Cancel Bill Cosby's Return To TV ...",After decades-old allegations resurfaced and m...,The Onion,how old is bill cosby
35001,70835954,5 Mindfulness Exercises You Can Use To Remind ...,"When life is busy and hectic, it can be easy t...",Clickhole,what makes you better than a frog
24190,34374619,Poll: 81% Of Office Workers Would Rather Stay ...,"BATON ROUGE, LA-More than four out of five emp...",The Onion,what percent of people prefer to avoid richard...
8238,21193305,Catholic Church Not About To Be Out-Molested B...,VATICAN CITY-Telling the youth organization th...,The Onion,are scouts molested by priests


In [7]:
# Export the sampled dataframe with generated queries to CSV
sampled_df.to_csv('../data/sampled_articles_with_queries.csv', index=False)