In [1]:
from ollama import chat, ChatResponse, generate
import pandas as pd
from tqdm import tqdm

In [2]:
# Paths to the two sarcasm-headline dataset files (read below as JSON Lines).
data_file1 = 'datasets/Sarcasm_Headlines_Dataset.json'
data_file2 = 'datasets/Sarcasm_Headlines_Dataset_v2.json'

df1 = pd.read_json(data_file1, lines=True)
df2 = pd.read_json(data_file2, lines=True)

# Stack both files. ignore_index=True gives every row a unique label; without
# it each frame contributes its own 0..n-1 labels, so a label-based lookup
# such as df.loc[0] would return more than one row.
df_all = pd.concat([df1, df2], ignore_index=True)
print(df_all.shape)

# The combined data contains many exact duplicates; drop rows that are
# identical in every column. Assigning the result (instead of inplace=True)
# keeps the un-deduplicated frame available as `df_all`.
df = df_all.drop_duplicates()
print(df.shape)

(55328, 3)
(28617, 3)


In [4]:
# Split the headlines by label; each result is a Series taken from the
# 'headline' column that keeps the row labels of `df`.
sarcastic_df = df.loc[df['is_sarcastic'].eq(1), 'headline']
non_sarcastic_df = df.loc[df['is_sarcastic'].eq(0), 'headline']

# Preview the first rows of each group (title, newline, then the preview).
print("Sarcastic Headlines:", sarcastic_df.head(), sep="\n")
print("\nNon-Sarcastic Headlines:", non_sarcastic_df.head(), sep="\n")

Sarcastic Headlines:
0    thirtysomething scientists unveil doomsday clo...
3    inclement weather prevents liar from getting t...
4    mother comes pretty close to using word 'strea...
7    richard branson's global-warming donation near...
8    shadow government getting too large to meet in...
Name: headline, dtype: object

Non-Sarcastic Headlines:
1    dem rep. totally nails why congress is falling...
2    eat your veggies: 9 deliciously different recipes
5                                 my white inheritance
6           5 ways to file your taxes with less stress
9                   lots of parents know this scenario
Name: headline, dtype: object


In [5]:
# Prompt template; the `{}` placeholder receives the headline to rewrite.
non_sarcastic_to_sarcastic_prompt = (
    "<START OF PROMPT> Convert the given non-sarcastic news headline to a sarcastic headline. "
    "Return only the converted headline. <END OF PROMPT> "
    "<START OF HEADLINE> \"{}\" <END OF HEADLINE>"
)

# Generated headlines, appended in the same order as `non_sarcastic_df` is iterated.
sarcastic_lines = []

# Requires the llama3.1 model to be installed through ollama: https://ollama.com/library/llama3.1
for headline in tqdm(non_sarcastic_df):
    filled_prompt = non_sarcastic_to_sarcastic_prompt.format(headline)
    reply = generate(model="llama3.1:latest", prompt=filled_prompt)
    # Appending one result at a time keeps partial output if the loop is interrupted.
    sarcastic_lines.append(reply.response)

100%|██████████| 14985/14985 [1:28:33<00:00,  2.82it/s]


In [7]:
# Pair each original headline with its generated counterpart. The list
# `sarcastic_lines` is matched to `non_sarcastic_df` by position, and the
# Series' index becomes the frame's index.
pairs = non_sarcastic_df.to_frame(name='non_sarcastic').assign(sarcastic=sarcastic_lines)
pairs.head()

Unnamed: 0,non_sarcastic,sarcastic
1,dem rep. totally nails why congress is falling...,"""Woohoo, Another Congressman Finally Gets It R..."
2,eat your veggies: 9 deliciously different recipes,"""Because You Clearly Hate Yourself, Here Are 9..."
5,my white inheritance,"""Oh Joy, Another Person Privileging Their Whit..."
6,5 ways to file your taxes with less stress,"""Just What You Needed: 5 More Ways to Waste Yo..."
9,lots of parents know this scenario,"""Oh Joy, Another Parent Who's Had Their Kid As..."


In [9]:
# Write the pairs to disk without the index column. The generated text is
# saved uncleaned; cleaning is handled in the main file rather than here.
pairs.to_csv(path_or_buf='datasets/sarcastic_headline_pairs.csv', index=False)

In [37]:
# Show the `pairs` frame as this cell's output.
# NOTE(review): the recorded rows differ from the earlier `pairs.head()` output,
# so `pairs` was presumably rebuilt or reloaded before this cell ran — confirm.
pairs

Unnamed: 0,non_sarcastic,sarcastic
0,former versace store clerk sues over secret 'b...,"""Just what we all wanted: another lawsuit abou..."
1,the 'roseanne' revival catches up to our thorn...,"""Because What We Really Needed Was Another Rem..."
4,j.k. rowling wishes snape happy birthday in th...,"""BREAKING: J.K. Rowling still obsessed with Sn..."
5,advancing the world's women,"""Wonderful, Another Thing for Men to Feel Guil..."
6,the fascinating case for eating lab-grown meat,"""Because nothing says 'fascinating' like synth..."
...,...,...
26704,american politics in moral free-fall,"""Just Another Day in Politics: Business as Usu..."
26705,america's best 20 hikes,"""Oh Joy, America's Best 20 Hikes Are So Origin..."
26706,reparations and obama,"""Because Nothing Says 'Unity' Like Asking Some..."
26707,israeli ban targeting boycott supporters raise...,"""Israel Just Can't Get Enough of Free Speech, ..."
