# Data Processing

Here the idea is to map jokes in the dataset to the (topic, joke) pairs.
Also we clean out the worst jokes to increase quality of data a bit.

In [1]:
import pandas as pd

file_path = 'raw/shortjokes.csv'
jokes_df = pd.read_csv(file_path, sep=',', header=0)
print(jokes_df.shape)
jokes_df.head()

(231657, 2)


Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [2]:
# Make header names lowercase
jokes_df.columns = jokes_df.columns.str.lower()

In [3]:
# American culture (then are not relevant to the global audience)
american_culture = ['Trump', 'Biden', 'Ted Cruz', 'Clinton', 'US', 'Republican', "Democrat", "government", "Hillary", "Tupac", "America", "Obama", "Thanksgiving", "Stevie Wonder"]
print(f"American culture filtered out: {jokes_df['joke'].str.count('|'.join(american_culture)).sum()}")
jokes_df = jokes_df[~jokes_df['joke'].str.contains('|'.join(american_culture))]

# Links to images/videos
print(f"Links filtered out: {jokes_df['joke'].str.contains('http').sum()}")
jokes_df = jokes_df[~jokes_df['joke'].str.contains('http')]

# Very dark jokes (out editorial decision)
dark_jokes = ['rape', 'suicide']
print(f"Dark jokes filtered out: {jokes_df['joke'].str.count('|'.join(dark_jokes)).sum()}")
jokes_df = jokes_df[~jokes_df['joke'].str.contains('|'.join(dark_jokes))]

# Reddit-related jokes (not funny - out editorial decision)
reddit_jokes = ['Reddit', 'r/', "reddit"]
print(f"Reddit jokes filtered out: {jokes_df['joke'].str.count('|'.join(reddit_jokes)).sum()}")
jokes_df = jokes_df[~jokes_df['joke'].str.contains('|'.join(reddit_jokes))]

American culture filtered out: 8360
Links filtered out: 614
Dark jokes filtered out: 1285
Reddit jokes filtered out: 2394


In [4]:
# Removing ";" symbols from jokes
jokes_df['joke'] = jokes_df['joke'].str.replace(';', '')

In [5]:
import os
from together import Together
from dotenv import load_dotenv

load_dotenv()
client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))

system_prompt = {
    "role": "system",
    "content": """You are a system preprocessing dataset of jokes in batches. You are given a batch of jokes.
    For each given joke:
    1. Extract its topic (that would fit a form \"a joke about <topic>\"). Prefer a single word to describe the topic.
    2. Classify the joke as containing the 'qhtryeihsdpl' tag. The joke contains 'qhtryeihsdpl' if it contains ingrained prejudice against women, or implies men being superior to women or, describes women in a purely sexual objective way. Notice that not all jokes that mention women are 'qhtryeihsdpl'.
    
    Output only the csv with 3 columns:
    <number>;<joke-topic>;<1 if qhtryeihsdpl, 0 otherwise>
    ...
    """
}

system_prompt["content"] = system_prompt["content"].replace("\t", "")

def process_batch(batch):
    content = "\n".join([f"{i+1};{joke}" for i, joke in enumerate(batch)])
    
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[
            system_prompt,
            {
                "role": "user",
                "content": content,
            }
        ],
        max_tokens=256,
        temperature=0.4,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
        stop=["<|eot_id|>","<|eom_id|>"],
        truncate=32256,
        stream=False
    )
    
    # print(content)
    # print(response.choices[0].message.content)
    return response.choices[0].message.content

print(process_batch(jokes_df.head(10)['joke'].tolist()))

1;documentary;0
2;parenting;1
3;work;1
4;dinner;0
5;bar;0
6;Barbie;1
7;music;0
8;lottery;0
9;dating;1
10;gentleman;1


In [6]:
# Running the processing in groups of batches

batch_size = 15 # of jokes
group_size = 250 # of batches

print(f"Dataset size: {jokes_df.shape[0]}")
print(f"So {jokes_df.shape[0] // (batch_size * group_size)} groups are available")

Dataset size: 221097
So 58 groups are available


In [7]:
target_groups = range(16, 18) # for out-of-order processing

output_folder = 'processed'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for group in target_groups:
    output_file = os.path.join(output_folder, f"processed_{group}.csv")

    if os.path.exists(output_file):
        print(f"Skipping group {group} because the file already exists")
        continue

    with open(output_file, 'a') as f:
        f.write('"Joke";"Joke topic"\n')

    batch_count = 0
    from_index = group * group_size * batch_size
    to_index = min(from_index + group_size * batch_size, jokes_df.shape[0])
    
    print(f"Processing group {group} from joke {from_index} to joke {to_index - 1}:")

    for batch_start in range(from_index, to_index, batch_size):
        batch = jokes_df['joke'][batch_start:min(batch_start+batch_size, to_index)].tolist()

        processed_batch = process_batch(batch)
        processed_batch = processed_batch.split('\n')
        processed_batch = [line for line in processed_batch if line.strip()]
        processed_batch = processed_batch[:len(batch)]

        jokes_to_write = []

        for joke_id, joke in enumerate(processed_batch):
            cols = joke.split(';')

            # error validation
            if len(cols) != 3: continue

            # skipping flagged jokes
            if cols[-1].strip() != '0': continue

            jokes_to_write.append((batch[joke_id], cols[1]))

        with open(output_file, 'a') as f:
            for joke, topic in jokes_to_write:
                f.write(f"{joke};{topic}\n")
        
        print(f"Processed batch {batch_count + 1} / {group_size}")
        batch_count += 1

    print(f"Finished processing group {group}")
    print()

Processing group 16 from joke 60000 to joke 63749:
Processed batch 1 / 250
Processed batch 2 / 250
Processed batch 3 / 250
Processed batch 4 / 250
Processed batch 5 / 250
Processed batch 6 / 250
Processed batch 7 / 250
Processed batch 8 / 250
Processed batch 9 / 250
Processed batch 10 / 250
Processed batch 11 / 250
Processed batch 12 / 250
Processed batch 13 / 250
Processed batch 14 / 250
Processed batch 15 / 250
Processed batch 16 / 250
Processed batch 17 / 250
Processed batch 18 / 250
Processed batch 19 / 250
Processed batch 20 / 250
Processed batch 21 / 250
Processed batch 22 / 250
Processed batch 23 / 250
Processed batch 24 / 250
Processed batch 25 / 250
Processed batch 26 / 250
Processed batch 27 / 250
Processed batch 28 / 250
Processed batch 29 / 250
Processed batch 30 / 250
Processed batch 31 / 250
Processed batch 32 / 250
Processed batch 33 / 250
Processed batch 34 / 250
Processed batch 35 / 250
Processed batch 36 / 250
Processed batch 37 / 250
Processed batch 38 / 250
Processe

In [8]:
import glob

# Now stitching all processed files into one
processed_files = glob.glob(os.path.join(output_folder, 'processed_*.csv'))
processed_files.sort()

# Stitching all files into one
with open('processed/all_processed.csv', 'w') as f:
    f.write('"Joke";"Joke topic"\n')
    
    for file in processed_files:
        with open(file, 'r') as infile:
            infile.readline()  # Skip the header line
            f.write(infile.read())
            
print("Done!")

Done!
