# Concatenating synthetic data

In [None]:
import json
import pandas as pd
import re

In [None]:
# Input files, one JSON document per (LLM, prompting strategy) combination.
filenames = ["/content/gemini_sarcastic_fewshot.txt", "/content/gemini_sarcastic_oneshot.txt", "/content/gemini_simple_prompt.txt",
             "/content/openapi_sarcastic_fewshot.txt", "/content/openapi_sarcastic_oneshot.txt", "/content/openapi_simple_prompt.txt"]

# [llm, prompting] labels, positionally paired with `filenames` above.
column_names = [["gemini", "fewshot"],["gemini", "oneshot"], ["gemini", "zeroshot"],["openai", "fewshot"],["openai", "oneshot"],["openai", "zeroshot"]]

all_data = []

for filename, columns in zip(filenames, column_names):
    # Decode explicitly as UTF-8: the files contain Ukrainian text, and without
    # an encoding argument open() falls back to the platform's default encoding.
    with open(filename, encoding="utf-8") as file:
        data = file.read()

    parsed_data = json.loads(data)
    # Each file is a JSON object; only its values (iterables of texts) are used.
    synthetic = [text for value in parsed_data.values() for text in value]

    # One record per generated text, tagged with the labels of its source file.
    for text in synthetic:
        text_dict = {'text': text, 'llm': columns[0], 'prompting': columns[1]}
        all_data.append(text_dict)

synth_sarc = pd.DataFrame(all_data)

In [None]:
# Preview the combined synthetic dataset built in the previous cell.
synth_sarc

Unnamed: 0,text,llm,prompting
0,"Ой, так, тому що російські війська абсолютно в...",gemini,fewshot
1,"Я впевнений, що анексія української території ...",gemini,fewshot
2,Російська економіка процвітає під санкціями. Н...,gemini,fewshot
3,"О, так, \'спеціальна військова операція\' йде ...",gemini,fewshot
4,Російські ЗМІ такі ж об'єктивні та неупереджен...,gemini,fewshot
...,...,...,...
5488,"О, нові гучні роботи по будинку поруч з моїм б...",openai,zeroshot
5489,"Хто потребує свіжого повітря, коли можеш весь ...",openai,zeroshot
5490,"Ага, кому потрібні електронні книги, коли у те...",openai,zeroshot
5491,"Прекрасно, що ти розповідаєш всі сюжетні повор...",openai,zeroshot


In [None]:
# Tally how many texts each (llm, prompting) combination contributed.
group_keys = ['llm', 'prompting']
grouped_data = synth_sarc.groupby(by=group_keys)
grouped_counts = grouped_data.size()
print(grouped_counts)

llm     prompting
gemini  fewshot       911
        oneshot      1011
        zeroshot     1017
openai  fewshot       767
        oneshot       928
        zeroshot      859
dtype: int64


In [None]:
# Persist the combined synthetic dataset next to the notebook.
synth_sarc.to_csv(path_or_buf="synthetic_data_combined.csv")

# Tokenizing real and synthetic sarcastic data for sampling

In [None]:
!pip install tokenize_uk

In [None]:
import pandas as pd
# Reload the combined synthetic dataset written earlier in this notebook.
# NOTE(review): that file was saved with to_csv() defaults, so this frame
# presumably carries the old index as an extra unnamed column - confirm.
synth_sarc = pd.read_csv(filepath_or_buffer="synthetic_data_combined.csv")

In [None]:
import tokenize_uk
import re

In [None]:
real_full = pd.read_csv("/content/dataset_cleaned.csv")
# Keep only the sarcastic rows. The explicit .copy() makes real_sarc an
# independent frame, so the columns assigned to it later in the notebook are
# not written onto a slice of real_full (which raises SettingWithCopyWarning).
real_sarc = real_full[real_full['is_sarcastic'] == True].copy()

In [None]:
def filter_words(text):
    """Tokenize ``text`` and keep only the tokens accepted by the word pattern.

    The pattern requires a token to start with a Cyrillic letter from its
    character class; apostrophe variants are allowed after the first character.

    Returns:
        A ``(tokens, count)`` tuple: the list of kept tokens and its length.
    """
    word_pattern = r"^[А-ЩЬЮЯҐЄІЇа-щьюяґєії][А-ЩЬЮЯҐЄІЇа-щьюяґєії’ʼ']*?[А-ЩЬЮЯҐЄІЇа-щьюяґєії]?$"
    kept = []
    for token in tokenize_uk.tokenize_words(text):
        if re.match(word_pattern, token):
            kept.append(token)
    return kept, len(kept)

def substitute_user_mentions_and_links(text):
    """Remove @-mentions, http(s) links and Latin-letter runs, then lowercase.

    The removals run in that order, each replacing its matches with an empty
    string; surrounding whitespace is left untouched.

    Returns:
        The cleaned, lower-cased string.
    """
    removal_patterns = (
        # user mentions
        r'@\w+',
        # links
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # any remaining runs of Latin letters
        r'[a-zA-Z]+',
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

In [None]:
# Add the cleaned text, its token list and its word count to both datasets
# (real first, then synthetic).
for frame in (real_sarc, synth_sarc):
    frame['text_mod'] = frame['text'].apply(substitute_user_mentions_and_links)
    # filter_words returns (tokens, count); wrapping the pair in a Series lets
    # apply() spread it over the two target columns.
    frame[['tokens', 'num_words']] = frame['text_mod'].apply(
        lambda cleaned: pd.Series(filter_words(cleaned))
    )

In [None]:
# Rows whose text repeats an earlier row (the first occurrence is not flagged).
is_repeat = synth_sarc.duplicated(subset='text', keep='first')
duplicates = synth_sarc[is_repeat]

# Same dataset with only the first occurrence of each text retained.
synth_sarc_unique = synth_sarc.drop_duplicates(subset='text', keep='first')

# Row counts: all rows, unique rows, removed repeats.
for row_total in (len(synth_sarc), len(synth_sarc_unique), len(duplicates)):
    print(row_total)

5493
5468
25


In [None]:
# Word totals per (llm, prompting) group, before and after de-duplication.
for frame in (synth_sarc, synth_sarc_unique):
    print(frame.groupby(['llm', 'prompting'])['num_words'].sum())

llm     prompting
gemini  fewshot      11119
        oneshot      12846
        zeroshot     12903
openai  fewshot      10770
        oneshot      12290
        zeroshot     11434
Name: num_words, dtype: int64
llm     prompting
gemini  fewshot      11108
        oneshot      12733
        zeroshot     12785
openai  fewshot      10770
        oneshot      12290
        zeroshot     11434
Name: num_words, dtype: int64


In [None]:
# Number of de-duplicated rows per (llm, prompting) group.
grouped_synth_sarc_unique = synth_sarc_unique.groupby(by=['llm', 'prompting']).size()
print(
    "\nNumber of rows per indicated columns in synth_sarc_unique:",
    grouped_synth_sarc_unique,
    sep="\n",
)


Number of rows per indicated columns in synth_sarc_unique:
llm     prompting
gemini  fewshot       910
        oneshot      1000
        zeroshot     1004
openai  fewshot       767
        oneshot       928
        zeroshot      859
dtype: int64


In [None]:
# Word totals for the de-duplicated data: overall, then per LLM.
unique_word_counts = synth_sarc_unique['num_words']
print(unique_word_counts.sum())
for model_name in ('openai', 'gemini'):
    print(unique_word_counts[synth_sarc_unique['llm'] == model_name].sum())

71120
34494
36626


In [None]:
# Total word count over the real sarcastic texts.
real_word_total = real_sarc['num_words'].sum()
print(real_word_total)

In [None]:
import random
random.seed(28)  # fixes the shuffle order used by the sampling below

# One subset of the de-duplicated synthetic data per LLM.
is_gemini = synth_sarc_unique['llm'] == 'gemini'
is_openai = synth_sarc_unique['llm'] == 'openai'
gemini_subset = synth_sarc_unique[is_gemini]
openai_subset = synth_sarc_unique[is_openai]

# Per-LLM containers: prompting strategy -> list of sampled rows.
strategies = ('zeroshot', 'oneshot', 'fewshot')
gemini_sampled = {strategy: [] for strategy in strategies}
openai_sampled = {strategy: [] for strategy in strategies}

def sample_rows(subset, sampled_dict, word_limit=10001):
    """Randomly fill each prompting strategy's word budget with rows of ``subset``.

    Rows are visited in a random order (module-level ``random`` state, so seed
    it beforehand for reproducibility). A row is appended to
    ``sampled_dict[<its prompting value>]`` when doing so keeps that strategy's
    running ``num_words`` total at or below ``word_limit``.

    Args:
        subset: DataFrame with ``prompting`` and ``num_words`` columns; its
            index labels are used to look rows up, one at a time.
        sampled_dict: dict with ``'zeroshot'``, ``'oneshot'`` and ``'fewshot'``
            keys mapping to lists; mutated in place.
        word_limit: maximum total ``num_words`` per strategy. Defaults to the
            previously hard-coded 10001.

    Returns:
        dict mapping each strategy to the number of words sampled for it.
    """
    word_counts = {'zeroshot': 0, 'oneshot': 0, 'fewshot': 0}
    rows = subset.index.tolist()  # Get indices of rows in the subset
    random.shuffle(rows)  # Shuffle the indices to perform random sampling
    for index in rows:
        row = subset.loc[index]  # Retrieve the row using the index
        prompting_strategy = row['prompting']
        if word_counts[prompting_strategy] + row['num_words'] <= word_limit:
            sampled_dict[prompting_strategy].append(row)
            word_counts[prompting_strategy] += row['num_words']
        # Stop once every budget is exactly full. The check uses the same limit
        # as the admission test above (it used to compare against 10000 while
        # admitting up to 10001), so it fires only when no strategy has
        # headroom left for a non-empty row.
        if all(count == word_limit for count in word_counts.values()):
            break
    return word_counts

# Fill the per-strategy lists: gemini first, then openai (the order matters
# because both draws consume the same seeded random state).
for llm_subset, llm_sampled in ((gemini_subset, gemini_sampled), (openai_subset, openai_sampled)):
    sample_rows(llm_subset, llm_sampled)

# Stack the per-strategy row lists into one frame per LLM, with a fresh index.
gemini_final_sample = pd.concat(
    [pd.DataFrame(strategy_rows) for strategy_rows in gemini_sampled.values()]
).reset_index(drop=True)
openai_final_sample = pd.concat(
    [pd.DataFrame(strategy_rows) for strategy_rows in openai_sampled.values()]
).reset_index(drop=True)

# Report the sampled word totals: per strategy, then overall, for each LLM.
for final_sample in (gemini_final_sample, openai_final_sample):
    print(final_sample.groupby('prompting')['num_words'].sum())
    print(final_sample['num_words'].sum())

prompting
fewshot     10000
oneshot     10001
zeroshot    10001
Name: num_words, dtype: int64
30002
prompting
fewshot     10000
oneshot     10001
zeroshot    10001
Name: num_words, dtype: int64
30002


In [None]:
import random

def sample_rows_real_sarc(subset, total_word_limit, seed=None):
    """Randomly sample rows of ``subset`` up to a word budget per source.

    Args:
        subset: DataFrame with ``source`` and ``num_words`` columns; the
            lower-cased ``source`` must be ``'telegram'`` or ``'twitter'``.
        total_word_limit: maximum total ``num_words`` allowed for each source.
        seed: when not ``None``, reseeds the module-level ``random`` generator
            before shuffling.

    Returns:
        DataFrame built from the sampled rows, in the order they were picked.
    """
    if seed is not None:
        random.seed(seed)

    budget_used = {'telegram': 0, 'twitter': 0}
    shuffled_labels = subset.index.tolist()
    random.shuffle(shuffled_labels)

    picked = []
    for label in shuffled_labels:
        candidate = subset.loc[label]
        origin = candidate['source'].lower()
        words = candidate['num_words']
        # Accept the row only if its source stays within budget.
        if budget_used[origin] + words <= total_word_limit:
            picked.append(candidate)
            budget_used[origin] += words
        # Every source budget is exactly full: nothing more to gain.
        if all(used == total_word_limit for used in budget_used.values()):
            break
    return pd.DataFrame(picked)


# Draw the real-data sample: up to 15001 words per source, reproducible seed.
combined_sample = sample_rows_real_sarc(real_sarc, 15001, seed=2)

# Report the sample size and word totals (overall, then per source).
print("Combined Sample Size:", len(combined_sample))
sampled_word_counts = combined_sample['num_words']
print(sampled_word_counts.sum())
print(combined_sample.groupby('source')['num_words'].sum())

Combined Sample Size: 2158
30002
source
Telegram    15001
Twitter     15001
Name: num_words, dtype: int64


In [None]:
# Both synthetic samples get the constant label is_sarcastic = 1.
for synthetic_sample in (openai_final_sample, gemini_final_sample):
    synthetic_sample['is_sarcastic'] = 1

In [None]:
# Write each final sample to its own CSV file.
for sample, destination in (
    (gemini_final_sample, "gemini_sample.csv"),
    (openai_final_sample, "openai_sample.csv"),
    (combined_sample, "real_sarc_sample.csv"),
):
    sample.to_csv(destination)