In [None]:
%%capture
!pip install stanza

In [None]:
from nltk.tokenize import TweetTokenizer
import pandas as pd
import stanza
import re

# Load the cleaned dataset; later cells read its `text` and `is_sarcastic` columns.
data = pd.read_csv("dataset_cleaned.csv")

In [None]:
# Class balance: number of rows per `is_sarcastic` value.
data['is_sarcastic'].value_counts()

In [None]:
# Display the loaded frame as a visual sanity check.
data

In [None]:
# NLTK tweet tokenizer with default settings; applied to every cleaned text below.
tk = TweetTokenizer()
# Stanza pipeline for Ukrainian with its default processors; `lemmatize` reads
# `word.lemma` from its output. verbose=False keeps Stanza's loading logs out of the cell.
uk_nlp = stanza.Pipeline(lang='uk', verbose=False)

def substitute_user_mentions_and_links(text):
    """Strip @-mentions, http(s) links and Latin letters, then lowercase.

    Despite the name, every match is deleted (replaced by an empty string)
    rather than substituted with a placeholder. Whitespace around a removed
    match is left untouched, so the result may contain runs of spaces.

    Args:
        text: Raw message string.

    Returns:
        The lowercased string without mentions, links or ASCII letters.
    """
    removal_steps = (
        # User mentions: '@' followed by word characters.
        r'@\w+',
        # Links starting with http:// or https://.
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # Whatever runs of ASCII letters are still left.
        r'[a-zA-Z]+',
    )

    # Order matters: links must be removed before the Latin-letter pass,
    # otherwise the "http" prefix would already be gone and links would
    # no longer be recognised.
    for regex in removal_steps:
        text = re.sub(regex, '', text)

    return text.lower()

def remove_some_punc_numbers(text):
    """Join tokens into one string and drop selected punctuation and digits.

    Args:
        text: Iterable of string tokens (the tokenizer output in this notebook).

    Returns:
        The tokens joined with single spaces, with every character of the
        removal class deleted, lowercased. Characters outside the class
        (e.g. '!', '?', the apostrophe, parentheses) are kept, and the spaces
        that surrounded a removed token stay in place.
    """
    joined = ' '.join(text)
    # Selected ASCII punctuation, any digit, and the en dash.
    unwanted = re.compile(r'[\#\$\%\&\*\+\,\-\/\:\;\<\=\>\@\[\\\]\^\_\{\|\}\~\d\.\–]')
    return unwanted.sub('', joined).lower()

# Two runs of word characters separated by an ASCII apostrophe, with optional
# whitespace on either side of it. Only the ASCII apostrophe (') is matched.
pattern = r'\b(\w+)\s*\'\s*(\w+)\b'


def join_words(match):
    """Replacement callback for `pattern`: glue the two captured words together.

    Args:
        match: Regex match object with at least two capturing groups.

    Returns:
        Group 1 followed directly by group 2, i.e. the matched text without
        the apostrophe and without the whitespace around it.
    """
    left, right = match.group(1, 2)
    return left + right

def lemmatize(text):
    """Return the lemma of every word in `text`, in document order.

    Runs the module-level `uk_nlp` pipeline on the text and flattens the
    per-sentence word lists into one list.

    Args:
        text: String to analyse.

    Returns:
        list: `word.lemma` for each word of each sentence.
    """
    doc = uk_nlp(text)
    return [word.lemma for sent in doc.sentences for word in sent.words]

In [None]:
# Preprocessing chain for the real dataset. Each keyword below adds one column,
# and later keywords see the columns created by earlier ones:
#   text_mod          - mentions, links and Latin letters removed, lowercased
#   tokenized         - token list from the tweet tokenizer
#   tokenized_cleaned - tokens re-joined, punctuation/digits dropped,
#                       apostrophe-split words merged, whitespace collapsed
#   lemmatized        - list of lemmas from the Stanza pipeline
data = data.assign(
    text_mod=lambda df: df['text'].apply(substitute_user_mentions_and_links),
    tokenized=lambda df: df['text_mod'].apply(tk.tokenize),
    tokenized_cleaned=lambda df: (
        df['tokenized']
        .apply(remove_some_punc_numbers)
        .str.replace(pattern, join_words, regex=True)
        .str.replace(r'\s+', ' ', regex=True)
    ),
    lemmatized=lambda df: df['tokenized_cleaned'].apply(lemmatize),
)

In [None]:
# Persist the preprocessed frame. With the default to_csv settings the index is
# written as the first column.
# NOTE(review): `tokenized` and `lemmatized` hold Python lists, so they are stored
# as their string representation and must be parsed when the file is read back.
data.to_csv("dataset_ready_for_models.csv")

# Building the synthetic dataset: OpenAI-generated sarcastic texts + the remaining non-sarcastic messages

In [None]:
import pandas as pd


# Load the combined synthetic texts; the `llm` column is used below to select
# the rows attributed to one generator.
synth_sarcastic = pd.read_csv("synthetic_data_combined.csv")

In [None]:
# Keep only the rows whose `llm` value is 'openai'. `.copy()` makes the result an
# independent frame, so columns can be added to it later without pandas'
# SettingWithCopyWarning (which fires when writing to a filtered slice).
# NOTE: the name `openai` would shadow the `openai` package if it were ever imported here.
openai = synth_sarcastic[synth_sarcastic['llm'] == 'openai'].copy()

In [None]:
# Display the OpenAI subset as a visual sanity check.
openai

In [None]:
# Same preprocessing chain as for the real dataset, applied to the OpenAI subset.
# The columns are created with `assign`, which returns a new frame, instead of
# item assignment: `openai` comes from a boolean filter, and writing columns into
# such a frame in place raises pandas' SettingWithCopyWarning.
# Each keyword adds one column; later keywords see the columns created earlier.
openai = openai.assign(
    # Mentions, links and Latin letters removed, lowercased.
    text_mod=lambda df: df['text'].apply(substitute_user_mentions_and_links),
    # Token list from the tweet tokenizer.
    tokenized=lambda df: df['text_mod'].apply(tk.tokenize),
    # Tokens re-joined, punctuation/digits dropped, apostrophe-split words
    # merged, whitespace runs collapsed to one space.
    tokenized_cleaned=lambda df: (
        df['tokenized']
        .apply(remove_some_punc_numbers)
        .str.replace(pattern, join_words, regex=True)
        .str.replace(r'\s+', ' ', regex=True)
    ),
    # List of lemmas from the Stanza pipeline.
    lemmatized=lambda df: df['tokenized_cleaned'].apply(lemmatize),
)

In [None]:
# (rows, columns) of the OpenAI subset at this point.
openai.shape

In [None]:
# Load the pool of non-sarcastic messages (Telegram, judging by the file name);
# the text lives in the `Message` column, which is renamed to `text` below.
real_not_sarc = pd.read_csv("telegram_not_sarcastic_sample_left.csv")

In [None]:
import random

# Draw a reproducible sample of non-sarcastic messages, rename `Message` to `text`
# so the column matches the other frames, and run the same preprocessing chain.
# NOTE(review): 2554 is presumably chosen to match the number of OpenAI rows - confirm.
# `sample` raises if the source frame has fewer rows than requested.
real_not_sarc_sample = (
    real_not_sarc
    .sample(random_state=42, n=2554)
    .rename(columns={'Message': 'text'})
    .assign(
        # Each keyword adds one column; later ones see the earlier ones.
        text_mod=lambda df: df['text'].apply(substitute_user_mentions_and_links),
        tokenized=lambda df: df['text_mod'].apply(tk.tokenize),
        tokenized_cleaned=lambda df: (
            df['tokenized']
            .apply(remove_some_punc_numbers)
            .str.replace(pattern, join_words, regex=True)
            .str.replace(r'\s+', ' ', regex=True)
        ),
        lemmatized=lambda df: df['tokenized_cleaned'].apply(lemmatize),
    )
)

In [None]:
# Keep only the shared preprocessing columns and label every OpenAI row as sarcastic.
# `assign` returns a new frame; setting the column by item assignment on the
# column subset would raise pandas' SettingWithCopyWarning.
openai = openai[['text', 'text_mod', 'tokenized', 'tokenized_cleaned', 'lemmatized']].assign(is_sarcastic=1)

In [None]:
# Keep only the shared preprocessing columns and label every sampled real message
# as not sarcastic. `assign` returns a new frame; setting the column by item
# assignment on the column subset would raise pandas' SettingWithCopyWarning.
real_not_sarc_sample = real_not_sarc_sample[
    ['text', 'text_mod', 'tokenized', 'tokenized_cleaned', 'lemmatized']
].assign(is_sarcastic=0)

In [None]:
# Stack the non-sarcastic sample on top of the OpenAI sarcastic rows and save the result.
# The original row labels of both frames are kept (no ignore_index), so the index
# column written to the CSV may contain repeated values.
pd.concat([real_not_sarc_sample, openai]).to_csv("synth_openai_sarc_and_not.csv")