In [1]:
# Fetch the project repository and make its `nemojte/` folder the working directory.
# Later cells rely on this: they `import Loader` (presumably a module in that folder —
# TODO confirm) and read data via paths relative to it ("../datasets/...").
# NOTE(review): not idempotent — after the first run the working directory has already
# changed, so re-running this cell clones a second copy inside `nemojte/`.
!git clone https://github.com/bpuvaca/irony-detection-tar2024.git
%cd irony-detection-tar2024/nemojte/

Cloning into 'irony-detection-tar2024'...
remote: Enumerating objects: 3849, done.[K
remote: Counting objects: 100% (896/896), done.[K
remote: Compressing objects: 100% (383/383), done.[K
remote: Total 3849 (delta 623), reused 770 (delta 506), pack-reused 2953 (from 1)[K
Receiving objects: 100% (3849/3849), 82.18 MiB | 13.30 MiB/s, done.
Resolving deltas: 100% (2898/2898), done.
Updating files: 100% (1380/1380), done.
/content/irony-detection-tar2024/nemojte


In [28]:
!pip install emoji
import Loader

#tweets, labels = Loader.parse_dataset(fp="../datasets/crossval/irony.csv", remove_hashtags=True, balance=False, dataset_type='train')
tweets, labels = Loader.parse_dataset(fp="../datasets/crossval/sarcasm.csv", remove_hashtags=True, balance=False, dataset_type='train')
#tweets, labels = Loader.parse_dataset(fp="../datasets/crossval/semeval_mix.csv", remove_hashtags=True, balance=False, dataset_type='train')
tweets, labels = tweets[:350], labels[:350]

Parsed dataset type train with 1786 tweets, 893 1s and 893 0s


In [4]:
import torch


In [6]:

# Report the GPU and library versions used for this run.
# Deliberately NOT wrapped in `%%capture`: that magic swallows all cell output, which
# would hide every print below.
if torch.cuda.is_available():
  # get_device_capability() returns the device's compute capability (major, minor),
  # which is a hardware property, not the CUDA toolkit version.
  major_version, minor_version = torch.cuda.get_device_capability()
  print(f"GPU compute capability: {major_version}.{minor_version}")
else:
  # Keep both names defined so later references do not raise NameError.
  major_version = minor_version = None
  print("No CUDA device available; the inference cell moves its inputs to \"cuda\" and will fail.")
print("CUDA version", torch.version.cuda)
print("torch version", torch.__version__)

In [None]:
!pip install unsloth
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install evaluate

In [None]:
# Load the language model and its tokenizer through Unsloth and switch the model to
# inference mode. `model` and `tokenizer` are used as globals by `generate_predictions`.
from unsloth import FastLanguageModel
max_seq_length = 2048 # Maximum sequence length passed to the loader (Unsloth template note: RoPE scaling is handled internally).
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# NOTE(review): the checkpoint name suggests a pre-quantized 4-bit Llama-3 8B model. No
# fine-tuning step is visible in this notebook, so it appears to be used as-is — confirm.
model, tokenizer = FastLanguageModel.from_pretrained(model_name = "unsloth/llama-3-8b-bnb-4bit", # checkpoint to load
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Switch to Unsloth's inference mode (described by the Unsloth template as "native 2x faster inference").

In [9]:
# Both task prompts share one skeleton and differ only in the noun/adjective naming the
# phenomenon. The doubled braces survive the first .format() as plain `{}` slots, which are
# later filled positionally with (definition, tweet, response).
_PROMPT_TEMPLATE = """
### Instruction:
Analyze the following tweet to determine if it contains {noun}. For this task, we define {noun} as {{}}. Respond with a one-word answer: "Yes" if the tweet is {adjective}, or "No" if it is not.

### Input:
{{}}

### Response:
{{}}
"""

sarcasm_prompt = _PROMPT_TEMPLATE.format(noun="sarcasm", adjective="sarcastic")
irony_prompt = _PROMPT_TEMPLATE.format(noun="irony", adjective="ironic")

# Candidate definitions of sarcasm to substitute into the first slot of the prompt.
sarcasm_definition_cambridge = (
  "the use of remarks that clearly mean the opposite of what they say, "
  "made in order to hurt someone's feelings or to criticize something in a humorous way"
)
sarcasm_definition_iSarcasm = (
  "a form of irony that occurs when there is some discrepancy between the literal and intended meanings of an utterance. "
  "This discrepancy is used to express dissociation towards a previous proposition, often in the form of contempt or derogation. "
  "Tweets that contain sarcasm are those that contradict the state of affairs and are critical towards an addressee."
)

# Candidate definitions of irony; the iSarcasm variant extends the Webster wording with a
# sentence about how ironic tweets relate to an addressee.
irony_definition_webster = "the use of words to express something other than and especially the opposite of the literal meaning"
irony_definition_iSarcasm = (
  irony_definition_webster
  + ". Tweets that contain irony are tweets that contradict the state of affairs but are not obviously critical towards an addressee"
)

In [23]:
def generate_predictions(tweets, prompt, definition, max_new_tokens=5, **generate_kwargs):
  """Query the language model once per tweet and collect the raw decoded outputs.

  Relies on the module-level `model` and `tokenizer` created by the model-loading cell.

  Args:
    tweets: iterable of tweet strings to classify.
    prompt: template with three positional `{}` slots, filled in order with the
      definition, the tweet, and an empty response.
    definition: text substituted into the first slot of `prompt`.
    max_new_tokens: upper bound on generated tokens per tweet. The default of 5 matches
      the one-word answer the prompts ask for.
    **generate_kwargs: extra keyword arguments forwarded unchanged to `model.generate`.

  Returns:
    A list with one decoded string per tweet, in input order. The downstream parsing cell
    splits each string on "### Response:", so the string is expected to contain the prompt
    text as well as the generated continuation.
  """
  # NOTE(review): whether decoding is greedy or sampled is decided by the model's
  # generation config, which is not visible here. If labels must be reproducible across
  # runs, consider passing do_sample=False — confirm against the loaded config.
  predictions = []
  for tweet in tweets:
    text = prompt.format(definition, tweet, "")
    inputs = tokenizer([text], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, **generate_kwargs)
    predictions.append(tokenizer.batch_decode(outputs)[0])
  return predictions


In [36]:
# Run the model over the loaded tweets; `predictions` holds the raw decoded strings here.
# NOTE(review): the data-loading cell reads the sarcasm dataset, while this call uses the
# irony prompt and irony definition. The output file name ("LLM_sarcasm_results_irony.csv")
# suggests this cross-task pairing is deliberate — confirm.
predictions = generate_predictions(tweets, irony_prompt, irony_definition_iSarcasm)

In [37]:
import re

def parse_prediction(raw):
  """Convert one raw model output into a label: 1 for "Yes", 0 for "No", None otherwise.

  Only the first non-empty line after the "### Response:" marker is inspected, and the
  answer must appear as a whole word. A plain substring test would treat "not", "know" or
  "nothing" as "No", and would count every other text (including an empty answer) as "Yes".

  Args:
    raw: decoded model output (str), or an already-parsed label (int or None).

  Returns:
    1, 0, or None when the marker is missing or no yes/no word is found. Values that are
    already labels are returned unchanged, so re-running this cell is harmless.
  """
  if raw is None or isinstance(raw, int):
    return raw
  _, marker, completion = raw.partition("### Response:")
  if not marker:
    return None
  completion = completion.strip()
  first_line = completion.split("\n")[0] if completion else ""
  # \b keeps trailing punctuation or special tokens ("No.", "Yes<|end_of_text|>") from
  # blocking the match while still rejecting words that merely contain "no".
  match = re.search(r"\b(yes|no)\b", first_line.lower())
  if match is None:
    return None
  return 1 if match.group(1) == "yes" else 0

new_predictions = [parse_prediction(prediction) for prediction in predictions]

# From here on `predictions` holds parsed labels (1 / 0 / None) instead of raw strings.
predictions = new_predictions

In [None]:
# Print each tweet with its gold label and the current prediction for manual inspection.
# zip() stops at the shortest of the three sequences, so a length mismatch is not reported.
for tweet, label, prediction in zip(tweets, labels, predictions):
  print(f"Tweet: {tweet}\nLabel: {label}\nPrediction: {prediction}\n")

In [38]:
import csv

def save_to_csv(tweets, labels, predictions, filename):
  """Write the evaluation results to `filename` as UTF-8 CSV.

  The file starts with the header row Tweet, Label, Prediction, followed by one row per
  (tweet, label, prediction) triple. The three sequences are paired positionally and
  pairing stops at the shortest one. An existing file is overwritten. The csv module
  writes a prediction of None as an empty field.
  """
  # newline='' hands line-ending control to the csv writer.
  with open(filename, 'w', newline='', encoding='utf-8') as out_file:
    out = csv.writer(out_file)
    out.writerow(['Tweet', 'Label', 'Prediction'])
    out.writerows(zip(tweets, labels, predictions))

# Write the results into the current working directory.
# NOTE(review): the file name presumably encodes "<dataset>_results_<prompt>" (sarcasm
# data, irony prompt). It is hard-coded, so it must be updated by hand whenever the dataset
# or the prompt changes, otherwise earlier results get overwritten — confirm the convention.
file_name = "LLM_sarcasm_results_irony.csv"
save_to_csv(tweets, labels, predictions, file_name)
