In [None]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20

In [None]:
import re
from pathlib import Path

import pandas as pd
import torch
import transformers

In [None]:
# Checkpoint used for every generation call in this notebook.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Build the text-generation pipeline once; the later cells call `pipeline`
# directly. bfloat16 is requested through model_kwargs.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

# One hand-written review used to sanity-check the prompt before scoring the dataset.
sample_review = "The decor is stunning; too bad the food doesn't match the aesthetic."

# The system prompt embeds the review through the {REVIEW_TEXT} placeholder. The
# placeholder must be substituted before the call; otherwise the model is shown
# the literal text "{REVIEW_TEXT}" as the review to analyse.
messages = [
    {"role": "system", "content": """
    You are a sentiment analysis expert specializing in detecting sarcasm and irony in text. Your goal is to analyze the following restaurant review and determine if it contains sarcasm or irony.
    **Review:**
    "{REVIEW_TEXT}"

    **Task:**
    Provide a probability score (0-100%) for how likely you believe the review is sarcastic or ironic. Your response should be in the following format:
      - **Probability**: [0-100%]
    Please follow this format in your response:
    ---
    **Probability**: [Your answer here]
    ---
    """.replace("{REVIEW_TEXT}", sample_review)},
    # The user turn carries the bare review text (no surrounding braces), matching
    # the per-review messages built in the scoring loop.
    {"role": "user", "content": sample_review},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
# Prints the last entry of generated_text as-is (the whole message dict).
print(outputs[0]["generated_text"][-1])

In [None]:
# Read the review file; each line, with surrounding whitespace stripped,
# becomes one entry of `lines`.
with open("../dataset/yelp_sarcasm_review_500.txt", 'r', encoding='utf-8') as review_file:
    lines = [raw_line.strip() for raw_line in review_file]

# Single-column frame; later cells iterate over df['Review'].
df = pd.DataFrame(lines, columns=['Review'])

In [None]:
# One hand-written review used to check the 0-1 probability prompt before the full run.
sample_review = "The decor is stunning; too bad the food doesn't match the aesthetic."

# The {REVIEW_TEXT} placeholder in the system prompt must be filled in before the
# call; otherwise the model is shown the literal placeholder as the review.
messages = [
    {"role": "system", "content": """
    You are a sentiment analysis expert specializing in detecting sarcasm in text. Your goal is to analyze the following restaurant review and determine if it contains sarcasm.
    **Review:**
    "{REVIEW_TEXT}"

    **Task:**
    Provide a probability score (0-1) for how likely you believe the review is sarcastic. Your response should be in the following format, no explaination needed:
      - **Probability**: [0-1]
    Please follow this format in your response:
    Probability: [Your answer here]
    """.replace("{REVIEW_TEXT}", sample_review)},
    # The user turn carries the bare review text (no surrounding braces), matching
    # the per-review messages built in the scoring loop.
    {"role": "user", "content": sample_review},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
# Only the text of the last generated message is printed.
print(outputs[0]["generated_text"][-1]['content'])

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Probability: 0.8


In [None]:
# System prompt template; {REVIEW_TEXT} is replaced with each review before the call.
# Defined once because the text is identical for every review.
SYSTEM_PROMPT_TEMPLATE = """
      You are a sentiment analysis expert specializing in detecting sarcasm in text. Your goal is to analyze the following restaurant review and determine if it contains sarcasm.
      **Review:**
      "{REVIEW_TEXT}"

      **Task:**
      Provide a probability score (0-1) for how likely you believe the review is sarcastic. Your response should be in the following format, no explaination needed:
        - **Probability**: [0-1]
      Please follow this format in your response:
      Probability: [Your answer here]
      """

# Accepts "Probability: 0.8" as well as the markdown / bracketed variants that the
# prompt itself shows ("**Probability**: 0.8", "Probability: [0.8]"). The number
# group must end in a digit and allows at most one dot, so a trailing sentence
# period is never captured and float() cannot fail on the captured text.
PROBABILITY_PATTERN = re.compile(
    r'Probability\**\s*:\s*[\*\[]*\s*([0-9]*\.?[0-9]+)',
    re.IGNORECASE,
)


def parse_probability(response):
    """Extract the sarcasm probability from a model reply.

    Args:
        response: Text of the assistant message.

    Returns:
        The first number following a "Probability:" label as a float, or
        None when the reply contains no such label/number.
    """
    match = PROBABILITY_PATTERN.search(response)
    if match is None:
        return None
    return float(match.group(1))


probabilities = []

for review in df['Review']:
    # The review is sent both inside the system prompt and as the user turn.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE.replace("{REVIEW_TEXT}", review)},
        {"role": "user", "content": review},
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=256,
    )
    # The last entry of generated_text is the newly generated message.
    response = outputs[0]["generated_text"][-1]['content']
    probabilities.append(parse_probability(response))

# One value per row of df, in the same order; None where parsing failed.
df['Probability'] = probabilities

In [None]:
# Create the output directory first: to_csv raises if the parent directory is
# missing, which would discard the results of the long scoring run above.
output_path = Path("output/review_500_huggingface.txt")
output_path.parent.mkdir(parents=True, exist_ok=True)

# CSV-formatted content without the index column (the file keeps its .txt name).
df.to_csv(output_path, index=False)