In [1]:
 # @title
!pip install datasets
!pip install evaluate
!pip install transformers[torch]



In [2]:
from transformers import BertTokenizer, TFBertModel
from google.colab import drive
from datasets import load_from_disk

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
import pandas as pd
import evaluate

from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

In [3]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [19]:
# Model with intermediate task
# model = AutoModelForSequenceClassification.from_pretrained('/content/gdrive/MyDrive/CSCI567 Project/sarcasm_detection_model', num_labels = 2)

# Model without intermediate task
model = AutoModelForSequenceClassification.from_pretrained('/content/gdrive/MyDrive/Intermediate Model/no_intermediate', num_labels = 2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pipe = pipeline('text-classification', model = model, tokenizer = tokenizer)

In [21]:
test_dataset_gpt = pd.read_csv('/content/gdrive/MyDrive/CSCI567 Project/gpt-2_chunk_0_generations.csv')
test_dataset_mistral = pd.read_csv('/content/gdrive/MyDrive/CSCI567 Project/sarcasm_generator.csv')

In [22]:
gpt_list = test_dataset_gpt['gpt2_generated_text'].dropna().tolist()
print(len(gpt_list))
gpt_list = list(map(str, gpt_list))

mistral_list = test_dataset_mistral['Generated_Responses'].tolist()
mistral_list = list(map(str, mistral_list))
print(len(mistral_list))

8397
10001


In [23]:
print(gpt_list[0][0:256])

So, how many of you have masterclassists in the Amazon jungle, struggling to tell the difference between a Mommie Babies and a serial serial killer?


In [24]:
sarcastic_gpt = 0
not_sarcastic_gpt = 0
result_gpt = []

sarcastic_mistral = 0
not_sarcastic_mistral = 0
result_mistral = []

total = 0
count = 0

for i in range(len(gpt_list)):
  # Fix max length to prevent an error
  predict_gpt = pipe(gpt_list[i][0:256])
  predict_mistral = pipe(mistral_list[i][0:256])

  total += 1

  if predict_gpt[0]['label'] == 'LABEL_1':
    sarcastic_gpt += 1
    result_gpt.append(1)
  else:
    not_sarcastic_gpt += 1
    result_gpt.append(0)

  if predict_mistral[0]['label'] == 'LABEL_1':
    sarcastic_mistral +=1
    result_mistral.append(1)
  else:
    not_sarcastic_mistral += 1
    result_mistral.append(0)

  if count % 1000 == 0:
    # print(count, ": GPT - ", round(not_sarcastic_gpt / total, 3), round(sarcastic_gpt / total, 3))
    print(count, ": GPT - ", round(not_sarcastic_gpt / total, 3), round(sarcastic_gpt / total, 3), ". Mistral - ", round(not_sarcastic_mistral / total, 3), round(sarcastic_mistral / total, 3))

  count += 1

# print(len(result_gpt))
print(len(result_gpt), " ", len(result_mistral))

0 : GPT -  1.0 0.0 . Mistral -  0.0 1.0
1000 : GPT -  0.403 0.597 . Mistral -  0.065 0.935
2000 : GPT -  0.406 0.594 . Mistral -  0.061 0.939
3000 : GPT -  0.414 0.586 . Mistral -  0.062 0.938
4000 : GPT -  0.414 0.586 . Mistral -  0.064 0.936
5000 : GPT -  0.414 0.586 . Mistral -  0.064 0.936
6000 : GPT -  0.417 0.583 . Mistral -  0.063 0.937
7000 : GPT -  0.418 0.582 . Mistral -  0.061 0.939
8000 : GPT -  0.419 0.581 . Mistral -  0.063 0.937
8397   8397


In [26]:
from sklearn.metrics import f1_score, recall_score, precision_score

In [27]:
new_result_mistral = result_mistral.copy()
total = len(result_mistral)
count = 0

for i in range(len(result_mistral), len(mistral_list)):
  predict_mistral = pipe(mistral_list[i][0:256])

  total += 1

  if predict_mistral[0]['label'] == 'LABEL_1':
    sarcastic_mistral +=1
    new_result_mistral.append(1)
  else:
    not_sarcastic_mistral += 1
    new_result_mistral.append(0)

  if count % 1000 == 0:
    print(count, ": Mistral - ", round(not_sarcastic_mistral / total, 3), round(sarcastic_mistral / total, 3))

  count += 1

print(len(new_result_mistral))

0 : Mistral -  0.063 0.937
1000 : Mistral -  0.064 0.936
10001


In [28]:
print("Mistral - ", round(not_sarcastic_mistral / len(new_result_mistral), 3), round(sarcastic_mistral / len(new_result_mistral), 3))
print('\n')
print("GPT: Not Sarcastic - ", round(not_sarcastic_gpt / len(result_gpt), 3), end = '. ')
print('Sarcasctic - ', round(sarcastic_gpt / len(result_gpt), 3))
# print("GPT-2:", f1_score(y_true, result_gpt, average='macro', zero_division = 1.0))
print("Mistral: Not Sarcastic - ", round(not_sarcastic_mistral / len(new_result_mistral), 3), end = '. ')
print('Sarcastic - ', round(sarcastic_mistral / len(new_result_mistral), 3))

Mistral -  0.065 0.935


GPT: Not Sarcastic -  0.42. Sarcasctic -  0.58
Mistral: Not Sarcastic -  0.065. Sarcastic -  0.935


In [29]:
y_true_gpt = [1] * len(result_gpt)
y_true_mistral = [1] * len(new_result_mistral)

print("GPT-2 (Precision): ", precision_score(y_true_gpt, result_gpt, average = 'macro', zero_division = 1.0))
print("Mistral 7b (Precision): ", precision_score(y_true_mistral, new_result_mistral, average = 'macro', zero_division = 1.0))

print("GPT-2 (Recall): ", recall_score(y_true_gpt, result_gpt, average = 'macro', zero_division = 1.0))
print("Mistral 7b (Recall): ", recall_score(y_true_mistral, new_result_mistral, average = 'macro', zero_division = 1.0))

print("GPT-2:", f1_score(y_true_gpt, result_gpt, average='macro', zero_division = 1.0))
print("Mistral 7b: ", f1_score(y_true_mistral, new_result_mistral, average = 'macro', zero_division = 1.0))

GPT-2 (Precision):  0.5
Mistral 7b (Precision):  0.5
GPT-2 (Recall):  0.7899845182803382
Mistral 7b (Recall):  0.9675532446755324
GPT-2: 0.3670762041154745
Mistral 7b:  0.483232573761174


In [13]:
# fraction_gpt.insert(4, "No NaN F1 Score: " + str(f1_score(y_true, result_gpt, average='macro', zero_division = 1.0)))
# fraction_gpt.insert(4, "No NaN Predicted not sarcastic " + str(round(not_sarcastic_gpt / total, 3)))
# fraction_gpt.insert(4, "No NaN Predicted sarcastic: " + str(round(sarcastic_gpt / total, 3)))
# print(fraction_gpt[:7])
# test_dataset_gpt['Statistics'] = fraction_gpt[:10001]
# test_dataset_gpt.head()

In [14]:
#fraction_gpt = [""] * 9998
#fraction_gpt.insert(0, "F1 Score: " + str(f1_score(y_true, result_gpt, average='macro', zero_division = 1.0)))
#fraction_gpt.insert(0, "Predicted not sarcastic " + str(round(not_sarcastic_gpt / total, 3)))
#fraction_gpt.insert(0, "Predicted sarcastic: " + str(round(sarcastic_gpt / total, 3)))

#fraction_mistral = [""] * 9998
#fraction_mistral.insert(0, "F1 Score: " + str(f1_score(y_true, result_mistral, average='macro', zero_division = 1.0)))
#fraction_mistral.insert(0, "Predicted not sarcastic " + str(round(not_sarcastic_mistral / total, 3)))
#fraction_mistral.insert(0, "Predicted sarcastic: " + str(round(sarcastic_mistral / total, 3)))

#test_dataset_gpt['Predictions'] = result_gpt
#test_dataset_gpt['Statistics'] = fraction_gpt

#test_dataset_mistral['Predictions'] = result_mistral
#test_dataset_mistral['Statistics'] = fraction_mistral

In [15]:
# test_dataset_gpt = test_dataset_gpt.drop(columns = ['Unnamed: 0'])
# test_dataset_mistral = test_dataset_mistral.drop(columns = ['Unnamed: 0'])
# test_dataset = test_dataset.drop(columns = ['Unnamed: 0'])

In [16]:
# test_dataset_gpt.to_csv('/content/gdrive/MyDrive/CSCI567 Project/GPT-2 Inference Results.csv')
# test_dataset_mistral.to_csv('/content/gdrive/MyDrive/CSCI567 Project/Mistral-7b Inference Results.csv')