In [1]:
%%capture
!pip install datasets transformers tqdm
!pip install torch
!pip install transformers[torch]
!pip install accelerate -U

In [2]:
import pandas as pd
import numpy as np
import ast
from datasets import Dataset
from transformers import pipeline, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

In [3]:
# Mount Google Drive at /content/drive; the CSV and the saved model that the
# following cells read are addressed through paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Read the validation CSV and keep only the rows whose `level` is above 1.
validation_csv = '/content/drive/MyDrive/tar_projekt/csvs/dataset_validation_allLevels.csv'
all_levels_df = pd.read_csv(validation_csv)
test_df = all_levels_df[all_levels_df['level'] > 1]

# `text` stays a pandas Series. `change` holds list literals stored as strings,
# so every entry is parsed into a real Python list.
texts = test_df['text']
labels = [ast.literal_eval(raw_change) for raw_change in test_df['change']]

In [12]:
# Display the filtered validation frame (only rows with level > 1 remain).
test_df

Unnamed: 0,id,text-id,text,num-authors,level,change,text-length
900,medium-1,1,I asked them about something of that nature an...,4,2,"[1, 0, 1, 1, 1, 1, 0]",2813
901,medium-2,2,"In general, be courteous to others. Debate/dis...",4,2,"[0, 1, 1, 1, 1, 1, 1]",1554
902,medium-3,3,Hitler was ethnically German. The part of the ...,2,2,"[1, 0, 0, 1, 0]",2801
903,medium-4,4,It would be good if their athletes could still...,3,2,"[1, 1, 1, 1]",1064
904,medium-5,5,"Mexican here, this is the context: Plan B aros...",3,2,"[0, 0, 1, 1]",1941
...,...,...,...,...,...,...,...
2695,hard-896,896,There's actually many different ways a tank ca...,2,3,"[0, 1, 0, 0]",1745
2696,hard-897,897,"ok, and, as the article mentions, the vast maj...",4,3,"[1, 1, 0, 1, 0, 0, 0, 0, 0]",1500
2697,hard-898,898,"Ok, I went through the online thingy to self-v...",2,3,"[0, 0, 0, 1]",1196
2698,hard-899,899,They're not stupid. Without US support and pro...,2,3,[1],1095


# 1. Evaluation on the whole dataset

In [13]:
# One example per pair of neighbouring paragraphs: the two stripped paragraphs
# joined by a newline, together with the label for that pair.
test_data = []

for document, changes in zip(texts, labels):
    paragraphs = [part.strip() for part in document.split(' ||| ')]
    neighbours = zip(paragraphs, paragraphs[1:])
    # zip() stops at the shorter input, so a pair without a label is skipped.
    test_data.extend(
        {'text': f'{left}\n{right}', 'label': change}
        for (left, right), change in zip(neighbours, changes)
    )

In [15]:
# Put the pair records into a DataFrame, convert that into a Dataset,
# and show the first example as a quick sanity check.
pairs_frame = pd.DataFrame(test_data)
test_data = Dataset.from_pandas(pairs_frame)
test_data[0]

{'text': 'I asked them about something of that nature and they weren’t willing to work with me on it (it’s a committee in the advising office that makes the determination.) Basically they said that my petition wouldn’t get approved unless I late withdrew after making another “effort,” which I honestly don’t want to have to do (a) for financial reasons, I’m 23 living independently and lost my job to covid but the school won’t help me with financial aid at all so I’m $100k+ in loan debt and (b) it may sound like BS, but I don’t want to go through the humiliating and overwhelming stressful process of taking a language class, since my last attempt at the university my professor called me lazy multiple times despite me going to office hours and working my butt off.\nYou probably need to secure the waiver from the disability resource office, they will have the ability typically to make the academic departments stick to it.',
 'label': 1}

In [16]:
# The saved model and its tokenizer are loaded from the same directory.
model_dir = "/content/drive/MyDrive/tar_projekt/results_model"
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [32]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook's tokenizer.

    ``examples["text"]`` may be a single string or a list of strings (a batch).
    Every sequence is truncated and padded to the maximum length, so all rows
    end up with the same size.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# batched=True passes many texts to the tokenizer per call instead of one,
# producing the same encodings with far fewer calls.
tokenized_test = test_data.map(tokenize_function, batched=True)
# NOTE(review): the string `text` column is left in the dataset; presumably
# Trainer discards it as an unused column before the forward pass - confirm.
tokenized_test = tokenized_test.rename_column("label", "labels")
tokenized_test.set_format("torch")

Map:   0%|          | 0/11154 [00:00<?, ? examples/s]

In [34]:
# Inspect the renamed `labels` column after the torch formatting was applied.
tokenized_test['labels']

tensor([1, 0, 1,  ..., 0, 0, 0])

In [35]:
# Initialize the Trainer.
# Only the model is passed, so every other Trainer setting keeps its default.
trainer = Trainer(model=model)

# Make predictions for every tokenized paragraph pair.
# The `.predictions` attribute of the result is what the following cells inspect.
predictions = trainer.predict(tokenized_test)

In [36]:
# Shape check: one row per example and one column per class score.
predictions.predictions.shape

(11154, 2)

In [38]:
# The number of labels should match the number of prediction rows.
tokenized_test['labels'].shape

torch.Size([11154])

In [42]:
# Look at the raw score array; argmax is taken over it in a later cell.
predictions.predictions

array([[ 1.0491076 , -0.40044987],
       [-0.08386427, -0.2601361 ],
       [ 0.77199644, -0.20676562],
       ...,
       [-0.03341053, -0.14320481],
       [ 0.66968054, -0.26396796],
       [-0.2256942 ,  0.01367193]], dtype=float32)

In [43]:
from sklearn.metrics import f1_score

In [44]:
# Predicted class = index of the highest score in each row.
predicted_labels = predictions.predictions.argmax(axis=1)

# Expected classes, taken from the dataset as a NumPy array.
true_labels = np.array(tokenized_test["labels"])

# F1 over every paragraph pair, using f1_score's default settings.
f1 = f1_score(true_labels, predicted_labels)

In [47]:
# Number of predicted labels; expected to equal the number of examples.
len(predicted_labels)

11154

In [48]:
# Report the F1 score computed over all paragraph pairs of the filtered validation set.
print(f"F1 Score (on whole dataset): {f1}")

F1 Score (on whole dataset): 0.6493130874909617


# 2. Evaluation on texts that were also annotated by humans

In [88]:
# Ids of the documents that were also annotated by humans.
id_human = ['medium-534', 'medium-677', 'hard-51', 'medium-84',
            'medium-492', 'medium-564', 'hard-72', 'hard-656',
            'hard-370', 'medium-17', 'hard-501', 'hard-784',
            'medium-659', 'hard-555', 'hard-3', 'hard-624',
            'medium-445', 'hard-808', 'medium-85', 'medium-746',
            'hard-547', 'hard-862', 'medium-192', 'hard-24',
            'medium-31', 'medium-269', 'hard-121', 'medium-889',
            'hard-118', 'hard-762', 'hard-693', 'medium-253']

# Cut the list into consecutive groups of four ids and number the groups
# starting from 1; a shorter final group is kept if the list does not divide evenly.
group_size = 4
id_dict = {
    group_number: id_human[start:start + group_size]
    for group_number, start in enumerate(range(0, len(id_human), group_size), start=1)
}

# Show the resulting dictionary
id_dict

{1: ['medium-534', 'medium-677', 'hard-51', 'medium-84'],
 2: ['medium-492', 'medium-564', 'hard-72', 'hard-656'],
 3: ['hard-370', 'medium-17', 'hard-501', 'hard-784'],
 4: ['medium-659', 'hard-555', 'hard-3', 'hard-624'],
 5: ['medium-445', 'hard-808', 'medium-85', 'medium-746'],
 6: ['hard-547', 'hard-862', 'medium-192', 'hard-24'],
 7: ['medium-31', 'medium-269', 'hard-121', 'medium-889'],
 8: ['hard-118', 'hard-762', 'hard-693', 'medium-253']}

In [69]:
# Restrict the validation frame to the documents listed in id_human.
is_human_annotated = test_df['id'].isin(id_human)
human_df = test_df[is_human_annotated]

texts_ids = human_df['id']
texts_h = human_df['text']
# `change` holds list literals stored as strings; parse each one into a list.
labels_h = [ast.literal_eval(raw_change) for raw_change in human_df['change']]

In [73]:
# One example per pair of neighbouring paragraphs, tagged with the id of the
# document the pair comes from.
human_data = []

# The loop variable is `text_id`, not `id`: it lives at notebook scope, so
# calling it `id` would replace the built-in id() for every later cell.
for text_id, text, label in zip(texts_ids, texts_h, labels_h):
    paragraphs = text.split(' ||| ')
    # zip() stops at the shorter input, so a pair without a label is skipped.
    for (p1, p2), y in zip(zip(paragraphs, paragraphs[1:]), label):
        p1 = p1.strip()
        p2 = p2.strip()

        human_data.append({'text-id': text_id, 'text': p1 + '\n' + p2, 'label': y})

In [78]:
# One row per paragraph pair, with the columns text-id, text and label.
paragraphs_df = pd.DataFrame(human_data)

In [89]:
# Every row starts as 'unknown'; rows whose document id belongs to one of the
# groups in id_dict are then overwritten with that group's key.
paragraphs_df['version'] = 'unknown'

for group_key, group_ids in id_dict.items():
    in_group = paragraphs_df['text-id'].isin(group_ids)
    paragraphs_df.loc[in_group, 'version'] = group_key

In [92]:
# Display the paragraph pairs together with their assigned version.
paragraphs_df

Unnamed: 0,text-id,text,label,version
0,medium-17,"Due to the overreach on the initial layoffs, c...",1,3
1,medium-17,I used to be in Pharmacy. Then I did a researc...,0,3
2,medium-17,So if a medical center has any that are partia...,1,3
3,medium-17,Open vials are supposed to be discarded after ...,1,3
4,medium-17,It costs about 15-35 cents to make a mL of ins...,1,3
...,...,...,...,...
158,hard-862,"Well, I still don’t have an answer. I have opt...",0,6
159,hard-862,"Same here, others too. I don’t consider email ...",1,6
160,hard-862,After months of email going to spam I finally ...,1,6
161,hard-862,Receiving the same e-mails from Cedars trying ...,0,6


In [93]:
# tokenize_function is defined in the tokenization cell of section 1 and is
# reused here rather than being defined a second time.


def evaluate_data(test_data, model):
    """Run ``model`` on ``test_data`` and score its predictions with F1.

    Args:
        test_data: a ``Dataset`` with a ``text`` column (the paragraph pair)
            and a ``label`` column (the expected class).
        model: the model handed to ``Trainer`` for prediction.

    Returns:
        A tuple ``(predicted_labels, true_labels, f1)``: the argmax class for
        each example, the labels read back from the dataset, and the score
        returned by ``f1_score`` with its default settings.
    """
    # Batched mapping tokenizes many texts per call; the encodings are unchanged.
    tokenized_test = test_data.map(tokenize_function, batched=True)
    tokenized_test = tokenized_test.rename_column("label", "labels")
    tokenized_test.set_format("torch")

    # Only the model is supplied, so the Trainer runs with its default arguments.
    trainer = Trainer(model=model)
    predictions = trainer.predict(tokenized_test)

    # Predicted class = index of the highest score in each row.
    predicted_labels = np.argmax(predictions.predictions, axis=1)
    true_labels = np.array(tokenized_test["labels"])
    f1 = f1_score(true_labels, predicted_labels)

    return predicted_labels, true_labels, f1

In [105]:
# Score every version group on its own and collect one F1 value per group.
results_vers = {'version': [], 'f1': []}

for version_key in id_dict:
    version_rows = paragraphs_df.loc[paragraphs_df['version'] == version_key, ['text', 'label']]
    # evaluate_data returns (predicted_labels, true_labels, f1); only the score is kept.
    version_f1 = evaluate_data(test_data=Dataset.from_pandas(version_rows), model=model)[2]
    results_vers['version'].append(version_key)
    results_vers['f1'].append(version_f1)

results_versions_df = pd.DataFrame(results_vers)

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

In [110]:
# Score all human-annotated paragraph pairs together.
predicted, true, f1 = evaluate_data(test_data=Dataset.from_pandas(paragraphs_df[['text', 'label']]),
                                    model=model)

# Add the overall score as an 'all' row using the public pd.concat API
# (DataFrame._append is a private pandas method). Any existing 'all' row is
# dropped first, so re-running this cell does not stack duplicate rows.
per_version_rows = results_versions_df[results_versions_df['version'] != 'all']
all_row = pd.DataFrame([{'version': 'all', 'f1': f1}])
results_versions_df = pd.concat([per_version_rows, all_row], ignore_index=True)

Map:   0%|          | 0/163 [00:00<?, ? examples/s]

In [111]:
# Display the per-version F1 scores together with the 'all' row.
results_versions_df

Unnamed: 0,version,f1
0,1,0.705882
1,2,0.666667
2,3,0.580645
3,4,0.761905
4,5,0.580645
5,6,0.736842
6,7,0.470588
7,8,0.636364
8,all,0.636872


# 3. Results

In [113]:
# `f1` is recomputed from the whole-dataset labels here because the name `f1`
# is reassigned by the evaluation cells in section 2.
f1 = f1_score(true_labels, predicted_labels)
print(f"F1 Score (on whole dataset): {f1}")

F1 Score (on whole dataset): 0.6493130874909617


In [114]:
# Final table: F1 per version group plus the score over all human-annotated pairs.
results_versions_df

Unnamed: 0,version,f1
0,1,0.705882
1,2,0.666667
2,3,0.580645
3,4,0.761905
4,5,0.580645
5,6,0.736842
6,7,0.470588
7,8,0.636364
8,all,0.636872
