### Create a dataset of pairs from the eval set. Enrich it with scores, sentiment analysis and other markers so that the dataset can effectively be used to label the quality of each pair. 

In [12]:
import os
# Expose only GPU 0 to this process; the pipelines created later are placed on 'cuda:0'.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [76]:
from dataclasses import dataclass, field
from typing import Optional
import huggingface_hub
import functools as ft
import torch
import pandas as pd
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline, AutoConfig, GPTNeoXForCausalLM, AutoModelForCausalLM
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from transformers import pipeline, TextGenerationPipeline, AutoConfig, AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoModelForSequenceClassification, GPTNeoXForCausalLM, LlamaForSequenceClassification
from redditqa.dataset import load_reddit_dataset
from textblob import TextBlob
from hashlib import sha256
import json

In [14]:
# Load the paired eval split and keep a fixed (seeded) sample of 5000 pairs
eval_dataset = (
    load_reddit_dataset("eval", pairs=True)
    .shuffle(seed=42)
    .select(range(5000))
)

eval_dataset, eval_dataset[0]

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-cba55e4212677d14.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-d8898fc7c787d1eb.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-e35089f0b695ca2b.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-ea21b592f4358562.arrow


(Dataset({
     features: ['answer_link_id', 'question_title', 'response_j', 'response_k', 'score_j', 'score_k'],
     num_rows: 5000
 }),
 {'answer_link_id': '2y0dxt',
  'question_title': 'Why do employers ask "where do you see yourself in 5-10 years?" How do personal goals matter at all?',
  'response_j': 'After you respond, be sure to ask, "How do you see the company changing over that timespan?"',
  'response_k': "Whatever job you're applying for, think of the logical career path and where you should be in fifteen years. Like all basic interview questions it's more about whether you can have an adult conversation than the actual answers. ",
  'score_j': 13,
  'score_k': 2})

In [15]:
# Add sentiment analysis and question detection to both answers 

# Text classifier used to tag each response with an emotion label
sentiment_pipe = pipeline(
    "text-classification",
    model="michellejieli/emotion_text_classifier",
    device='cuda:0',
)
# Text classifier used to tag each response as question / non-question
question_detection_pipe = pipeline(
    "text-classification",
    model="huaen/question_detection",
    device='cuda:0',
)

def apply_get_emotions(row):
    """Store the top emotion label of both responses on the row.

    Reads ``response_j`` / ``response_k`` and writes ``emotion_j`` /
    ``emotion_k`` using the first prediction returned by ``sentiment_pipe``.
    Returns the (mutated) row so it can be used with ``Dataset.map``.
    """
    column_pairs = (("response_j", "emotion_j"), ("response_k", "emotion_k"))
    for source_column, target_column in column_pairs:
        top_prediction = sentiment_pipe(row[source_column])[0]
        row[target_column] = top_prediction['label']
    return row

def apply_question_detection(row):
    """Store the question-detection label of both responses on the row.

    Reads ``response_j`` / ``response_k`` and writes ``is_question_j`` /
    ``is_question_k`` using the first prediction returned by
    ``question_detection_pipe``. Returns the (mutated) row so it can be used
    with ``Dataset.map``.
    """
    column_pairs = (("response_j", "is_question_j"), ("response_k", "is_question_k"))
    for source_column, target_column in column_pairs:
        top_prediction = question_detection_pipe(row[source_column])[0]
        row[target_column] = top_prediction['label']
    return row
   
# Run both classifiers over every pair: emotions first, then question detection
eval_dataset = eval_dataset.map(apply_get_emotions)
eval_dataset = eval_dataset.map(apply_question_detection)

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-701044c431d0d9a7.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-4744c26eca1bd983.arrow


In [28]:
def percent(x):
    """Format a fraction as a percentage string with two decimals, e.g. 0.5 -> '50.00%'."""
    scaled = x * 100
    return f"{scaled:.2f}%"

def apply_textblob(row):
    """Store the TextBlob subjectivity score of both responses on the row.

    Reads ``response_j`` / ``response_k`` and writes ``subjectivity_j`` /
    ``subjectivity_k``. Polarity is available on the same sentiment object
    but is not stored. Returns the (mutated) row for use with ``Dataset.map``.
    """
    sentiment_j = TextBlob(row["response_j"]).sentiment
    sentiment_k = TextBlob(row["response_k"]).sentiment
    row["subjectivity_j"], row["subjectivity_k"] = (
        sentiment_j.subjectivity,
        sentiment_k.subjectivity,
    )
    return row

# Add the TextBlob subjectivity columns to every pair
eval_dataset = eval_dataset.map(apply_textblob)

                                                                 

In [29]:
# Inspect the enriched dataset and its first row
eval_dataset, eval_dataset[0]

(Dataset({
     features: ['answer_link_id', 'question_title', 'response_j', 'response_k', 'score_j', 'score_k', 'emotion_j', 'emotion_k', 'is_question_j', 'is_question_k', 'subjectivity_j', 'subjectivity_k'],
     num_rows: 5000
 }),
 {'answer_link_id': '2y0dxt',
  'question_title': 'Why do employers ask "where do you see yourself in 5-10 years?" How do personal goals matter at all?',
  'response_j': 'After you respond, be sure to ask, "How do you see the company changing over that timespan?"',
  'response_k': "Whatever job you're applying for, think of the logical career path and where you should be in fifteen years. Like all basic interview questions it's more about whether you can have an adult conversation than the actual answers. ",
  'score_j': 13,
  'score_k': 2,
  'emotion_j': 'neutral',
  'emotion_k': 'neutral',
  'is_question_j': 'question',
  'is_question_k': 'non_question',
  'subjectivity_j': 0.8888888888888888,
  'subjectivity_k': 0.255})

In [80]:
# Build the DataFrame that labellers will work with
df = pd.DataFrame(eval_dataset)

# The question-detection flags are not helpful for labelling, so drop them
df = df.drop(columns=['is_question_j', 'is_question_k'])

# Give the general (non J/K) columns human-readable names
df = df.rename(columns={'answer_link_id': 'Post ID', 'question_title': "Question"})

# Put the general columns first, then everything for J, then everything for K
column_order = [
    'Post ID',
    'Question',
    "response_j",
    "score_j",
    "emotion_j",
    "subjectivity_j",
    "response_k",
    "score_k",
    "emotion_k",
    "subjectivity_k",
]
df = df[column_order]

# Show emotions as emojis; "neutral" is rendered as an empty cell
emoji_map = {
    "neutral": "",
    "anger": "🤬",
    "disgust": "🤢",
    "fear": "😨",
    "joy": "😀",
    "sadness": "😭",
    "surprise": "😲"
}
for emotion_column in ("emotion_j", "emotion_k"):
    df[emotion_column] = df[emotion_column].map(emoji_map)

# Render subjectivity scores as strings with two decimals
for subjectivity_column in ("subjectivity_j", "subjectivity_k"):
    df[subjectivity_column] = df[subjectivity_column].map(lambda value: f"{value:.2f}")

# Derive a stable id per pair by hashing the post id together with both responses
pair_text = df["Post ID"] + df["response_j"] + df["response_k"]
pair_ids = pair_text.map(lambda text: sha256(text.encode('utf-8')).hexdigest())
df.insert(1, "Pair ID", pair_ids)

df.head()

Unnamed: 0,Post ID,Pair ID,Question,response_j,score_j,emotion_j,subjectivity_j,response_k,score_k,emotion_k,subjectivity_k
0,2y0dxt,b63a93a16996d2b7fa925f29fefb5bf89f6a8ba3d71842...,"Why do employers ask ""where do you see yoursel...","After you respond, be sure to ask, ""How do you...",13,,0.89,"Whatever job you're applying for, think of the...",2,,0.26
1,2sdmhm,b4193d5af4b8bd266a7a33ad263f4cb9ce3a73e1e5e22f...,Why do some people have dark circles under the...,I have a little medical background and one rea...,23,,0.56,Am I the only one who LIKES their raccoon ring...,1,,0.8
2,1wqcoe,fda0463e5a8a6998e42e616196e8d599939372944f99a7...,Is it possible to sneeze while sleeping? If no...,"No, at least not during REM sleep, as motor ne...",34,,0.66,I've seen one of my friends sneeze while sleep...,-4,😲,0.72
3,3x3egv,a723e6db4c9868a977f04680516be2a305aeabbac1952b...,Do pets know or realize what gender/sex their ...,"I am familiar with animals, specifically dogs,...",4,,0.44,And when my siblings come over...can they smel...,1,,0.0
4,1igu56,63f8f7e6e67ab2af2a1e4950cc702846534b7caf31620b...,How did Trey Parker and Matt Stone convince Co...,Most people don't know that Matt &amp; Trey pr...,3,,0.4,"They let the word ""fuck"" slip through a few ti...",-4,🤢,0.35


#### Generate table for labellers 

**Status**

250 annotated (Tolga 150, Moritz 50, JV 50)

**Commitments**

* Tolga: 1000
* Moritz: 500
* Yindong: 200
* Margarita: 300
* JV: 400

**Approach**

We want every sample to be annotated by two annotators 

**Idea** 

Tolga "covers" Moritz, Margarita and Yindong

JV covers 200 samples from the previous round as well as 200 samples from Tolga and Moritz, for which we then have 3 labels

**Result**

* 200 samples with 3 annotators 
* 1000 samples with 2 annotators 
* 50 samples with 1 annotator 

In [98]:
# Take pairs 250..1249 and add the empty columns the annotators will fill in
df_labelling = df.iloc[250:1250].assign(Assignee="", Label="", Comment="")

# Stack two full copies plus a third copy of the first 200 pairs, so that
# every pair appears twice and 200 pairs appear three times
stacked_copies = [df_labelling, df_labelling, df_labelling.iloc[:200]]
df_labelling = pd.concat(stacked_copies, ignore_index=True)

# Set assignees
def get_assignee(index):
    """Return the annotator responsible for the row at position ``index``.

    The labelling frame stacks three copies of the pairs:

    * rows 0-999: first copy of all 1000 pairs, annotated by Tolga
    * rows 1000-1999: second copy, split between Moritz (500), Yindong (200)
      and Margarita (300), matching their commitments
    * rows 2000 and up: third copy of the first 200 pairs, annotated by JV

    Args:
        index: Integer row position in the stacked labelling frame.

    Returns:
        The annotator's name as a string.
    """
    # (exclusive upper bound, assignee), checked in ascending order
    upper_bounds = (
        (1000, "Tolga"),
        (1500, "Moritz"),
        (1700, "Yindong"),
        # This range used to return "Yindong" again, which left Margarita
        # without any rows and gave Yindong 500 instead of 200.
        (2000, "Margarita"),
    )
    for upper_bound, assignee in upper_bounds:
        if index < upper_bound:
            return assignee
    return "JV"
# The row position in the stacked frame decides who annotates the row
df_labelling["Assignee"] = [get_assignee(position) for position in df_labelling.index]


Verify that we have 800 pairs annotated by two people and 200 pairs annotated by three.

In [99]:
# Count the assignee entries per pair, then tally how many pairs have each count
df_labelling.groupby("Pair ID")["Assignee"].count().value_counts()

Assignee
2    800
3    200
Name: count, dtype: int64

Show and export the dataset

In [102]:
# Create a multi index for the columns to group them by J or K 

def rename_column(col):
    """Map a flat column name to a ``(group, field)`` tuple for a two-level header.

    Columns are grouped by their J/K suffix, so the suffix becomes the top
    level and the field name the second level:

    * ``"response_j"`` -> ``("J", "Response")``
    * ``"Post ID"``    -> ``("Post ID", "")`` (no underscore: ungrouped column)
    * a tuple is returned unchanged, so re-running the cell is harmless

    The name is split on its *last* underscore, so a field name that itself
    contains underscores stays intact.

    Args:
        col: A column label, either a string or an already converted tuple.

    Returns:
        A ``(top_level, sub_level)`` tuple of strings.
    """
    # Already a multi-index entry
    if isinstance(col, tuple):
        return col

    # General columns have no J/K suffix and get an empty sub-level
    if "_" not in col:
        return col, ""

    # The suffix (j/k) is the group; it was previously returned as the
    # sub-level, which grouped columns by field instead of by J or K.
    field_name, _, group = col.rpartition("_")
    return group.capitalize(), field_name.capitalize()

# Replace the flat column labels with a two-level header
column_tuples = list(map(rename_column, df_labelling.columns))
df_labelling.columns = pd.MultiIndex.from_tuples(column_tuples)

df_labelling.head()

Unnamed: 0_level_0,Post ID,Pair ID,Question,J,J,J,J,K,K,K,K,Assignee,Label,Comment
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Response,Score,Emotion,Subjectivity,Response,Score,Emotion,Subjectivity,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,6477or,6515193e8371d0c4c71c1b753b489d521943c49f6424fc...,Why is mass surveillance by the government on ...,The legal reason is because you do not have an...,27,,0.25,What exactly do you mean by mass surveillance?...,3,😲,0.71,Tolga,,
1,29e2fn,c08ffa04985f2779f17d67a981ff249043ab1523ef7815...,Why do I feel more tired when I wake up early ...,Sleep works in different stages (like REM or d...,6,,0.52,I take a lot if medications that make me tired...,1,,0.7,Tolga,,
2,20x5ha,5435dfe3489093d2f8598c68fc7286bb4de421fbfd235d...,If I am/my kids are vaccinated why should I be...,Because non-vaccinated children/people can hos...,3,,0.0,"If you and your child are vaccinated with, say...",2,,0.5,Tolga,,
3,20gu9e,38133a7d8129d73504a64eb43916fbe8945ac93e77b856...,"What exactly is the issue with bees right now,...",The bees are infested with parasites. that's w...,2,🤢,0.49,I get that bees might be disappearing in the w...,-1,,0.26,Tolga,,
4,3l674c,a4546ef47c6ed8722f49962c014c67207df808a008bc3a...,Why do Chinese companies go to such lengths to...,Roadside car breakers in Ireland mis-spell the...,3,,0.65,That reminds me. Years ago we bought a video g...,1,,0.49,Tolga,,


Export

In [103]:
# Write the labelling sheet (rows 250-1249, duplicated for multiple annotators) to an Excel file
df_labelling.to_excel('eval_dataset_for_labelling-250to1250-multiple_annotators.xlsx')