### Create a dataset of answer pairs from the eval set. Enrich it with scores, sentiment analysis, and other markers (question detection, polarity, subjectivity) so that the dataset can be used effectively to label the quality of each pair.

In [16]:
import os
# Set CUDA_VISIBLE_DEVICES before torch is imported in the next cell.
# NOTE(review): presumably this pins the notebook to GPU index 0 — confirm the
# index is the intended one when running on a shared machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [17]:
from dataclasses import dataclass, field
from typing import Optional
import huggingface_hub
import functools as ft
import torch
import pandas as pd
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline, AutoConfig, GPTNeoXForCausalLM, AutoModelForCausalLM
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from transformers import pipeline, TextGenerationPipeline, AutoConfig, AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoModelForSequenceClassification, GPTNeoXForCausalLM, LlamaForSequenceClassification
from redditqa.dataset import load_reddit_dataset
from textblob import TextBlob

In [18]:
# Load the eval split as answer pairs, shuffle it with a fixed seed and keep
# the first 250 shuffled pairs as the sample to enrich.
eval_dataset = (
    load_reddit_dataset("eval", pairs=True)
    .shuffle(seed=42)
    .select(range(250))
)

# Cell output: the dataset summary together with its first pair.
eval_dataset, eval_dataset[0]

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-cba55e4212677d14.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-d8898fc7c787d1eb.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-e35089f0b695ca2b.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-ea21b592f4358562.arrow


(Dataset({
     features: ['answer_link_id', 'question_title', 'response_j', 'response_k', 'score_j', 'score_k'],
     num_rows: 250
 }),
 {'answer_link_id': '2y0dxt',
  'question_title': 'Why do employers ask "where do you see yourself in 5-10 years?" How do personal goals matter at all?',
  'response_j': 'After you respond, be sure to ask, "How do you see the company changing over that timespan?"',
  'response_k': "Whatever job you're applying for, think of the logical career path and where you should be in fifteen years. Like all basic interview questions it's more about whether you can have an adult conversation than the actual answers. ",
  'score_j': 13,
  'score_k': 2})

In [19]:
# Add sentiment analysis and question detection to both answers

# Use the first visible GPU when CUDA is available and fall back to the CPU
# otherwise, so this cell also runs on a machine without a GPU.
PIPELINE_DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Text classifier whose top label is stored as the emotion of a response.
sentiment_pipe = pipeline("text-classification", model="michellejieli/emotion_text_classifier", device=PIPELINE_DEVICE)
# Text classifier whose top label is stored as the question flag of a response.
question_detection_pipe = pipeline("text-classification", model="huaen/question_detection", device=PIPELINE_DEVICE)

def apply_get_emotions(row):
    """Attach the predicted emotion label of both responses to a pair.

    Args:
        row: One dataset record holding the answer texts under
            ``response_j`` and ``response_k``.

    Returns:
        The same ``row`` with ``emotion_j`` and ``emotion_k`` added, each set
        to the label of the first prediction returned by ``sentiment_pipe``.
    """
    for suffix in ("j", "k"):
        # truncation=True clips answers that exceed the model's maximum input
        # length instead of letting one long answer abort the whole map.
        prediction = sentiment_pipe(row[f"response_{suffix}"], truncation=True)
        row[f"emotion_{suffix}"] = prediction[0]["label"]
    return row

def apply_question_detection(row):
    """Attach the question-detection label of both responses to a pair.

    Args:
        row: One dataset record holding the answer texts under
            ``response_j`` and ``response_k``.

    Returns:
        The same ``row`` with ``is_question_j`` and ``is_question_k`` added,
        each set to the label of the first prediction returned by
        ``question_detection_pipe``.
    """
    for suffix in ("j", "k"):
        # truncation=True clips answers that exceed the model's maximum input
        # length instead of letting one long answer abort the whole map.
        prediction = question_detection_pipe(row[f"response_{suffix}"], truncation=True)
        row[f"is_question_{suffix}"] = prediction[0]["label"]
    return row
    
# Annotate every pair with emotion labels first, then with question flags.
eval_dataset = eval_dataset.map(apply_get_emotions)
eval_dataset = eval_dataset.map(apply_question_detection)

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-aa778f10af66955c.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-5e29707aeaf881d0.arrow


In [20]:
def percent(x):
    """Format a fraction as a percentage string with two decimals (0.5 -> '50.00%')."""
    scaled = x * 100
    return f"{scaled:.2f}%"

def apply_textblob(row):
    """Attach TextBlob polarity and subjectivity of both responses to a pair.

    For each of ``response_j`` and ``response_k`` the TextBlob sentiment is
    computed and stored, formatted by ``percent``, under ``polarity_<suffix>``
    and ``subjectivity_<suffix>``. The same ``row`` is returned.
    """
    for suffix in ("j", "k"):
        sentiment = TextBlob(row["response_" + suffix]).sentiment
        row["polarity_" + suffix] = percent(sentiment.polarity)
        row["subjectivity_" + suffix] = percent(sentiment.subjectivity)
    return row

# Add the TextBlob polarity/subjectivity columns for both responses of every pair.
eval_dataset = eval_dataset.map(apply_textblob)

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-4dad404e64e4d280.arrow


In [21]:
# Display the first enriched pair to check the columns added by the map calls above.
eval_dataset[0]

{'answer_link_id': '2y0dxt',
 'question_title': 'Why do employers ask "where do you see yourself in 5-10 years?" How do personal goals matter at all?',
 'response_j': 'After you respond, be sure to ask, "How do you see the company changing over that timespan?"',
 'response_k': "Whatever job you're applying for, think of the logical career path and where you should be in fifteen years. Like all basic interview questions it's more about whether you can have an adult conversation than the actual answers. ",
 'score_j': 13,
 'score_k': 2,
 'emotion_j': 'neutral',
 'emotion_k': 'neutral',
 'is_question_j': 'question',
 'is_question_k': 'non_question',
 'polarity_j': '50.00%',
 'subjectivity_j': '88.89%',
 'polarity_k': '17.00%',
 'subjectivity_k': '25.50%'}

In [22]:
# Export pairs for labelling.
# The row count in the file name is read from the exported frame itself, so the
# name cannot drift from the data if the sample size is changed upstream.
labelling_frame = pd.DataFrame(eval_dataset)
labelling_frame.to_excel(f"eval_dataset_for_labelling-n{len(labelling_frame)}.xlsx")