BEFORE RUNNING: 

Set the Runtime Instance to use GPU Acceleration 
(Runtime -> Change Runtime Type -> Select "GPU" in the Hardware Accelerator dropdown menu).

In [1]:
from datetime import date, datetime
current_date = date.today().strftime('%b-%d-%Y')
current_time = datetime.now().time().strftime('%H:%M:%S')
print(f'ITEM GENERATION START TIME: {current_date}, {current_time}')

ITEM GENERATION START TIME: Apr-24-2022, 17:12:06


NOTE: In order to replicate the results from this study, the decoder_model should be set to 'GPT2-XL'. Presently, this model is too large to run with the resources alloted for free in Colab. A Colab Pro account provided sufficient resources at the time of writing to run this model. By default, the decoder_model is set to 'GPT2', as it can presently run successfully on a free GPU runtime instance. 

You may be able to run a larger model for free after lowering some of the optional arguments provided below. The values listed are those which were used in the final run used in the study.

In [2]:
# RUN INFO
#decoder_model = 'GPT2-XL' # Model used to generate item pool in study; REQUIRES HIGH RAM INSTANCE WITH COLAB PRO
decoder_model = 'GPT2' # Model which can be run with a free colab GPU instance.
sentence_encoder_model = 'Sentence_RoBERTa'
# RUN SETTINGS
stem_txt = 'I' # Common Stem
fluff= -1 # Add this amount to a given facet's seq_limit, which is the average facet item length!
n_item_pairs = 30 # How many random pairs of items do you want to create for every factor?

# BEAM SEARCH ARGS
#beam_search_n=1000 # n_beams
n_gram_rule=2 # n_gram
seqs_returned = 50 # return_seqs
beam_k=30 # k for top_k sample in beam search
beam_sample=.85 # p for top_p sample in beam search
rep_penalty=5.0 # Repetition Penalty
sample_temp=0.7 # Temperature of Sampling Distribution 



 # GPT-2 Token ids for banned words 
banned_token_idx = []
conjunctions_etc =  [[14508], [635], [392], [290], [18855], [606], [780], [13893], [31336], [2035], [270], [340], [273], [393], [14108], [534], [5832], [345]]
banned_token_idx.extend(conjunctions_etc) #Banned Words: also, [space]also, and, [space]and, them, [space]them, because, [space]because, either, [space]either, your, [space]your, you, [space]you

punctuation = [[1],[366], [198], [11], [553], [42911]]
banned_token_idx.extend(punctuation) #", [space]", ,", it, [space]it, or, [space]or, '"', '[space]"', '\n',[space]',', ',"', '[space],"'

actual_bad_words = [[33526], [562], [21551],[19317], [29836]]
banned_token_idx.extend(actual_bad_words)


In [3]:
%%capture
!pip install transformers
!pip install sentence_transformers
!pip install datasets

In [4]:
# Set Seeds...
import numpy as np
np.random.seed(2020)
import random
random.seed(2020)

import torch
torch.random.manual_seed(2020) 

import transformers
transformers.set_seed(2020)

torch.backends.cudnn.deterministic = True
if torch.cuda.is_available(): torch.cuda.manual_seed_all(2020)

Upload a copy of the 'HEXACO 100-item.csv' file included in this repository, or upload with another survey of your choosing and set the file name to df_path!

In [5]:
df_path='./HEXACO 100-Items.csv'

import regex
tgt_survey = regex.sub(string = df_path, pattern = "\\.\/", repl = "")
tgt_survey = regex.sub(string = tgt_survey, pattern = "\\.csv", repl = "")
export_path = f"./{tgt_survey}|GPT-2 Large RoBERTa Large " + current_date + ".xlsx"


In [6]:
### Load Pre-trained Transformers

## Generative Model - GPT-2
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel

# Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(decoder_model.lower()) # Need the decoder's tokenizer!
tokenizer.eos_token = '\.'
tokenizer.pad_token = tokenizer.eos_token # gpt2 has no default pad token...

# Model
gpt2 =GPT2LMHeadModel.from_pretrained(decoder_model.lower()) # gpt2 no fine-tuning
cuda=torch.device('cuda:0')
gpt2.to(cuda)


## Encoder Models - Sentence RoBERTa
from sentence_transformers import SentenceTransformer, util

# STS Model
sts_encoder = SentenceTransformer('stsb-roberta-large') # sentence roberta no fine-tuning
# Paraphrase Model
paraphraser = SentenceTransformer('paraphrase-distilroberta-base-v1') # distilled roberta no fine-tuning


In [7]:
# Import all items from a csv file...
import pandas as pd
survey_items=pd.read_csv(f"./{tgt_survey}.csv")

#Turn both columns into lists...
item = survey_items.loc[:,'ITEM'].tolist() #item - the actual survey items
info = survey_items.loc[:,'INFO'].tolist() #info - indicates beginning of new facet, indicates whether an item is + or - coded

#Cleaning item info list...
import regex
import numpy as np

#item_info = [regex.sub(string=i,  pattern=" \\(.*", repl="") for i in info] # Remove reliability info from facet headers
item_info = info

for i in range(0,len(item_info)): # Replace blank lines with info for keying found in previous line.
    if item_info[i] is " " or type(item_info[i]) is float:
        item_info[i] = item_info[i-1]

item_info = [regex.sub(string=item_i, pattern = "\\xa0.*",repl="") for item_i in item_info] # Formatting requirement for McCrae Costa 30-facet scale

item_dict={} 
for i in range(0,len(item_info)):
    if not bool(regex.search(pattern="\\+ keyed",string=item_info[i])) and not bool(regex.search(pattern='- keyed',string=item_info[i])):
        curr_var = {}
        j = i+1
               
        pos_key=[]
        neg_key=[]
        
        while not bool(regex.search(pattern="^\\w",string=item_info[j])):
#        
            if bool(regex.search(pattern="\\+ keyed",string=item_info[j])):
                #pos_key.append("I " + item[j].lower())
                pos_key.append(item[j])
            if bool(regex.search(pattern="- keyed",string=item_info[j])):
                #neg_key.append("I " + item[j].lower())
                neg_key.append(item[j])
#            
            j+=1
            if j == (len(item_info)-1):
                break

        curr_var['+'] = pos_key
        curr_var['-'] = neg_key

        item_dict[item_info[i]] = curr_var


In [8]:

# Performs a beam search on a decoder transformer given some prompt text, and appends the output text of length k to a list. 
def get_sequence(model,prompt=str,word_limit=10,word_lower_limit=3,n_beams=1,n_gram=0,return_seqs=1,p=.5,prohibited_ids=banned_token_idx):
    
  #  assert n_beams >= return_seqs, 'n_beams must be greater than or equal to return_seqs!'

    #beam_outputs=model.generate(**tokenizer([prompt],return_tensors='pt'),
    beam_outputs=model.generate(**prompt,
    max_length=word_limit,
    min_length=word_lower_limit,
    early_stopping=True,
    #num_beams=n_beams,
    no_repeat_ngram_size=n_gram,
    num_return_sequences=return_seqs,
    do_sample=True,
    top_k=beam_k,
    temperature=sample_temp, 
    repetition_penalty=rep_penalty,
    top_p=p,
    bad_words_ids=prohibited_ids,
    early_stopping_rounds=True # stops generation of sequence at EOS token!
    )
    
    seqs = []
    for i, seq in enumerate(beam_outputs):
        seqs.append(tokenizer.decode(seq.tolist()))
    
    return seqs

# Simple function for cleaning and formatting sequence output. Drops all content before the first period (i.e. primer text) and after the second period (i.e., text for second sequence).
def clean_sequence(seq, drop_primer=True, drop_fragments=True, drop_duplicates=True):
    global lower_limit

    for i in range(0,len(seq)):
        if drop_primer:
            seq[i] = regex.sub(string = seq[i], pattern = prompt_txt, repl="")
            seq[i] = "I" + seq[i]
        #if drop_extra:        
            
            
        # seq[i] = seq[i] + '.'
        seq[i] = regex.sub(string = seq[i], pattern = "\n\n.*", repl="")
        #seq[i] = regex.sub(string = seq[i], pattern = "\n", repl="")
        seq[i] = regex.sub(string = seq[i], pattern = "\..*$", repl=".")
        #seq[i] = regex.sub(string = seq[i], pattern = ",.*$", repl="")
    #seq = [i for i in seq if len(regex.split(string=i,pattern=" ")) > lower_limit]
    if drop_fragments:
      seq = [i for i in seq if bool(regex.search(string=i,pattern="\."))] # Only keep sentences containing a period
    
    if drop_duplicates:
      seq = list(set(seq)) # Remove duplicate items

    return seq
#%%

def convergent(seqs, curr_key, item_list, sentence_transformer):
    emb_items = torch.tensor(sentence_transformer.encode(item_list))
    #emb_items_pos = sentence_transformer.encode(item_dict[curr_key]['+'])
    #emb_items_neg = sentence_transformer.encode(item_dict[curr_key]['-'])
    #emb_items = (torch.abs(torch.tensor(emb_items_pos)) + torch.abs(torch.tensor(emb_items_neg)))/2


    emb_seqs = sentence_transformer.encode(seqs)
    cos_sim = util.pytorch_cos_sim(emb_seqs, emb_items)
    
    tgt_cosines = torch.median(cos_sim,1).values
    
    
    return tgt_cosines


def discriminant(seqs, curr_key, item_list, sentence_transformer):

    emb_items = torch.tensor(sentence_transformer.encode(item_list))

    emb_seqs = sentence_transformer.encode(seqs)
    cos_sim = util.pytorch_cos_sim(emb_seqs, emb_items)
    
    tgt_cosines = torch.median(cos_sim,1).values
    

    cosine_diff={}
    for idx, key in enumerate(item_dict.keys()):
        if key != curr_key:
            curr_items = item_dict[key]['+'].copy()
            curr_items.extend(item_dict[key]['-'])
            emb_items = sentence_transformer.encode(curr_items)
            cos_sim = util.pytorch_cos_sim(emb_seqs, emb_items)
            median_cos = torch.median(cos_sim,1).values
            
            cosine_diff[key] = median_cos
    
    tensor_list = [cosine_diff[key] for key in cosine_diff.keys() ]
    
    n_factors = idx
    non_tgt_mean_cosines = torch.abs(torch.cat(tensor_list).view(n_factors, tensor_list[0].shape[0]).transpose(-1,0)).median(dim=1).values 

    cosine_discriminant = tgt_cosines - non_tgt_mean_cosines
    
    return cosine_discriminant

def internal_consistency(item_list, sentence_transformer):
    import numpy as np
    emb_items = sentence_transformer.encode(item_list)
    cos_sim = util.pytorch_cos_sim(emb_items, emb_items)
    
    median_cosines = torch.median(cos_sim,1).values
    
    return np.round(median_cosines,4)

#%%
# Takes a list of generated text sequences, and calculates the average cosine similarity of each element with a list of survey items. 
# Returns the top k items with the highest average cosine similarity.

def screen_item_pool(item_pool, cosines, sentence_transformer, curr_key,k=10):
    import torch

    assert type(cosines) == list, "Cosines must be a list of lists!!!"
      
      

    
    final_items = []
    final_cosines = []

    for cos in cosine_distinct:
      # Get indices and cosines sorted by cosine similarity
      idxs=torch.argsort(cos,0) # indices for matching with item pool idx
      order=idxs.tolist() # indices as list for sorting idx and cosine value tensors
      top_cosine = cos[order] # cosine values

      #lb= torch.quantile(torch.absolute(torch.tensor(cos)),q=.9)
      lb= torch.quantile(torch.absolute(torch.tensor(cos)),q=0)
      ub= torch.quantile(torch.absolute(torch.tensor(cos)),q=1)

      top_k = idxs[torch.where((torch.absolute(torch.tensor(cos)) >= lb) & (torch.absolute(torch.tensor(cos)) <= ub))].tolist()
      top_cosine = cos[torch.where((torch.absolute(torch.tensor(cos)) >= lb) & (torch.absolute(torch.tensor(cos)) <= ub))].tolist() # cosine values

### Take the list of indices to draw items from the pool
      screened_items = [item_pool[int(j)] for j in top_k]
      final_items.extend(screened_items)

### Round list of cosine values for legibility
      top_cosine = np.round(top_cosine,4).tolist()
      final_cosines.extend(top_cosine)
### Test output?
# [si + " : " + str(tc) for (si, tc) in zip(screened_items, top_cosine)] 
    return screened_items, top_cosine


#%%




While the same generator and sentence encoder is used across all uses in this script, you may specify different models if you'd like to use on different scales, or even for differently keyed items within a given scale.

In [9]:
for idx, key in enumerate(item_dict.keys()):
    
    if regex.search('Honesty',key):
        item_dict[key]['pos_decoder'] = gpt2
        item_dict[key]['neg_decoder'] = gpt2
        item_dict[key]['sent_encoder'] = sts_encoder

    if regex.search('^Emotionality',key):
        item_dict[key]['pos_decoder'] = gpt2
        item_dict[key]['neg_decoder'] = gpt2
        item_dict[key]['sent_encoder'] = sts_encoder

    if regex.search('^Extraversion',key):
        item_dict[key]['pos_decoder'] = gpt2
        item_dict[key]['neg_decoder'] = gpt2
        item_dict[key]['sent_encoder'] = sts_encoder

    if regex.search('^Agreeableness',key):
        item_dict[key]['pos_decoder'] = gpt2
        item_dict[key]['neg_decoder'] = gpt2
        item_dict[key]['sent_encoder'] = sts_encoder

    if regex.search('^Conscientiousness',key):
        item_dict[key]['pos_decoder'] = gpt2
        item_dict[key]['neg_decoder'] = gpt2
        item_dict[key]['sent_encoder'] = sts_encoder

    if regex.search('^Openness',key):
        item_dict[key]['pos_decoder'] = gpt2
        item_dict[key]['neg_decoder'] = gpt2
        item_dict[key]['sent_encoder'] = sts_encoder



Item Generation Block. For each generation occasion, a pair of items are randomly drawn from a given scale (regardless of their key) with replacement. A batch of items are collected cleaned, and appended to a dataframe.

In [11]:
full_df = pd.DataFrame()

import random

for idx, key in enumerate(item_dict.keys()): # Iterate over all Traits...
    item_list = item_dict[key]['+'].copy()
    item_list.extend(item_dict[key]['-'])

    prompt_txts = []
    #n_item_pairs = 5
    for n in range(0,n_item_pairs):
      
      i = 0
      j = 0
      while i == j: # ENSURE NO ITEM PAIRS ARE DUPLICATES...
        i = random.choices(range(0, len(item_list)), k=1)[0]
        j = random.choices(range(0, len(item_list)), k=1)[0]
        
      item_1 = item_list[i]
      item_2 = item_list[j]
      items = item_1 + ' ' + item_2 + ' ' + stem_txt
      prompt_txts.append(items)


    item_pool = []

    for prompt_txt in prompt_txts:
        seq_lengths=[len(regex.split(string=item_list[i], pattern= " ")) + 1 for i in range(0,len(item_list))]
        lower_limit=min(seq_lengths) 
        upper_limit=max(seq_lengths) + fluff
        # SEQ_LIMIT DEPENDS ON THE ITEMS USED IN PROMPT, PLUS A FLUFF CONSTANT!

        prompt_len = len(tokenizer(prompt_txt)['input_ids'])
        seq_lower_limit= lower_limit + prompt_len
        seq_upper_limit = upper_limit + prompt_len
        
        tokenized_prompt = tokenizer([prompt_txt],return_tensors='pt')
        for tokenizer_key in tokenized_prompt.keys():
            tokenized_prompt[tokenizer_key] = tokenized_prompt[tokenizer_key].to(cuda)

        while True:
            try:                  
                new_seqs = get_sequence(gpt2,tokenized_prompt,word_limit=seq_upper_limit,
                word_lower_limit=seq_lower_limit, prohibited_ids = banned_token_idx, #n_beams=beam_search_n,
                n_gram=n_gram_rule,return_seqs=seqs_returned,p=beam_sample)
                break
            except UnboundLocalError:
                print(f'Increasing seq_limit for {key} pos_decoder.')
            finally:     
                seq_upper_limit+=1


        cleaned_seqs = clean_sequence(new_seqs)
        
        sts_cosine = convergent(seqs = cleaned_seqs, curr_key = key, item_list=item_list, sentence_transformer=sts_encoder)
        distinctiveness = discriminant(seqs=cleaned_seqs, curr_key=key, item_list=item_list, sentence_transformer=paraphraser)
        

        curr_df = pd.DataFrame({"Factor":key, "Item":cleaned_seqs, "STS Cosine":sts_cosine, "Distinct Cosine":distinctiveness, "Prompt":np.repeat(prompt_txt,  [len(cleaned_seqs)], axis=0)}).drop_duplicates()
        full_df = pd.concat([full_df, curr_df], axis=0).drop_duplicates()


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

The cells below calculate some descriptive statistics that represent the semantic textual similarity (STS) of certain items compared to others (see code for the functions "convergent", "discriminant", and "internal_consistency" above).

Although these statistics were not used to filter out or select specific items, they do appear informative of how an item relates to its prompts, as well as the other items generated, and the apparent key of items generated. 

In [12]:
internal_consistency_df = pd.DataFrame()
for key in item_dict.keys():

    factor_items = full_df[full_df.Factor.eq(key)].iloc[:,1].to_list()
    
    curr_ic = internal_consistency(item_list=factor_items, sentence_transformer = paraphraser)
    curr_df = pd.DataFrame({'Factor':key,'Item':factor_items, 'Internal Consistency': curr_ic.tolist()})
    internal_consistency_df = pd.concat([internal_consistency_df, curr_df], axis=0)


full_df = full_df.merge(internal_consistency_df, on=['Factor', 'Item'])


In [13]:
final_pos_df = pd.DataFrame()
final_neg_df = pd.DataFrame()
for key in item_dict.keys():
  
  curr_list = full_df[full_df['Factor']==key]['Item'].to_list()
  
  
  clip_prop = .0001

  curr_df = pd.DataFrame()
  embed_seqs = sts_encoder.encode(curr_list)
  embed_items = sts_encoder.encode(item_dict[key]['+'])
  cos_sim = util.pytorch_cos_sim(embed_seqs, embed_items)
  keep_pos_items = torch.where(cos_sim > torch.quantile(torch.median(cos_sim,1).values,clip_prop))[0].tolist()
  pos_med_cos = torch.median(cos_sim,1).values.tolist()
  
  curr_df = pd.DataFrame({'Factor':key,'Item':[curr_list[int(item)] for item in keep_pos_items], 'Pos Cosine':[pos_med_cos[int(item)] for item in keep_pos_items]})
  final_pos_df = pd.concat([final_pos_df, curr_df], axis=0)

  curr_df = pd.DataFrame()
  embed_items = sts_encoder.encode(item_dict[key]['-'])
  cos_sim = util.pytorch_cos_sim(embed_seqs, embed_items)
  keep_neg_items = torch.where(cos_sim > torch.quantile(torch.median(cos_sim,1).values,clip_prop))[0].tolist()
  neg_med_cos = torch.median(cos_sim,1).values.tolist()
  
  curr_df = pd.DataFrame({'Factor':key,'Item':[curr_list[int(item)] for item in keep_neg_items], 'Neg Cosine':[neg_med_cos[int(item)] for item in keep_neg_items]})
  final_neg_df = pd.concat([final_neg_df, curr_df], axis=0)

In [14]:
final_df = final_pos_df.merge(final_neg_df, on = ['Factor', 'Item'])
final_df = final_df.merge(full_df, on = ['Factor', 'Item'])
final_df['Cos Diff'] = final_df['Pos Cosine'] - final_df['Neg Cosine']
final_df = final_df.drop_duplicates(subset='Item')

In [16]:
#final_df2 = final_df.copy()
internal_df = pd.DataFrame()
for key in item_dict.keys():
  item_list = final_df[final_df['Factor'] == key]['Item'].to_list()
  sts_cosine = convergent(seqs = item_list, curr_key = key, item_list=item_list, sentence_transformer=sts_encoder)
  distinctiveness = discriminant(seqs=item_list, curr_key=key, item_list=item_list, sentence_transformer=paraphraser)

  curr_df = pd.DataFrame({'Factor':key,'Item':item_list, 'Internal STS':sts_cosine.tolist(),'Internal Distinct':distinctiveness.tolist()})
  internal_df = pd.concat([internal_df, curr_df], axis=0)


In [17]:
final_df = final_df.merge(right=internal_df, on=['Factor','Item'])

Export results

In [None]:
from google.colab import files

final_df.to_excel(export_path)
files.download(export_path)

In [18]:
final_df

Unnamed: 0,Factor,Item,Pos Cosine,Neg Cosine,STS Cosine,Distinct Cosine,Prompt,Internal Consistency,Cos Diff,Internal STS,Internal Distinct
0,Honesty_Humility,I have the same kind-heartedness in me as othe...,0.274561,0.248132,0.274445,-0.044524,I would get a lot of pleasure from owning expe...,0.2765,0.026429,0.257434,-0.009495
1,Honesty_Humility,I think that people have to accept my opinions...,0.170223,0.175738,0.175738,-0.022455,I would get a lot of pleasure from owning expe...,0.2139,-0.005515,0.181646,-0.026862
2,Honesty_Humility,I can only imagine what my life will be like i...,0.120150,0.211231,0.129368,0.040330,I would get a lot of pleasure from owning expe...,0.2152,-0.091081,0.198688,0.048458
3,Honesty_Humility,I have to be prepared for all the hardships th...,0.233918,0.367388,0.343129,0.090257,I would get a lot of pleasure from owning expe...,0.2147,-0.133471,0.229444,0.023206
4,Honesty_Humility,I have always been lucky to be in the business...,0.258920,0.355786,0.325142,0.151571,I would get a lot of pleasure from owning expe...,0.2072,-0.096866,0.224660,0.046566
...,...,...,...,...,...,...,...,...,...,...,...
4697,Openness to Experience,I can't imagine what the next day will entail.,0.189039,0.136531,0.185055,0.055843,I’ve never really enjoyed looking through an e...,0.1735,0.052508,0.221368,0.081211
4698,Openness to Experience,I just want the best for my children.,0.300048,0.169354,0.233278,-0.011982,I’ve never really enjoyed looking through an e...,0.2394,0.130694,0.329442,0.060241
4699,Openness to Experience,I'd have the same problem with reading books a...,0.020410,0.292826,0.220799,0.053008,I’ve never really enjoyed looking through an e...,0.2118,-0.272416,0.139491,0.032895
4700,Openness to Experience,I don't think that's the case with these things.,0.008531,0.206919,0.080847,0.012990,I’ve never really enjoyed looking through an e...,0.2836,-0.198387,0.122779,0.080835
