Install Libraries

In [None]:
#install huggingface
!pip install transformers

#install sentencepiece (tokenizer used by some language models---GPT, DeBERTa V2)
!pip install sentencepiece
!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

#install the version of sklearn that supports varimax rotation in factor analysis
!pip uninstall scikit-learn -y
!pip install -U scikit-learn

Import Libraries

In [1]:
# Standard Library
import os

# Deep Learning / NLP
import torch
from transformers import AutoModel, AutoTokenizer #AutoModelForMaskedLM

# Basic Operations
import numpy as np
import pandas as pd

# Plotting Results
import seaborn as sns
import matplotlib.pyplot as plt

# Matrix Factorization
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis, KernelPCA

Helper Functions

In [2]:
# Model Loading Function
def load_model(model_name="roberta"):
   """Load a pretrained model and its tokenizer from the `models` registry.

   Args:
       model_name: Key into the module-level `models` dict. Element [0] of the
           entry is used as the model checkpoint and element [1] as the
           tokenizer checkpoint. Note that the default, "roberta", is not one
           of the keys defined in this notebook's `models` dict, so callers
           must pass a registered key.

   Returns:
       A `(model, tokenizer, mask_token)` tuple. The model has been moved to
       CUDA when available (CPU otherwise); `mask_token` is the tokenizer's
       mask token string.

   Raises:
       KeyError: If `model_name` is not a key of `models`. The message lists
           the available keys.
   """
   if model_name not in models:
       # Fail with an actionable message instead of a bare KeyError: 'roberta'.
       raise KeyError(
           f"Unknown model_name {model_name!r}; available keys: {sorted(models)}"
       )
   entry = models[model_name]

   model = AutoModel.from_pretrained(entry[0], output_attentions=False)
   tokenizer = AutoTokenizer.from_pretrained(entry[1], use_fast=False)

   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
   model.to(device)

   # Grab the mask token so prompts can be built with the right placeholder
   mask_token = tokenizer.mask_token

   return model, tokenizer, mask_token

In [3]:
# Embedding Functions
def embed_w_batches(sentences, tokenizer, model, device, mask=True, verbose=False, batch_size=256):
   """Embed `sentences` in consecutive slices of `batch_size` and stack the results.

   Each slice is passed to `embed_sentences` together with `tokenizer`, `model`,
   `device`, `mask` and `verbose`; the per-slice rows are concatenated in input
   order and returned as a single NumPy array. When `verbose` is true, the
   starting offset of every slice is printed.
   """
   # Release cached GPU memory before starting
   torch.cuda.empty_cache()

   collected = []
   for start in range(0, len(sentences), batch_size):
       if verbose:
           print(f"Sample: {start}")
       batch = sentences[start:start + batch_size]
       batch_vectors = embed_sentences(batch, tokenizer, model, device, mask=mask, verbose=verbose)
       collected.extend(batch_vectors)

   return np.array(collected)

def embed_sentences(sentences, tokenizer, model, device, mask=True, verbose=False):
   """Embed one batch of sentences with the model's last hidden state.

   Args:
       sentences: List of input strings for a single batch.
       tokenizer: Callable tokenizer; called with `return_tensors="pt"` and
           `padding=True`. Its `mask_token_id` is used to locate mask positions.
       model: Callable model whose output exposes `last_hidden_state`,
           indexed here as (batch, position, hidden).
       device: Device the tokenized inputs are moved to.
       mask: If truthy, each sentence is represented by the mean of the hidden
           states at its mask-token positions. Otherwise the hidden state at
           position 0 is used.
       verbose: Accepted for signature compatibility; not used here.

   Returns:
       A NumPy array with one row per input sentence, shape (batch, hidden).
       A sentence containing no mask token yields a row of NaN when `mask` is
       truthy (mean over zero positions).
   """
   inputs = tokenizer(sentences, return_tensors="pt", padding=True).to(device)

   # Inference only: disable autograd so no graph is kept alive for the batch.
   with torch.no_grad():
       outputs = model(**inputs)
   hidden = outputs.last_hidden_state.cpu().detach().numpy()

   if mask:
       mask_idx = (inputs["input_ids"] == tokenizer.mask_token_id).cpu().numpy()
       # Averaging over the selected positions handles one or many mask tokens
       # uniformly and always yields a (hidden,) vector per sentence.
       embedding = np.array(
           [hidden[row, mask_idx[row], :].mean(axis=0) for row in range(hidden.shape[0])]
       )
   else:
       mask_idx = None
       # No squeeze: a one-sentence batch must stay 2-D so callers that iterate
       # over rows still receive one vector per sentence.
       embedding = hidden[:, 0, :]

   # clean up the memory on GPU
   del inputs, outputs, mask_idx
   torch.cuda.empty_cache()

   return embedding

Load Language Models

In [4]:
# Registry of supported checkpoints, keyed by a short model name.
# load_model() reads element [0] of an entry as the model checkpoint and
# element [1] as the tokenizer checkpoint. Some entries carry a third integer
# that load_model() does not read -- presumably a suggested batch size; TODO confirm.
models = {
   "bert": ("bert-base-uncased", "bert-base-uncased"),
   "deberta": ("microsoft/deberta-large", "microsoft/deberta-large"),
   "bart" : ["facebook/bart-large-mnli", "facebook/bart-large-mnli", 256], #(only give one mask token)
   "deberta-l-mnli" : ["Narsil/deberta-large-mnli-zero-cls", "Narsil/deberta-large-mnli-zero-cls", 128]
}

# Pick the registry entry and resolve the device used by the embedding calls below
model_name = "deberta-l-mnli"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download (or load from the local cache) the model, its tokenizer, and the mask token
model, tokenizer, mask_token = load_model(model_name)



Load SETPOINT Dimensions

*Note:* Instead of defining SETPOINT with dimension names, we utilized a short description of each dimension to provide a clearer explanation to language models. 

In [5]:
# SETPOINT Dimension Description
# One short natural-language description per dimension. Each string is inserted
# into the dimension prompt and later used as the row label of DM_vectors_df.
# NOTE(review): the order appears to mirror the keys of `basic_interest`
# (Health Science ... Things) -- confirm before relying on positional alignment.
dimensions = [
   "the application of science to life, medicine, and health", 
   "the expression of imaginative and creative ideas", 
   "problem-solving, innovation, and creation of technology", 
   "working with, helping people, and understanding people", 
   "planning and organizing in structured business environments", 
   "leading, persuading, and influencing other people", 
   "agriculture, outdoors, and nature",
   "mechanical, hands-on, and physical activities"
]

Dimension Query

In [6]:
# Build one masked prompt per dimension description; the two mask tokens mark
# the positions whose hidden states are averaged into the dimension vector.
DM_query = [
   f"A career in {mask_token} {mask_token} is aligned with a general interest in activities that involve {category}."
   for category in dimensions
]

In [7]:
# Embed the dimension prompts in batches of 64, printing batch offsets as it goes
DM_vectors = embed_w_batches(
   DM_query,
   tokenizer,
   model,
   device,
   verbose=True,
   batch_size=64,
)

Sample: 0


In [8]:
# Wrap the dimension vectors in a DataFrame labelled by dimension description
DM_vectors_df = pd.DataFrame(data=DM_vectors, index=dimensions)

Load SETPOINT Basic Interests

In [9]:
# SETPOINT Basic Interests, grouped under the dimension each one belongs to
basic_interest = {
   "Health Science": [
       "life science", "medical science", "health care service",
   ],
   "Creative Expression": [
       "media", "applied arts and design", "music", "visual arts",
       "performing arts", "creative writing", "culinary art",
   ],
   "Technology": [
       "engineering", "physical science", "information technology",
       "mathematics or statistics",
   ],
   "People": [
       "social science", "humanities or foreign language", "teaching or education",
       "social service", "religious activities",
   ],
   "Organization": [
       "human resources", "personal service", "accounting", "office work", "finance",
   ],
   "Influence": [
       "management or administration", "business initiatives", "marketing or advertising",
       "professional advising", "public speaking", "sales", "politics", "law",
   ],
   "Nature": [
       "agriculture", "outdoors", "animal service",
   ],
   "Things": [
       "mechanics or electronics", "transportation or machine operation",
       "construction or woodwork", "physical or manual labor", "athletics",
       "protective service",
   ],
}

# Single flat list of every descriptor, in dictionary order
basic_interests = [
   item
   for group in basic_interest.values()
   for item in group
]

Basic Interest Query

In [10]:
# Defining Query for Basic Interests
# Use the loaded tokenizer's mask token (as the dimension query does) rather than
# a hard-coded "[MASK]": for a model whose mask token differs (e.g. "<mask>" for
# BART) the literal would not be recognised as a mask and no mask positions would
# be found when embedding. The two tokens stay adjacent, so the prompt text is
# unchanged whenever mask_token is "[MASK]".
queries = [
   f"A career in {mask_token}{mask_token} is aligned with {descriptor} interests."
   for descriptor in basic_interests
]

In [11]:
# Embed the basic-interest prompts in batches of 64, printing batch offsets as it goes
BI_vectors = embed_w_batches(
   queries,
   tokenizer,
   model,
   device,
   verbose=True,
   batch_size=64,
)

Sample: 0


In [12]:
# Convert Basic Interest Vectors to Dataframe
BI_vectors_df = pd.DataFrame(BI_vectors, index = basic_interests)

# Output directory: set the SETPOINT_DATA_DIR environment variable to redirect
# the export on another machine. The fallback is the original author's local
# directory, so existing runs keep writing to the same file as before.
data_dir = os.environ.get(
   "SETPOINT_DATA_DIR",
   "/Users/annikawei/Desktop/work/vocational interest project/data",
)
# Create the directory if needed so to_csv does not fail on a missing folder
os.makedirs(data_dir, exist_ok=True)

csv_file_path = os.path.join(data_dir, "bi_vectors.csv")
BI_vectors_df.to_csv(csv_file_path)

print(f"Vectors have been saved to {csv_file_path}")

Vectors have been saved to /Users/annikawei/Desktop/work/vocational interest project/data/bi_vectors.csv
