In [1]:
import sys
if 'google.colab' in sys.modules:
    # Installing packages in Google Colab environment
    !pip install datasets transformers

    # Mounting google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Changing working directory to ex1
    %cd /content/drive/MyDrive/LLM4behavior_workshop/ex2

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [3]:
# Loading data with pandas
neo_items =  pd.read_csv('NEO_items.csv', usecols=['construct', 'text'])
neo_items

Unnamed: 0,construct,text
0,Achievement-Striving,Go straight for the goal.
1,Achievement-Striving,Plunge into tasks with all my heart.
2,Achievement-Striving,Demand quality.
3,Achievement-Striving,Set high standards for myself and others.
4,Achievement-Striving,Turn plans into actions.
...,...,...
295,Vulnerability,Remain calm under pressure.
296,Vulnerability,Am calm even in tense situations.
297,Vulnerability,Can handle complex problems.
298,Vulnerability,Readily overcome setbacks.


In [4]:
# Converting into a HuggingFace dataset
dat = Dataset.from_pandas(neo_items)
dat

Dataset({
    features: ['construct', 'text'],
    num_rows: 300
})

In [5]:
# Loading the tokenizer
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
print(f'Vocabulary size: {tokenizer.vocab_size}, max context length: {tokenizer.model_max_length}')

Vocabulary size: 30522, max context length: 512


In [6]:
# Tokenizing the text
batch_tokenizer = lambda x: tokenizer(x['text'], padding=True, truncation=True)
dat = dat.map(batch_tokenizer, batched=True, batch_size=None)
print(tokenizer.decode(dat[0]['input_ids']))
dat

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

[CLS] go straight for the goal. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


Dataset({
    features: ['construct', 'text', 'input_ids', 'attention_mask'],
    num_rows: 300
})

In [7]:
# Setting the format of the dataset to torch tensors for passing to the model
dat.set_format('torch', columns=['input_ids', 'attention_mask'])
dat

Dataset({
    features: ['construct', 'text', 'input_ids', 'attention_mask'],
    num_rows: 300
})

In [8]:
dat[('hidden_state')].shape

KeyError: "Column hidden_state not in the dataset. Current columns in the dataset: ['construct', 'text', 'input_ids', 'attention_mask']"

# Feature extraction

In [None]:
import torch
from transformers import AutoModel
torch.manual_seed(42) # For reproducibility

In [None]:
# Loading the model and moving it to the GPU if available
if torch.cuda.is_available():  # for nvidia GPUs
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # for Apple Metal Performance Sharder (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

In [None]:
# Loading the model
model = AutoModel.from_pretrained(model_ckpt).to(device)
f'Model inputs: {tokenizer.model_input_names}'

In [None]:
def extract_features(batch):
    """Extract features from a batch of items"""
    inputs = {k:v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
        return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}


dat = dat.map(extract_features, batched=True, batch_size=8)
dat

# Comparing predicted and observed construct similarities

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Converting the hidden state into a data frame for easy manipulation
embeds = pd.DataFrame(dat['hidden_state'])
embeds

In [None]:
# Adding the construct that each embedding represents
embeds['construct'] = neo_items['construct']

# Calculating the mean embedding for each construct
construct_embeds = embeds.groupby('construct').mean()
construct_embeds

In [None]:
# Calculating the cosine similarity between construct embeddings
sims = pd.DataFrame(
    cosine_similarity(construct_embeds), # cosine similarity between each pair of rows
    index=construct_embeds.index, # row names
    columns=construct_embeds.index # column names
)
sims

In [None]:
# Loading observed correlations and pivoting to a correlation matrix
sims_observed = pd.read_csv('NEO_correlations.csv')
sims_observed

In [None]:
# Pivoting to a correlation matrix for easy comparison with predicted correlations
sims_observed = sims_observed.pivot(index='construct_1', columns='construct_2', values='correlation')
sims_observed

In [None]:
# Aligning rows and columns the predicted and observed correlations
sims, sims_observed = sims_observed.align(sims_observed)


def lower_triangle_flat(df):
    """Takes the lower triangle of a dataframe and flattens it into a vector"""
    rows, cols = np.triu_indices(len(df), k=1)  # k=1 to exclude the diagonal (self-similarities)
    return pd.Series(df.values[rows, cols])


sims, sims_observed = lower_triangle_flat(sims), lower_triangle_flat(sims_observed)

# Correlation between predicted and observed
print(f'r: {sims.corr(sims_observed).round(2)}')
print(f'r of absolute values: {sims.abs().corr(sims_observed.abs()).round(2)}')

# Feature extraction pipeline