In [1]:
import sys
if 'google.colab' in sys.modules:
    # Installing packages in Google Colab environment
    !pip install datasets transformers

    # Mounting google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Changing working directory to personality
    %cd /content/drive/MyDrive/LLM4BeSci/personality

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [2]:
# Loading data with pandas
neo_items =  pd.read_csv('NEO_items.csv', usecols=['construct', 'text'])
neo_items

Unnamed: 0,construct,text
0,Achievement-Striving,Go straight for the goal.
1,Achievement-Striving,Plunge into tasks with all my heart.
2,Achievement-Striving,Demand quality.
3,Achievement-Striving,Set high standards for myself and others.
4,Achievement-Striving,Turn plans into actions.
...,...,...
295,Vulnerability,Remain calm under pressure.
296,Vulnerability,Am calm even in tense situations.
297,Vulnerability,Can handle complex problems.
298,Vulnerability,Readily overcome setbacks.


In [3]:
# Converting into a HuggingFace dataset
dat = Dataset.from_pandas(neo_items)
dat

Dataset({
    features: ['construct', 'text'],
    num_rows: 300
})

In [4]:
# Loading the tokenizer
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
print(f'Vocabulary size: {tokenizer.vocab_size}, max context length: {tokenizer.model_max_length}')

Vocabulary size: 30522, max context length: 512


In [7]:
# Tokenizing the text
batch_tokenizer = lambda x: tokenizer(x['text'], padding=True, truncation=True)
dat = dat.map(batch_tokenizer, batched=True, batch_size=None)
print([tokenizer.decode(id) for id in dat['input_ids'][0]])

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

['[CLS]', 'go', 'straight', 'for', 'the', 'goal', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [8]:
# Setting the format of the dataset to torch tensors for passing to the model
dat.set_format('torch', columns=['input_ids', 'attention_mask'])
dat

Dataset({
    features: ['construct', 'text', 'input_ids', 'attention_mask'],
    num_rows: 300
})

# Feature extraction

In [17]:
import torch
from transformers import AutoModel

In [10]:
# Loading the model and moving it to the GPU if available
if torch.cuda.is_available():  # for nvidia GPUs
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # for Apple Metal Performance Sharder (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='mps')

In [22]:
# Loading the model
model = AutoModel.from_pretrained('bert-base-uncased').to(device)
f'Model inputs: {tokenizer.model_input_names}'

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


"Model inputs: ['input_ids', 'attention_mask']"

In [13]:
def extract_features(batch):
    """Extract features from a batch of items"""
    inputs = {k:v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
        return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}


dat = dat.map(extract_features, batched=True, batch_size=8)
dat

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [11]:
dat['input_ids'].shape

torch.Size([300, 16])

# Comparing predicted and observed construct similarities

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [17]:
# Converting the hidden state into a data frame for easy manipulation
features = pd.DataFrame(dat['hidden_state'])
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.101577,-0.232342,0.053150,-0.209234,-0.147043,-0.446292,0.199789,0.281034,0.125232,-0.613696,...,0.044836,0.017810,0.095211,-0.113162,-0.015173,0.144610,-0.213362,-0.020966,0.347819,0.403197
1,-0.029325,-0.013542,-0.146410,-0.170675,-0.168257,-0.195446,0.282303,0.277026,0.012995,-0.397131,...,0.163580,-0.074748,0.106048,-0.131116,0.277533,-0.158600,-0.067269,-0.130695,0.299019,0.183073
2,-0.123460,-0.163301,-0.039022,0.061186,0.029218,0.047689,-0.021713,0.339821,-0.140700,-0.464206,...,0.160522,-0.311289,0.193131,-0.157316,0.000914,-0.016334,-0.061327,-0.302192,0.101102,0.261451
3,0.171672,-0.080539,-0.324507,-0.123403,-0.002906,-0.286729,0.328532,0.331734,-0.043537,-0.253172,...,0.071976,-0.158575,0.220341,-0.259936,0.323301,-0.199359,-0.065793,-0.017549,0.293964,0.210275
4,-0.011405,-0.273886,-0.275290,-0.039883,-0.059297,-0.349610,0.179417,0.395709,0.137042,-0.603452,...,0.039848,-0.206626,-0.002964,-0.014211,0.193043,0.076054,0.014883,-0.220791,0.123017,0.311863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.053610,-0.207244,-0.021129,-0.324069,0.030456,-0.204914,0.296093,0.273525,0.266422,-0.575218,...,0.317866,-0.407704,0.310368,-0.065386,0.138820,-0.249926,0.007644,-0.194356,-0.014655,0.453587
296,-0.142525,-0.113036,-0.115370,-0.241582,-0.054642,-0.213681,0.443018,0.357535,0.210813,-0.495573,...,0.035589,-0.360558,0.319993,-0.275261,0.140480,-0.021844,-0.020103,-0.103788,0.226172,0.270764
297,-0.456889,-0.435810,-0.229935,0.023752,-0.163952,-0.391314,0.149716,0.373441,0.096713,-0.571043,...,0.229077,-0.370328,0.303710,-0.193726,0.087309,0.007515,0.016861,-0.389869,-0.153120,0.362962
298,-0.156440,-0.067095,-0.164803,0.055055,0.044017,-0.371144,0.238250,0.062213,0.249838,-0.599417,...,0.185703,-0.331456,0.035973,-0.143764,0.141534,-0.062924,-0.108850,-0.206454,0.068723,0.168381


In [23]:
# Calculating the cosine similarity between construct embeddings
sims = pd.DataFrame(
    cosine_similarity(features),
    index=neo_items['text'],
    columns=neo_items['text'],
)
sims

text,Go straight for the goal.,Plunge into tasks with all my heart.,Demand quality.,Set high standards for myself and others.,Turn plans into actions.,Do more than what's expected of me.,Work hard.,Do just enough work to get by.,Am not highly motivated to succeed.,Put little time and effort into my work.,...,Panic easily.,Get overwhelmed by emotions.,Feel that I'm unable to deal with things.,Can't make up my mind.,Become overwhelmed by events.,Remain calm under pressure.,Am calm even in tense situations.,Can handle complex problems.,Readily overcome setbacks.,Know how to cope.
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Go straight for the goal.,1.000000,0.937292,0.912419,0.937389,0.937179,0.941870,0.945306,0.953538,0.913815,0.935873,...,0.936882,0.933743,0.922841,0.923830,0.869716,0.893923,0.896983,0.859028,0.905520,0.944151
Plunge into tasks with all my heart.,0.937292,1.000000,0.934913,0.969671,0.942819,0.978878,0.964293,0.962596,0.951278,0.973330,...,0.964064,0.957841,0.976752,0.970222,0.896999,0.875516,0.920864,0.855678,0.913151,0.969265
Demand quality.,0.912419,0.934913,1.000000,0.947667,0.928942,0.938465,0.949418,0.943863,0.916180,0.938434,...,0.934006,0.923044,0.930710,0.928840,0.857612,0.880098,0.910788,0.891799,0.936942,0.945570
Set high standards for myself and others.,0.937389,0.969671,0.947667,1.000000,0.949412,0.969233,0.966870,0.962684,0.947640,0.975383,...,0.943541,0.948249,0.955413,0.949402,0.889186,0.905084,0.933213,0.875336,0.922472,0.973282
Turn plans into actions.,0.937179,0.942819,0.928942,0.949412,1.000000,0.950606,0.938060,0.942705,0.924114,0.951295,...,0.920239,0.962744,0.933894,0.917970,0.940487,0.920888,0.925302,0.905540,0.940013,0.947410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Remain calm under pressure.,0.893923,0.875516,0.880098,0.905084,0.920888,0.888300,0.897367,0.883605,0.878887,0.893301,...,0.853544,0.914012,0.874819,0.853221,0.896863,1.000000,0.930315,0.867545,0.895987,0.901234
Am calm even in tense situations.,0.896983,0.920864,0.910788,0.933213,0.925302,0.922115,0.923240,0.906764,0.941968,0.926968,...,0.887088,0.926703,0.918987,0.894791,0.900692,0.930315,1.000000,0.889343,0.919258,0.924744
Can handle complex problems.,0.859028,0.855678,0.891799,0.875336,0.905540,0.870966,0.864420,0.866394,0.852616,0.871043,...,0.839746,0.881715,0.852122,0.845022,0.866255,0.867545,0.889343,1.000000,0.929644,0.875341
Readily overcome setbacks.,0.905520,0.913151,0.936942,0.922472,0.940013,0.922284,0.912404,0.916042,0.910178,0.921127,...,0.900938,0.915672,0.901567,0.885332,0.894593,0.895987,0.919258,0.929644,1.000000,0.917696


In [24]:
# Loading observed correlations and pivoting to a correlation matrix
sims_observed = pd.read_csv('item_corrs.csv')
sims_observed

Unnamed: 0,text_i,text_j,cor
0,Worry about things.,Worry about things.,1.000000
1,Make friends easily.,Worry about things.,-0.092088
2,Have a vivid imagination.,Worry about things.,0.011413
3,Trust others.,Worry about things.,-0.122167
4,Complete tasks successfully.,Worry about things.,-0.052228
...,...,...,...
89995,Am calm even in tense situations.,Often make last-minute plans.,0.031644
89996,Seldom joke around.,Often make last-minute plans.,-0.143314
89997,Like to stand during the national anthem.,Often make last-minute plans.,-0.023413
89998,Can't stand weak people.,Often make last-minute plans.,0.038725


In [25]:
# Pivoting to a correlation matrix for easy comparison with predicted correlations
sims_observed = sims_observed.pivot(index='text_i', columns='text_j', values='cor')
sims_observed

text_j,Act comfortably with others.,Act wild and crazy.,Act without thinking.,Adapt easily to new situations.,Am a creature of habit.,Am able to control my cravings.,Am able to stand up for myself.,Am afraid of many things.,Am afraid that I will do the wrong thing.,Am afraid to draw attention to myself.,...,Want everything to be just right.,Want to be left alone.,Warm up quickly to others.,Waste my time.,Willing to try anything once.,Work hard.,Worry about things.,Would never cheat on my taxes.,Would never go hang gliding or bungee jumping.,Yell at people.
text_i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Act comfortably with others.,1.000000,0.217360,0.012991,-0.430405,0.101136,-0.104918,-0.303101,-0.245115,-0.229300,-0.393090,...,-0.021545,0.407432,0.519459,0.193105,0.158005,0.162693,-0.162281,0.029572,0.124640,0.090932
Act wild and crazy.,0.217360,1.000000,-0.421215,-0.177011,0.134400,0.101634,-0.126245,-0.028102,-0.040210,-0.276224,...,-0.070427,0.175862,0.213489,-0.113697,0.317553,-0.097946,-0.076365,-0.115061,0.294004,-0.230751
Act without thinking.,0.012991,-0.421215,1.000000,-0.023407,-0.024389,-0.240195,-0.072953,-0.154781,-0.133205,0.055036,...,0.047438,0.012868,-0.069470,0.308881,-0.188441,0.217708,-0.050914,0.132766,-0.137169,0.327725
Adapt easily to new situations.,-0.430405,-0.177011,-0.023407,1.000000,-0.222098,0.153603,0.343191,0.359561,0.278076,0.328830,...,0.105141,-0.249252,-0.314628,-0.169759,-0.257563,-0.143309,0.260600,0.019874,-0.192907,-0.120178
Am a creature of habit.,0.101136,0.134400,-0.024389,-0.222098,1.000000,-0.065602,-0.100340,-0.169118,-0.155461,-0.156853,...,-0.222918,0.140241,0.077795,0.057557,0.132251,-0.029368,-0.163692,-0.042522,0.154302,0.049869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Work hard.,0.162693,-0.097946,0.217708,-0.143309,-0.029368,-0.168711,-0.162225,-0.115883,-0.075400,-0.040886,...,0.160276,0.106788,0.094098,0.422429,0.002044,1.000000,0.027527,0.159320,-0.016483,0.113103
Worry about things.,-0.162281,-0.076365,-0.050914,0.260600,-0.163692,0.144669,0.204268,0.431686,0.408099,0.198018,...,0.239192,-0.107536,-0.078245,-0.074586,-0.107065,0.027527,1.000000,0.051653,-0.143831,-0.136540
Would never cheat on my taxes.,0.029572,-0.115061,0.132766,0.019874,-0.042522,-0.103727,0.021737,0.030539,0.060323,0.083540,...,0.072984,0.056801,0.049876,0.137484,-0.088864,0.159320,0.051653,1.000000,-0.067971,0.115916
Would never go hang gliding or bungee jumping.,0.124640,0.294004,-0.137169,-0.192907,0.154302,-0.043246,-0.126138,-0.192227,-0.083722,-0.163592,...,-0.083874,0.119223,0.097164,-0.009435,0.352464,-0.016483,-0.143831,-0.067971,1.000000,-0.011130


In [26]:
# Aligning rows and columns the predicted and observed correlations
sims, sims_observed = sims.align(sims_observed)


def lower_triangle_flat(df):
    """Takes the lower triangle of a dataframe and flattens it into a vector"""
    rows, cols = np.triu_indices(len(df), k=1)  # k=1 to exclude the diagonal (self-similarities)
    return pd.Series(df.values[rows, cols])


sims, sims_observed = lower_triangle_flat(sims), lower_triangle_flat(sims_observed)

# Correlation between predicted and observed
print(f'r: {sims.corr(sims_observed).round(2)}')
print(f'r of absolute values: {sims.abs().corr(sims_observed.abs()).round(2)}')

r: 0.05
r of absolute values: 0.14
