In [None]:
import sys
if 'google.colab' in sys.modules:
    # Installing packages in Google Colab environment
    !pip install datasets transformers

    # Mounting google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Changing working directory to ex1
    %cd /content/drive/MyDrive/LLM4BeSci/personality

## Loading data

In [1]:
import pandas as pd

In [2]:
# Reading the NEO items with pandas, keeping only the construct label and the item text
neo_items = pd.read_csv(
    'NEO_items.csv',
    usecols=['construct', 'item'],
)
neo_items

Unnamed: 0,construct,item
0,Achievement-Striving,Go straight for the goal.
1,Achievement-Striving,Plunge into tasks with all my heart.
2,Achievement-Striving,Demand quality.
3,Achievement-Striving,Set high standards for myself and others.
4,Achievement-Striving,Turn plans into actions.
...,...,...
295,Vulnerability,Remain calm under pressure.
296,Vulnerability,Am calm even in tense situations.
297,Vulnerability,Can handle complex problems.
298,Vulnerability,Readily overcome setbacks.


## Feature extraction

In [3]:
from transformers import pipeline
import torch

In [4]:
# Selecting the device to run on, in order of preference:
#   1. 'cuda' - NVIDIA GPUs
#   2. 'mps'  - Apple Metal Performance Shaders (mps) GPUs
#   3. 'cpu'  - fallback when neither backend reports as available
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

device

device(type='mps')

In [6]:
# Building the feature extraction pipeline: the same checkpoint provides both
# the model and its tokenizer, and inputs are processed in batches of 8
model_ckpt = 'distilbert-base-uncased'
feature_extractor = pipeline(
    task='feature-extraction',
    model=model_ckpt,
    tokenizer=model_ckpt,
    device=device,
    framework='pt',
    batch_size=8,
)

# Running every item text through the pipeline; padding and truncation are
# passed on as tokenization options, and tensors are requested as the output
item_texts = neo_items['item'].to_list()
features = feature_extractor(
    item_texts,
    return_tensors=True,
    tokenize_kwargs={'padding': True, 'truncation': True},
)
features

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[tensor([[[-0.1016, -0.2323,  0.0532,  ..., -0.0210,  0.3478,  0.4032],
          [ 0.0796, -0.4371,  0.0781,  ..., -0.1949,  0.3942,  0.5441],
          [ 0.4672, -0.4254,  0.1733,  ..., -0.2626,  0.0033,  0.0906],
          ...,
          [ 0.1062, -0.0311,  0.2063,  ...,  0.2131,  0.0617,  0.2878],
          [ 0.2026,  0.0310,  0.2395,  ...,  0.1445,  0.0900,  0.1349],
          [ 0.0946, -0.0251,  0.2154,  ...,  0.2539,  0.1087,  0.3186]]]),
 tensor([[[-0.0293, -0.0135, -0.1464,  ..., -0.1307,  0.2990,  0.1831],
          [-0.1950,  0.2925,  0.0331,  ..., -0.2528,  0.2721, -0.0752],
          [-0.4747,  0.1305, -0.0480,  ..., -0.4448,  0.1154, -0.2313],
          ...,
          [ 0.9559,  0.1278, -0.2636,  ...,  0.0458, -0.4735, -0.3454],
          [-0.0158,  0.1332,  0.0899,  ...,  0.0887, -0.0870,  0.1617],
          [ 0.0619,  0.2018,  0.0451,  ...,  0.1007, -0.0023,  0.1977]]]),
 tensor([[[-0.1235, -0.1633, -0.0390,  ..., -0.3022,  0.1011,  0.2615],
          [ 0.2037, -0.5305,

In [7]:
# Keeping, for every item, only the embedding at position 0 of the first
# sequence, i.e. the [CLS] token, as a numpy vector
cls_embeddings = []
for sample in features:
    cls_embeddings.append(sample[0][0].numpy())

# Stacking the vectors into a data frame (one row per item)
features = pd.DataFrame(cls_embeddings)
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.101577,-0.232341,0.053150,-0.209234,-0.147042,-0.446292,0.199789,0.281034,0.125232,-0.613696,...,0.044836,0.017810,0.095211,-0.113162,-0.015173,0.144610,-0.213362,-0.020967,0.347819,0.403197
1,-0.029326,-0.013542,-0.146410,-0.170675,-0.168257,-0.195445,0.282303,0.277026,0.012995,-0.397132,...,0.163581,-0.074748,0.106048,-0.131116,0.277533,-0.158600,-0.067269,-0.130695,0.299019,0.183073
2,-0.123460,-0.163300,-0.039022,0.061185,0.029218,0.047689,-0.021713,0.339821,-0.140701,-0.464206,...,0.160523,-0.311289,0.193131,-0.157316,0.000914,-0.016334,-0.061327,-0.302191,0.101102,0.261451
3,0.171672,-0.080539,-0.324507,-0.123403,-0.002906,-0.286730,0.328533,0.331734,-0.043537,-0.253172,...,0.071976,-0.158575,0.220342,-0.259936,0.323301,-0.199359,-0.065793,-0.017549,0.293964,0.210275
4,-0.011405,-0.273886,-0.275290,-0.039883,-0.059297,-0.349610,0.179417,0.395709,0.137041,-0.603452,...,0.039848,-0.206626,-0.002964,-0.014211,0.193043,0.076055,0.014883,-0.220791,0.123017,0.311863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.053610,-0.207244,-0.021129,-0.324069,0.030457,-0.204915,0.296093,0.273525,0.266422,-0.575218,...,0.317866,-0.407703,0.310368,-0.065386,0.138821,-0.249927,0.007644,-0.194356,-0.014654,0.453587
296,-0.142525,-0.113036,-0.115370,-0.241583,-0.054642,-0.213681,0.443018,0.357536,0.210813,-0.495573,...,0.035589,-0.360558,0.319993,-0.275262,0.140480,-0.021844,-0.020102,-0.103788,0.226173,0.270764
297,-0.456889,-0.435810,-0.229935,0.023752,-0.163952,-0.391314,0.149716,0.373441,0.096712,-0.571043,...,0.229076,-0.370328,0.303710,-0.193726,0.087308,0.007515,0.016861,-0.389869,-0.153120,0.362962
298,-0.156439,-0.067095,-0.164802,0.055055,0.044016,-0.371144,0.238249,0.062213,0.249837,-0.599417,...,0.185703,-0.331456,0.035973,-0.143764,0.141533,-0.062924,-0.108850,-0.206454,0.068723,0.168381


## Comparing predicted and observed item similarities

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [9]:
# Cosine similarity between each pair of rows of `features`,
# with the item texts labelling both the rows and the columns
item_labels = neo_items['item']
pairwise_cosine = cosine_similarity(features)
sims = pd.DataFrame(pairwise_cosine, index=item_labels, columns=item_labels)
sims

item,Go straight for the goal.,Plunge into tasks with all my heart.,Demand quality.,Set high standards for myself and others.,Turn plans into actions.,Do more than what's expected of me.,Work hard.,Do just enough work to get by.,Am not highly motivated to succeed.,Put little time and effort into my work.,...,Panic easily.,Get overwhelmed by emotions.,Feel that I'm unable to deal with things.,Can't make up my mind.,Become overwhelmed by events.,Remain calm under pressure.,Am calm even in tense situations.,Can handle complex problems.,Readily overcome setbacks.,Know how to cope.
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Go straight for the goal.,1.000000,0.937291,0.912419,0.937388,0.937178,0.941870,0.945305,0.953538,0.913815,0.935873,...,0.936882,0.933743,0.922841,0.923829,0.869716,0.893923,0.896983,0.859028,0.905520,0.944151
Plunge into tasks with all my heart.,0.937291,0.999999,0.934913,0.969671,0.942818,0.978878,0.964292,0.962596,0.951277,0.973330,...,0.964064,0.957840,0.976751,0.970222,0.896998,0.875516,0.920864,0.855678,0.913151,0.969264
Demand quality.,0.912419,0.934913,1.000000,0.947668,0.928942,0.938465,0.949418,0.943864,0.916180,0.938434,...,0.934007,0.923044,0.930711,0.928840,0.857612,0.880098,0.910788,0.891799,0.936942,0.945570
Set high standards for myself and others.,0.937388,0.969671,0.947668,1.000000,0.949412,0.969233,0.966870,0.962684,0.947639,0.975383,...,0.943541,0.948249,0.955413,0.949402,0.889187,0.905084,0.933213,0.875336,0.922472,0.973282
Turn plans into actions.,0.937178,0.942818,0.928942,0.949412,0.999999,0.950606,0.938060,0.942705,0.924113,0.951295,...,0.920239,0.962744,0.933894,0.917969,0.940487,0.920888,0.925301,0.905540,0.940013,0.947410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Remain calm under pressure.,0.893923,0.875516,0.880098,0.905084,0.920888,0.888300,0.897367,0.883605,0.878887,0.893301,...,0.853544,0.914011,0.874819,0.853221,0.896864,1.000000,0.930315,0.867545,0.895987,0.901234
Am calm even in tense situations.,0.896983,0.920864,0.910788,0.933213,0.925301,0.922115,0.923240,0.906764,0.941967,0.926968,...,0.887088,0.926703,0.918986,0.894791,0.900692,0.930315,1.000000,0.889343,0.919258,0.924743
Can handle complex problems.,0.859028,0.855678,0.891799,0.875336,0.905540,0.870966,0.864420,0.866394,0.852616,0.871043,...,0.839746,0.881715,0.852122,0.845022,0.866255,0.867545,0.889343,1.000000,0.929644,0.875341
Readily overcome setbacks.,0.905520,0.913151,0.936942,0.922472,0.940013,0.922285,0.912404,0.916042,0.910177,0.921128,...,0.900938,0.915672,0.901567,0.885332,0.894593,0.895987,0.919258,0.929644,1.000000,0.917696


In [10]:
# Reading the observed item correlations in long format
# (one row per item pair: text_i, text_j, cor)
sims_observed = pd.read_csv(filepath_or_buffer='item_corrs.csv')
sims_observed

Unnamed: 0,text_i,text_j,cor
0,Worry about things.,Worry about things.,1.000000
1,Make friends easily.,Worry about things.,-0.092088
2,Have a vivid imagination.,Worry about things.,0.011413
3,Trust others.,Worry about things.,-0.122167
4,Complete tasks successfully.,Worry about things.,-0.052228
...,...,...,...
89995,Am calm even in tense situations.,Often make last-minute plans.,0.031644
89996,Seldom joke around.,Often make last-minute plans.,-0.143314
89997,Like to stand during the national anthem.,Often make last-minute plans.,-0.023413
89998,Can't stand weak people.,Often make last-minute plans.,0.038725


In [11]:
# Reshaping from long format to a square matrix (rows: text_i, columns: text_j,
# cells: cor) so it has the same layout as the predicted similarities
sims_observed = pd.pivot(
    sims_observed,
    index='text_i',
    columns='text_j',
    values='cor',
)
sims_observed

text_j,Act comfortably with others.,Act wild and crazy.,Act without thinking.,Adapt easily to new situations.,Am a creature of habit.,Am able to control my cravings.,Am able to stand up for myself.,Am afraid of many things.,Am afraid that I will do the wrong thing.,Am afraid to draw attention to myself.,...,Want everything to be just right.,Want to be left alone.,Warm up quickly to others.,Waste my time.,Willing to try anything once.,Work hard.,Worry about things.,Would never cheat on my taxes.,Would never go hang gliding or bungee jumping.,Yell at people.
text_i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Act comfortably with others.,1.000000,0.217360,0.012991,-0.430405,0.101136,-0.104918,-0.303101,-0.245115,-0.229300,-0.393090,...,-0.021545,0.407432,0.519459,0.193105,0.158005,0.162693,-0.162281,0.029572,0.124640,0.090932
Act wild and crazy.,0.217360,1.000000,-0.421215,-0.177011,0.134400,0.101634,-0.126245,-0.028102,-0.040210,-0.276224,...,-0.070427,0.175862,0.213489,-0.113697,0.317553,-0.097946,-0.076365,-0.115061,0.294004,-0.230751
Act without thinking.,0.012991,-0.421215,1.000000,-0.023407,-0.024389,-0.240195,-0.072953,-0.154781,-0.133205,0.055036,...,0.047438,0.012868,-0.069470,0.308881,-0.188441,0.217708,-0.050914,0.132766,-0.137169,0.327725
Adapt easily to new situations.,-0.430405,-0.177011,-0.023407,1.000000,-0.222098,0.153603,0.343191,0.359561,0.278076,0.328830,...,0.105141,-0.249252,-0.314628,-0.169759,-0.257563,-0.143309,0.260600,0.019874,-0.192907,-0.120178
Am a creature of habit.,0.101136,0.134400,-0.024389,-0.222098,1.000000,-0.065602,-0.100340,-0.169118,-0.155461,-0.156853,...,-0.222918,0.140241,0.077795,0.057557,0.132251,-0.029368,-0.163692,-0.042522,0.154302,0.049869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Work hard.,0.162693,-0.097946,0.217708,-0.143309,-0.029368,-0.168711,-0.162225,-0.115883,-0.075400,-0.040886,...,0.160276,0.106788,0.094098,0.422429,0.002044,1.000000,0.027527,0.159320,-0.016483,0.113103
Worry about things.,-0.162281,-0.076365,-0.050914,0.260600,-0.163692,0.144669,0.204268,0.431686,0.408099,0.198018,...,0.239192,-0.107536,-0.078245,-0.074586,-0.107065,0.027527,1.000000,0.051653,-0.143831,-0.136540
Would never cheat on my taxes.,0.029572,-0.115061,0.132766,0.019874,-0.042522,-0.103727,0.021737,0.030539,0.060323,0.083540,...,0.072984,0.056801,0.049876,0.137484,-0.088864,0.159320,0.051653,1.000000,-0.067971,0.115916
Would never go hang gliding or bungee jumping.,0.124640,0.294004,-0.137169,-0.192907,0.154302,-0.043246,-0.126138,-0.192227,-0.083722,-0.163592,...,-0.083874,0.119223,0.097164,-0.009435,0.352464,-0.016483,-0.143831,-0.067971,1.000000,-0.011130


In [12]:
# Aligning the rows and columns of the predicted and observed matrices so that
# the same position refers to the same item pair in both. The defaults are
# spelled out: both axes are aligned, and with an outer join a label present in
# only one matrix is kept and filled with NaN in the other.
sims, sims_observed = sims.align(sims_observed, join='outer', axis=None)


def lower_triangle_flat(df):
    """Takes the lower triangle of a dataframe and flattens it into a vector.

    Parameters
    ----------
    df : pd.DataFrame
        Square data frame, e.g. a similarity or correlation matrix.

    Returns
    -------
    pd.Series
        The entries strictly below the main diagonal, read row by row
        (row 1: column 0; row 2: columns 0 and 1; ...). The diagonal
        (self-similarities) is not included. The series has a fresh
        integer index; the labels of `df` are not carried over.
    """
    # tril (not triu) so that the *lower* triangle is taken, as the name says;
    # k=-1 starts one step below the diagonal to exclude the self-similarities
    rows, cols = np.tril_indices(len(df), k=-1)
    return pd.Series(df.to_numpy()[rows, cols])


# Reducing both matrices to vectors of off-diagonal entries (same positions in both)
sims = lower_triangle_flat(sims)
sims_observed = lower_triangle_flat(sims_observed)

# Correlation between predicted and observed similarities, rounded to 2 decimals
r_signed = sims.corr(sims_observed).round(2)
print(f'r: {r_signed}')

# Same comparison on magnitudes only, i.e. ignoring the sign of each value
r_absolute = sims.abs().corr(sims_observed.abs()).round(2)
print(f'r of absolute values: {r_absolute}')

r: 0.05
r of absolute values: 0.14
