In [5]:
import pandas as pd
from transformers import pipeline
import torch

# Loading data

In [7]:
# Loading data with pandas
neo_items =  pd.read_csv('NEO_items.csv', usecols=['construct', 'text'])
neo_items

Unnamed: 0,construct,text
0,Achievement-Striving,Go straight for the goal.
1,Achievement-Striving,Plunge into tasks with all my heart.
2,Achievement-Striving,Demand quality.
3,Achievement-Striving,Set high standards for myself and others.
4,Achievement-Striving,Turn plans into actions.
...,...,...
295,Vulnerability,Remain calm under pressure.
296,Vulnerability,Am calm even in tense situations.
297,Vulnerability,Can handle complex problems.
298,Vulnerability,Readily overcome setbacks.


# Feature extraction

In [8]:
# Loading the model and moving it to the GPU if available
if torch.cuda.is_available():  # for nvidia GPUs
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # for Apple Metal Performance Sharder (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='mps')

In [11]:
# Loading the feature extraction pipeline
model_ckpt = 'distilbert-base-uncased'
feature_extractor = pipeline(
    'feature-extraction', model=model_ckpt, tokenizer=model_ckpt,
    device=device, framework='pt', batch_size=8
)

# Extracting the features for all items
features = feature_extractor(neo_items['text'].to_list(), return_tensors='pt', padding=True, truncation=True)
features

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[tensor([[[-0.1016, -0.2323,  0.0532,  ..., -0.0210,  0.3478,  0.4032],
          [ 0.0796, -0.4371,  0.0781,  ..., -0.1949,  0.3942,  0.5441],
          [ 0.4672, -0.4254,  0.1733,  ..., -0.2626,  0.0033,  0.0906],
          ...,
          [ 0.1062, -0.0311,  0.2063,  ...,  0.2131,  0.0617,  0.2878],
          [ 0.2026,  0.0310,  0.2395,  ...,  0.1445,  0.0900,  0.1349],
          [ 0.0946, -0.0251,  0.2154,  ...,  0.2539,  0.1087,  0.3186]]]),
 tensor([[[-0.0293, -0.0135, -0.1464,  ..., -0.1307,  0.2990,  0.1831],
          [-0.1950,  0.2925,  0.0331,  ..., -0.2528,  0.2721, -0.0752],
          [-0.4747,  0.1305, -0.0480,  ..., -0.4448,  0.1154, -0.2313],
          ...,
          [ 0.9559,  0.1278, -0.2636,  ...,  0.0458, -0.4735, -0.3454],
          [-0.0158,  0.1332,  0.0899,  ...,  0.0887, -0.0870,  0.1617],
          [ 0.0619,  0.2018,  0.0451,  ...,  0.1007, -0.0023,  0.1977]]]),
 tensor([[[-0.1235, -0.1633, -0.0390,  ..., -0.3022,  0.1011,  0.2615],
          [ 0.2037, -0.5305,

In [18]:
# Extracting the embedding for the [CLS] token
features = [sample[0][0].numpy() for sample in features]

# Converting to a data frame`
features = pd.DataFrame(features)
features['construct'] = neo_items['construct']

# Calculating the mean embedding for each construct
construct_embeds = features.groupby('construct').mean()
construct_embeds

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
construct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Achievement-Striving,0.013353,-0.089708,-0.169998,-0.131463,-0.045731,-0.206825,0.203802,0.360478,0.022377,-0.441881,...,0.097368,-0.164423,0.128858,-0.138729,0.169996,-0.018982,-0.073836,-0.072658,0.267997,0.250622
Activity-Level,0.004021,-0.071409,-0.019853,-0.096305,-0.047191,-0.246824,0.174134,0.353834,0.015114,-0.367777,...,0.068777,-0.184708,0.150589,-0.109231,0.143219,-0.056337,-0.10025,-0.075928,0.244102,0.198484
Adventurousness,-0.046137,-0.087742,-0.08454,-0.101376,-0.069758,-0.161813,0.196976,0.439567,-0.036838,-0.396599,...,0.052072,-0.194046,0.132261,-0.080574,0.211863,-0.06742,-0.061219,-0.065457,0.336179,0.185967
Aesthetic Appreciation/Artistic Interests,0.017378,-0.056796,-0.200088,-0.073536,-0.126212,-0.242879,0.283706,0.559418,0.036007,-0.379027,...,0.033841,-0.262946,0.118737,-0.229406,0.13085,-0.027526,-0.06435,-0.062079,0.445192,0.199478
Altruism,-0.026554,-0.050085,-0.238696,-0.122274,-0.130973,-0.188094,0.226324,0.363062,-0.023369,-0.422964,...,0.023287,-0.27768,0.089907,-0.105416,0.175643,0.023544,-0.074981,-0.025053,0.275023,0.236502
Anger,-0.018077,-0.054141,-0.078035,-0.151001,-0.161714,-0.247133,0.23906,0.337718,-0.064823,-0.323869,...,0.084268,-0.211631,0.148573,-0.104486,0.176267,-0.059649,-0.13719,-0.069776,0.289723,0.180428
Anxiety,-0.018075,-0.07817,-0.15362,-0.120646,-0.074629,-0.230783,0.202978,0.322429,0.048996,-0.446482,...,0.082527,-0.241534,0.172122,-0.187911,0.164564,-0.035584,-0.142948,-0.048372,0.238696,0.257686
Assertiveness,-0.003684,-0.122501,-0.236054,-0.117558,-0.101578,-0.26103,0.150721,0.357804,-0.009862,-0.39033,...,0.038993,-0.191062,0.103026,-0.078804,0.148913,-0.005338,-0.073794,-0.045135,0.257464,0.200311
Cautiousness,-0.015021,-0.157241,-0.316968,-0.09667,-0.122395,-0.269769,0.19691,0.39347,0.048505,-0.418679,...,0.07056,-0.249967,0.123258,-0.112736,0.158704,-0.041619,-0.065702,-0.053759,0.211256,0.227792
Cheerfulness,-0.038822,-0.067395,-0.045376,-0.129328,-0.121524,-0.265878,0.219243,0.478895,-0.060575,-0.373098,...,0.037886,-0.201319,0.102415,-0.107334,0.227338,-0.064169,-0.096258,-0.085663,0.322244,0.152029


# Comparing predicted and observed construct similarities

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [20]:
sims = pd.DataFrame(
    cosine_similarity(construct_embeds), # cosine similarity between each pair of rows
    index=construct_embeds.index, # row names
    columns=construct_embeds.index # column names
)
sims

construct,Achievement-Striving,Activity-Level,Adventurousness,Aesthetic Appreciation/Artistic Interests,Altruism,Anger,Anxiety,Assertiveness,Cautiousness,Cheerfulness,...,Liberalism,Modesty/Humility,Morality,Orderliness,Self-Discipline,Self-Efficacy,Self-consciousness,Sympathy,Trust,Vulnerability
construct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Achievement-Striving,1.0,0.993875,0.98816,0.978286,0.992959,0.988859,0.99235,0.994452,0.993854,0.989934,...,0.957502,0.991743,0.994413,0.993691,0.994205,0.994343,0.987075,0.98101,0.982888,0.988391
Activity-Level,0.993875,1.0,0.991282,0.974734,0.99121,0.99265,0.993281,0.992678,0.990177,0.992053,...,0.945868,0.992842,0.99071,0.994945,0.990008,0.988392,0.989878,0.97418,0.976879,0.985173
Adventurousness,0.98816,0.991282,1.0,0.984314,0.990669,0.988553,0.990493,0.989002,0.988039,0.989789,...,0.959142,0.990687,0.988008,0.992202,0.984389,0.985809,0.985992,0.983215,0.982389,0.982859
Aesthetic Appreciation/Artistic Interests,0.978286,0.974734,0.984314,1.0,0.985649,0.975926,0.98382,0.981777,0.981902,0.982547,...,0.975998,0.977922,0.977996,0.983193,0.976567,0.982758,0.974601,0.990147,0.986972,0.980364
Altruism,0.992959,0.99121,0.990669,0.985649,1.0,0.988182,0.993882,0.995717,0.992656,0.992216,...,0.964233,0.991853,0.993165,0.993346,0.987653,0.991087,0.988749,0.989347,0.990056,0.987993
Anger,0.988859,0.99265,0.988553,0.975926,0.988182,1.0,0.991955,0.989797,0.988977,0.991269,...,0.946798,0.99115,0.989381,0.991603,0.988643,0.985982,0.988143,0.974877,0.97467,0.982339
Anxiety,0.99235,0.993281,0.990493,0.98382,0.993882,0.991955,1.0,0.992847,0.992247,0.990855,...,0.955823,0.991212,0.989717,0.994977,0.991705,0.992148,0.993307,0.984194,0.983361,0.991828
Assertiveness,0.994452,0.992678,0.989002,0.981777,0.995717,0.989797,0.992847,1.0,0.995051,0.991754,...,0.96033,0.99076,0.99639,0.993023,0.990767,0.992001,0.985878,0.983477,0.986454,0.988586
Cautiousness,0.993854,0.990177,0.988039,0.981902,0.992656,0.988977,0.992247,0.995051,1.0,0.989673,...,0.960164,0.986575,0.994818,0.993149,0.992303,0.992876,0.982606,0.984071,0.985116,0.990819
Cheerfulness,0.989934,0.992053,0.989789,0.982547,0.992216,0.991269,0.990855,0.991754,0.989673,1.0,...,0.951265,0.991804,0.988446,0.991858,0.986534,0.985517,0.987106,0.978512,0.978355,0.982456


In [21]:
# Loading observed correlations and pivoting to a correlation matrix
sims_observed = pd.read_csv('NEO_correlations.csv')
sims_observed

Unnamed: 0,construct_1,construct_2,correlation
0,Anxiety,Anxiety,1.000000
1,Friendliness,Anxiety,-0.319029
2,Imagination,Anxiety,0.061749
3,Trust,Anxiety,-0.303660
4,Self-Efficacy,Anxiety,-0.453877
...,...,...,...
895,Vulnerability,Cautiousness,-0.276091
896,Cheerfulness,Cautiousness,-0.172144
897,Liberalism,Cautiousness,-0.125071
898,Sympathy,Cautiousness,0.117780


In [22]:
# Pivoting to a correlation matrix for easy comparison with predicted correlations
sims_observed = sims_observed.pivot(index='construct_1', columns='construct_2', values='correlation')
sims_observed

construct_2,Achievement-Striving,Activity-Level,Adventurousness,Aesthetic Appreciation/Artistic Interests,Altruism,Anger,Anxiety,Assertiveness,Cautiousness,Cheerfulness,...,Liberalism,Modesty/Humility,Morality,Orderliness,Self-Discipline,Self-Efficacy,Self-consciousness,Sympathy,Trust,Vulnerability
construct_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Achievement-Striving,1.0,0.556188,0.207487,0.146712,0.297056,-0.16918,-0.182208,0.429419,0.355385,0.202051,...,-0.195316,-0.179229,0.245104,0.423729,0.692032,0.68105,-0.319778,0.03937,0.172744,-0.377009
Activity-Level,0.556188,1.0,0.237717,0.056497,0.159652,0.011251,-0.081694,0.470846,0.066479,0.134995,...,-0.140695,-0.182681,0.021023,0.26731,0.526717,0.41536,-0.332381,-0.045992,0.061763,-0.233511
Adventurousness,0.207487,0.237717,1.0,0.368639,0.222988,-0.246594,-0.391473,0.357362,-0.161081,0.32394,...,0.228079,-0.211193,-0.00999,-0.218838,0.112415,0.293686,-0.447935,0.130991,0.197621,-0.365071
Aesthetic Appreciation/Artistic Interests,0.146712,0.056497,0.368639,1.0,0.375494,-0.094176,0.002154,0.131344,-0.003963,0.281623,...,0.248999,-0.01153,0.202955,-0.014465,0.040799,0.150727,-0.100406,0.390052,0.1834,-0.018475
Altruism,0.297056,0.159652,0.222988,0.375494,1.0,-0.329779,-0.095435,0.176933,0.158027,0.464327,...,-0.012385,0.131117,0.491216,0.125416,0.237525,0.311276,-0.168456,0.630879,0.502078,-0.13268
Anger,-0.16918,0.011251,-0.246594,-0.094176,-0.329779,1.0,0.623347,-0.030149,-0.303361,-0.295796,...,-0.000979,0.014338,-0.267615,-0.018533,-0.231005,-0.364793,0.302524,-0.165342,-0.395414,0.558404
Anxiety,-0.182208,-0.081694,-0.391473,0.002154,-0.095435,0.623347,1.0,-0.278197,-0.110386,-0.340853,...,0.052482,0.222765,-0.023234,0.065618,-0.259856,-0.453877,0.596427,0.13538,-0.30366,0.79242
Assertiveness,0.429419,0.470846,0.357362,0.131344,0.176933,-0.030149,-0.278197,1.0,-0.089866,0.395249,...,-0.066339,-0.565173,-0.205692,0.077129,0.297605,0.485758,-0.67306,-0.078673,0.135688,-0.379676
Cautiousness,0.355385,0.066479,-0.161081,-0.003963,0.158027,-0.303361,-0.110386,-0.089866,1.0,-0.172144,...,-0.125071,0.113419,0.442165,0.433175,0.434872,0.444948,0.009633,0.11778,0.090374,-0.276091
Cheerfulness,0.202051,0.134995,0.32394,0.281623,0.464327,-0.295796,-0.340853,0.395249,-0.172144,1.0,...,-0.025711,-0.271256,0.058002,-0.063304,0.1085,0.297445,-0.388872,0.200852,0.45364,-0.278647


In [23]:
# Aligning rows and columns the predicted and observed correlations
sims, sims_observed = sims.align(sims_observed)


def lower_triangle_flat(df):
    """Takes the lower triangle of a dataframe and flattens it into a vector"""
    rows, cols = np.triu_indices(len(df), k=1)  # k=1 to exclude the diagonal (self-similarities)
    return pd.Series(df.values[rows, cols])


sims, sims_observed = lower_triangle_flat(sims), lower_triangle_flat(sims_observed)

# Correlation between predicted and observed
print(f'r: {sims.corr(sims_observed).round(2)}')
print(f'r of absolute values: {sims.abs().corr(sims_observed.abs()).round(2)}')

r: 0.16
r of absolute values: 0.32
