In [None]:
import sys
if 'google.colab' in sys.modules:
    # Installing packages in Google Colab environment
    !pip install datasets transformers

    # Mounting google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Changing working directory to personality
    %cd /content/drive/MyDrive/LLM4BeSci/personality

## Loading data

In [None]:
import pandas as pd

In [None]:
# Reading the NEO items file with pandas, keeping only the construct and item columns
neo_items = pd.read_csv(
    'NEO_items.csv',
    usecols=['construct', 'item'],
)
neo_items

## Feature extraction

In [None]:
from transformers import pipeline
import torch

In [None]:
# Choosing the device the model will run on: a GPU if one is available,
# otherwise the CPU. The checks are evaluated in order and stop at the first hit.
device = torch.device(
    'cuda' if torch.cuda.is_available()  # NVIDIA GPUs
    else 'mps' if torch.backends.mps.is_available()  # Apple Metal Performance Shaders (MPS) GPUs
    else 'cpu'
)

device

In [None]:
# Building the feature extraction pipeline; the model and its tokenizer are
# both loaded from the same checkpoint and run on the selected device
model_ckpt = 'distilbert-base-uncased'
feature_extractor = pipeline(
    'feature-extraction',
    model=model_ckpt,
    tokenizer=model_ckpt,
    device=device,
    framework='pt',
    batch_size=8,
)

# Running every item text through the pipeline; the tokenizer is asked to
# pad and truncate the inputs, and the outputs are returned as tensors
features = feature_extractor(
    neo_items['item'].tolist(),
    return_tensors=True,
    tokenize_kwargs={'padding': True, 'truncation': True},
)
features

In [None]:
# Keeping only the embedding of the [CLS] token (the first token of each
# item) and stacking these vectors into a data frame, one row per item
features = pd.DataFrame([sample[0][0].numpy() for sample in features])
features

## Comparing predicted and observed construct similarities

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Predicted similarities: cosine similarity between each pair of rows of
# `features`, labelled with the item text on both the rows and the columns
sims = pd.DataFrame(
    data=cosine_similarity(features),
    index=neo_items['item'],
    columns=neo_items['item'],
)
sims

In [None]:
# Loading the observed correlations (long format; pivoted to a matrix in the next cell)
sims_observed = pd.read_csv('item_corrs.csv')
sims_observed

In [None]:
# Reshaping the long table into a correlation matrix (rows: text_i,
# columns: text_j, cells: cor) for easy comparison with the predicted similarities
sims_observed = sims_observed.pivot(
    index='text_i',
    columns='text_j',
    values='cor',
)
sims_observed

In [None]:
# Aligning the rows and columns of the predicted and observed matrices so
# that the same position refers to the same item pair in both; the outer
# join (pandas' default) keeps labels found in either matrix
sims, sims_observed = sims.align(sims_observed, join='outer')


def lower_triangle_flat(df):
    """Flattens the off-diagonal triangle of a square data frame into a vector.

    Note that, despite the function's name, the entries are read from the
    *upper* triangle (``np.triu_indices``), row by row. For a symmetric
    matrix these are the same values as those of the lower triangle.

    Args:
        df: A square data frame (as many columns as rows).

    Returns:
        A ``pd.Series`` with the entries strictly above the diagonal.
    """
    n_items = len(df)
    # k=1 shifts the triangle one step above the diagonal, which drops the
    # self-similarities
    upper_idx = np.triu_indices(n_items, k=1)
    matrix = df.values
    return pd.Series(matrix[upper_idx])


# Flattening both matrices into vectors holding one value per item pair
sims = lower_triangle_flat(sims)
sims_observed = lower_triangle_flat(sims_observed)

# Correlation between predicted and observed similarities, first on the raw
# values and then on their absolute values
print(f'r: {sims.corr(sims_observed).round(2)}')
print(f'r of absolute values: {sims.abs().corr(sims_observed.abs()).round(2)}')