In [1]:
import sys
if 'google.colab' in sys.modules:  # If in Google Colab environment
    # Installing requisite packages
    !pip install datasets transformers evaluate accelerate -U

    # Mount google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Change working directory to ex1
    %cd /content/drive/MyDrive/LLM4behavior_workshop/choice

# Processing data

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [4]:
# Reading in the .csv data
df = pd.read_csv('choice_data.csv')
df

Unnamed: 0.1,Unnamed: 0,participant,task,trial,choice,reward,RT,expected_reward0,expected_reward1,forced_choice,horizon
0,0,0,0,0,1,66.0,1.849054,40,70,True,5
1,1,0,0,1,1,80.0,1.771619,40,70,True,5
2,2,0,0,2,0,29.0,0.562676,40,70,True,5
3,3,0,0,3,1,75.0,0.578808,40,70,True,5
4,4,0,0,4,1,81.0,3.942767,40,70,False,5
...,...,...,...,...,...,...,...,...,...,...,...
143995,69595,59,319,0,1,69.0,0.262339,80,60,True,5
143996,69596,59,319,1,0,90.0,0.313540,80,60,True,5
143997,69597,59,319,2,0,86.0,0.355905,80,60,True,5
143998,69598,59,319,3,0,93.0,0.403659,80,60,True,5


In [5]:
# Add column with prompts
num_participants = df.participant.max() + 1
num_tasks = df.task.max() + 1
instructions = "You made the following observations in the past:\n"
question = "Q: Which machine do you choose?\nA: Machine"
text = []
for participant in range(num_participants):
    print(participant)

    df_participant = df[(df['participant'] == participant)]

    for task in range(num_tasks):
        # new prompt for each task
        history = ""
        df_task = df_participant[(df_participant['task'] == task)]
        num_trials = df_task.trial.max() + 1
        for trial in range(num_trials):
            df_trial = df_task[(df_task['trial'] == trial)]
            # add text for free choice trials
            if not df_trial['forced_choice'].item():
                trials_left = num_trials - trial
                trials_left = str(trials_left) + " additional choices" if trials_left > 1 else str(trials_left) + " additional choice"
                trials_left_string = "Your goal is to maximize the sum of received dollars within " +  trials_left + ".\n\n"

                prompt = instructions + history + "\n" + trials_left_string + question
                text.append(prompt)
            else:
                text.append("")

            # add data to history
            c = df_trial.choice.item()
            r = df_trial.reward.item()
            history += "- Machine " + str(c+1) +  " delivered " + str(r) + " dollars.\n"

df['text'] = text

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


In [6]:
# Removing forced choice trials and converting into a HuggingFace dataset
df = df[~df.forced_choice]
dat = Dataset.from_pandas(df)
dat

Dataset({
    features: ['Unnamed: 0', 'participant', 'task', 'trial', 'choice', 'reward', 'RT', 'expected_reward0', 'expected_reward1', 'forced_choice', 'horizon', 'text', '__index_level_0__'],
    num_rows: 67200
})

In [7]:
dat[0]

{'Unnamed: 0': 4,
 'participant': 0,
 'task': 0,
 'trial': 4,
 'choice': 1,
 'reward': 81.0,
 'RT': 3.942767,
 'expected_reward0': 40,
 'expected_reward1': 70,
 'forced_choice': False,
 'horizon': 5,
 'text': 'You made the following observations in the past:\n- Machine 2 delivered 66.0 dollars.\n- Machine 2 delivered 80.0 dollars.\n- Machine 1 delivered 29.0 dollars.\n- Machine 2 delivered 75.0 dollars.\n\nYour goal is to maximize the sum of received dollars within 1 additional choice.\n\nQ: Which machine do you choose?\nA: Machine',
 '__index_level_0__': 4}

In [8]:
# Defining model checkpoint
model_ckpt = 'distilbert-base-uncased'

# Tokenizing the dataset
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

print(f'Vocabulary size: {tokenizer.vocab_size}, max context length: {tokenizer.model_max_length}')

Vocabulary size: 30522, max context length: 512


In [9]:
# Function to tokenize a batch of samples
batch_tokenizer = lambda batch: tokenizer(batch['text'], padding=True, truncation=True)

#  Tokenizing the dataset
dat = dat.map(batch_tokenizer, batched=True, batch_size=None)
dat[0]

Map:   0%|          | 0/67200 [00:00<?, ? examples/s]

{'Unnamed: 0': 4,
 'participant': 0,
 'task': 0,
 'trial': 4,
 'choice': 1,
 'reward': 81.0,
 'RT': 3.942767,
 'expected_reward0': 40,
 'expected_reward1': 70,
 'forced_choice': False,
 'horizon': 5,
 'text': 'You made the following observations in the past:\n- Machine 2 delivered 66.0 dollars.\n- Machine 2 delivered 80.0 dollars.\n- Machine 1 delivered 29.0 dollars.\n- Machine 2 delivered 75.0 dollars.\n\nYour goal is to maximize the sum of received dollars within 1 additional choice.\n\nQ: Which machine do you choose?\nA: Machine',
 '__index_level_0__': 4,
 'input_ids': [101,
  2017,
  2081,
  1996,
  2206,
  9420,
  1999,
  1996,
  2627,
  1024,
  1011,
  3698,
  1016,
  5359,
  5764,
  1012,
  1014,
  6363,
  1012,
  1011,
  3698,
  1016,
  5359,
  3770,
  1012,
  1014,
  6363,
  1012,
  1011,
  3698,
  1015,
  5359,
  2756,
  1012,
  1014,
  6363,
  1012,
  1011,
  3698,
  1016,
  5359,
  4293,
  1012,
  1014,
  6363,
  1012,
  2115,
  3125,
  2003,
  2000,
  25845,
  1996,
  7680

In [10]:
# Setting the format of the dataset to torch tensors for passing to the model
dat.set_format('torch', columns=['input_ids', 'attention_mask'])
dat

Dataset({
    features: ['Unnamed: 0', 'participant', 'task', 'trial', 'choice', 'reward', 'RT', 'expected_reward0', 'expected_reward1', 'forced_choice', 'horizon', 'text', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 67200
})

# Loading the model for feature extraction

In [11]:
import torch
torch.manual_seed(42) # For reproducibility
from transformers import AutoModel

In [12]:
# Loading the model and moving it to the GPU if available
if torch.cuda.is_available():  # for nvidia GPUs
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # for Apple Metal Performance Sharder (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='cpu')

In [13]:
# Loading distilbert-base-uncased and moving it to the GPU if available
model = AutoModel.from_pretrained(model_ckpt).to(device)
f'Model inputs: {tokenizer.model_input_names}'

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


"Model inputs: ['input_ids', 'attention_mask']"

In [14]:
def extract_features(batch):
    """Extract features from a batch of items"""
    inputs = {k:v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
        return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}


dat = dat.map(extract_features, batched=True, batch_size=8)
dat['hidden_state'].shape

Map:   0%|          | 0/67200 [00:00<?, ? examples/s]

torch.Size([67200, 768])

# Predicting choice behavior with extracted features

In [15]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

In [16]:
features = pd.DataFrame(dat['hidden_state'])
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.148934,-0.180077,-0.134831,0.132854,0.188525,0.209104,0.182039,0.366131,-0.134321,-0.346842,...,0.190699,-0.239401,0.005970,-0.241647,-0.078778,0.144100,-0.042690,-0.123956,0.312236,0.423458
1,-0.154421,-0.182241,-0.125056,0.147510,0.202007,0.225791,0.221678,0.374226,-0.132166,-0.335095,...,0.175367,-0.232369,0.020902,-0.236038,-0.056490,0.125195,-0.048702,-0.132431,0.321394,0.415944
2,-0.135062,-0.180490,-0.139446,0.148766,0.202410,0.230755,0.232328,0.392416,-0.130439,-0.341893,...,0.175604,-0.239217,0.015016,-0.246387,-0.059535,0.113876,-0.048130,-0.144594,0.328848,0.411730
3,-0.146425,-0.168291,-0.146339,0.157026,0.195126,0.228927,0.242971,0.391838,-0.138923,-0.351250,...,0.177936,-0.235448,-0.001898,-0.247530,-0.055010,0.113964,-0.037188,-0.136178,0.339591,0.417105
4,-0.160054,-0.157333,-0.161730,0.150484,0.203888,0.249553,0.230836,0.403514,-0.145254,-0.358722,...,0.175128,-0.241010,-0.003095,-0.249168,-0.077995,0.097609,-0.042888,-0.116797,0.354835,0.403606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67195,-0.145345,-0.163477,-0.140924,0.158046,0.207640,0.221088,0.239371,0.389046,-0.141190,-0.352974,...,0.186530,-0.236211,-0.001155,-0.242675,-0.067532,0.111500,-0.028072,-0.139120,0.328213,0.420918
67196,-0.153714,-0.154976,-0.158024,0.151719,0.217826,0.244407,0.228012,0.397839,-0.143305,-0.364769,...,0.184289,-0.241012,-0.001346,-0.245445,-0.089254,0.090774,-0.033711,-0.122340,0.347009,0.405053
67197,-0.151804,-0.145767,-0.177265,0.166729,0.216136,0.233474,0.229400,0.428303,-0.161650,-0.365459,...,0.192576,-0.252931,-0.024368,-0.236704,-0.098830,0.105329,-0.053929,-0.118514,0.356066,0.398116
67198,-0.135138,-0.139661,-0.203946,0.151004,0.216032,0.222765,0.203365,0.431771,-0.155505,-0.390393,...,0.199809,-0.272604,-0.034887,-0.243870,-0.107104,0.104959,-0.042936,-0.088606,0.359039,0.401997


In [18]:
# Initializing logistic regression 
regr = LogisticRegressionCV()

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, dat['choice'], test_size=.1, random_state=42)
f'Train size: {len(X_train)}, test size: {len(X_test)}'

'Train size: 60480, test size: 6720'

In [19]:
# Fitting the model and evaluating performance
regr.fit(X_train, y_train)
f'Test R2 = {regr.score(X_test, y_test).round(2)}'

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

'Test R2 = 0.72'