# kramer2.0 Experiment 03: Analyses
## Angelique I. Delarazan
### kramer2.0_experiment_03_vC2
### Retrieval Tasks: Free Recall

## Set Up

### Import packages

In [22]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from scipy import stats
import itertools 
import statsmodels.api as sm 
from statsmodels.formula.api import ols 
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import plot_partregress_grid
import pingouin as pg
from pingouin import anova as pg_anova
from psifr import fr
import tensorflow as tf
import tensorflow_hub as hub
import rpy2
from scipy.spatial.distance import squareform
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import chisquare

%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


### Universal Sentence Encoder Modules

In [2]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


### Overall Recall

In [6]:
recall = pd.read_csv('~/Box Sync/aidelarazan_box/Projects/kramer2.0/data/kramer2.0_vC2/kramer2.0_vC2_sub-all_desc-recall.csv')
recall = recall[recall['subject']!=652]
recall['condition'] = recall['coherence'] + '_' + recall['lag']
recall['word_count'] = recall['response_detail'].str.split().str.len()
recall = recall[recall['word_count']!=0]
recall.dropna(subset=['word_count'], inplace=True)
print(recall.columns.tolist())
recall.head()

['subject', 'version', 'character', 'act', 'event', 'coherence', 'lag', 'correct_detail', 'correct_order', 'response_detail', 'response_order', 'notes', 'rater', 'condition', 'word_count']


Unnamed: 0,subject,version,character,act,event,coherence,lag,correct_detail,correct_order,response_detail,response_order,notes,rater,condition,word_count
0,604,1,charles,1,A,mainplot,mainplot,It was Charles Bort’s big break: there was fin...,1.0,Charles Bort was a journalist taking photos fo...,1.0,Perceptual features of Charles,aid,mainplot_mainplot,24.0
1,604,1,beatrice,1,A,unrelated,long,"A tremor ran through his leg, as his phone rec...",2.0,"On his way, Charles was interrupted by text me...",3.0,Vague,aid,unrelated_long,9.0
2,604,1,beatrice,1,A,unrelated,long,"A tremor ran through his leg, as his phone rec...",2.0,"First, Beatrice Small called him and asked him...",6.0,Vague,aid,unrelated_long,117.0
3,604,1,charles,1,B,mainplot,mainplot,"The newsroom was quiet, but Charles could make...",3.0,"He went into work one day and the head editor,...",2.0,perceptual features of Charles' boss described,aid,mainplot_mainplot,103.0
4,604,1,melvin,1,A,coherent,long,Charles was interrupted by a video call from M...,4.0,and video calls from his friends.,4.0,Vague,aid,coherent_long,6.0


### Creating Dataframe to fit psifr package

In [23]:
recall = pd.read_csv('~/Box Sync/aidelarazan_box/Projects/kramer2.0/data/kramer2.0_vC2/kramer2.0_vC2_sub-all_desc-recall.csv')
recall = recall[recall['subject']!=652]
recall['condition'] = recall['coherence'] + '_' + recall['lag']
recall['word_count'] = recall['response_detail'].str.split().str.len()

Subset existing data so we format it to match psifr. We want an encoded story one, which is what participants were introduced to.

In [24]:
encoded_story = recall[['subject', 'version', 'character', 'act', 'event', 'coherence', 'lag', 'condition', 'correct_detail', 'correct_order']]
encoded_story = encoded_story.assign(trial_type='study')
encoded_story = encoded_story.assign(list=1)
encoded_story['item'] = recall['character'] + recall['act'].astype(str) + recall['event']
encoded_story = encoded_story.rename(columns={'correct_order':'position', 'correct_detail':'detail'})
encoded_story.dropna(subset=['position'], inplace=True)
encoded_story.drop_duplicates(inplace=True)
encoded_story.reset_index()
encoded_story.head()

Unnamed: 0,subject,version,character,act,event,coherence,lag,condition,detail,position,trial_type,list,item
0,604,1,charles,1,A,mainplot,mainplot,mainplot_mainplot,It was Charles Bort’s big break: there was fin...,1.0,study,1,charles1A
1,604,1,beatrice,1,A,unrelated,long,unrelated_long,"A tremor ran through his leg, as his phone rec...",2.0,study,1,beatrice1A
3,604,1,charles,1,B,mainplot,mainplot,mainplot_mainplot,"The newsroom was quiet, but Charles could make...",3.0,study,1,charles1B
4,604,1,melvin,1,A,coherent,long,coherent_long,Charles was interrupted by a video call from M...,4.0,study,1,melvin1A
6,604,1,charles,1,C,mainplot,mainplot,mainplot_mainplot,Twenty minutes later Charles chained the bike ...,5.0,study,1,charles1C


Subset existing data so we format it to match psifr. We want a recalled one, which is what participants recalled.

In [25]:
recalled_story = recall[['subject', 'version', 'character', 'act', 'event', 'coherence', 'lag', 'condition', 'response_detail', 'response_order']]
recalled_story = recalled_story.assign(trial_type='recall')
recalled_story = recalled_story.assign(list=1)
recalled_story['item'] = recall['character'] + recall['act'].astype(str) + recall['event']
recalled_story.dropna(subset=['response_order'], inplace=True)
recalled_story['response_order'] = recalled_story['response_order'].astype(int)
recalled_story = recalled_story.rename(columns={'response_order':'position', 'response_detail':'detail'})
recalled_story.reset_index(drop=True, inplace=True)
recalled_story.head()

Unnamed: 0,subject,version,character,act,event,coherence,lag,condition,detail,position,trial_type,list,item
0,604,1,charles,1,A,mainplot,mainplot,mainplot_mainplot,Charles Bort was a journalist taking photos fo...,1,recall,1,charles1A
1,604,1,beatrice,1,A,unrelated,long,unrelated_long,"On his way, Charles was interrupted by text me...",3,recall,1,beatrice1A
2,604,1,beatrice,1,A,unrelated,long,unrelated_long,"First, Beatrice Small called him and asked him...",6,recall,1,beatrice1A
3,604,1,charles,1,B,mainplot,mainplot,mainplot_mainplot,"He went into work one day and the head editor,...",2,recall,1,charles1B
4,604,1,melvin,1,A,coherent,long,coherent_long,and video calls from his friends.,4,recall,1,melvin1A


Now, put encoded and recalled story together in a psifr format.

In [26]:
story = pd.concat([encoded_story, recalled_story])
story.reset_index()
story_df = fr.merge_free_recall(story, study_keys=['coherence', 'lag', 'condition'])
story_df.head()

Unnamed: 0,subject,list,item,input,output,study,recall,repeat,intrusion,coherence,lag,condition,prior_list,prior_input
0,604,1,charles1A,1.0,1.0,True,True,0,False,mainplot,mainplot,mainplot_mainplot,,
1,604,1,beatrice1A,2.0,3.0,True,True,0,False,unrelated,long,unrelated_long,,
2,604,1,beatrice1A,2.0,6.0,False,True,1,False,unrelated,long,unrelated_long,,
3,604,1,charles1B,3.0,2.0,True,True,0,False,mainplot,mainplot,mainplot_mainplot,,
4,604,1,melvin1A,4.0,4.0,True,True,0,False,coherent,long,coherent_long,,


## Overall Recall Performance is Driven by Narrative Coherence (Word Count)

In [27]:
recall[recall['coherence']!='mainplot'].groupby(['subject', 'coherence', 'lag', 'condition']).sum().sort_values(by=['word_count']).reset_index()

Unnamed: 0,subject,coherence,lag,condition,version,act,correct_order,response_order,word_count
0,627,unrelated,long,unrelated_long,40,3,21.0,0.0,0.0
1,644,unrelated,short,unrelated_short,26,3,17.0,0.0,0.0
2,628,unrelated,short,unrelated_short,60,3,21.0,0.0,0.0
3,633,unrelated,short,unrelated_short,64,3,21.0,0.0,0.0
4,647,unrelated,short,unrelated_short,40,3,21.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
151,647,coherent,short,coherent_short,80,5,29.0,56.0,159.0
152,612,coherent,long,coherent_long,8,3,17.0,22.0,165.0
153,604,coherent,long,coherent_long,3,4,25.0,28.0,177.0
154,604,coherent,short,coherent_short,2,3,21.0,30.0,206.0


In [28]:
aov = pg.rm_anova(
    data=recall[recall['coherence']!='mainplot'].groupby(['subject', 'coherence', 'lag', 'condition']).sum().reset_index(),
    dv='word_count',
    within=['coherence', 'lag'],
    subject='subject',
    detailed=True
)

pg.print_table(aov)

post_hoc = pg.pairwise_tests(
    data=recall[recall['coherence']!='mainplot'].groupby(['subject', 'coherence', 'lag', 'condition']).sum().reset_index(),
    dv='word_count',
    within='condition',
    subject='subject',
    padjust='bonf'
)

pg.print_table(post_hoc)


ANOVA SUMMARY

Source                  SS    ddof1    ddof2         MS       F    p-unc    p-GG-corr    ng2    eps
---------------  ---------  -------  -------  ---------  ------  -------  -----------  -----  -----
coherence        26234.160        1       38  26234.160  33.530    0.000        0.000  0.100  1.000
lag                243.750        1       38    243.750   0.466    0.499        0.499  0.001  1.000
coherence * lag     40.006        1       38     40.006   0.036    0.852        0.852  0.000  1.000


POST HOC TESTS

Contrast    A               B                Paired    Parametric        T     dof  alternative      p-unc    p-corr  p-adjust         BF10    hedges
----------  --------------  ---------------  --------  ------------  -----  ------  -------------  -------  --------  ----------  ---------  --------
condition   coherent_long   coherent_short   True      True          0.219  38.000  two-sided        0.828     1.000  bonf            0.177     0.033
condition   cohe

In [29]:
ttest = pg.ttest(
    x=recall[recall['coherence']=='coherent'].groupby(['subject', 'coherence', 'lag', 'condition']).sum().reset_index()['word_count'],
    y=recall[recall['coherence']=='unrelated'].groupby(['subject', 'coherence', 'lag', 'condition']).sum().reset_index()['word_count'],
    paired=False
)

pg.print_table(ttest)

print()

    T    dof  alternative      p-val  CI95%            cohen-d     BF10    power
-----  -----  -------------  -------  -------------  ---------  -------  -------
4.145    154  two-sided        0.000  [13.58 38.3 ]      0.664  355.968    0.985




In [30]:
ttest = pg.ttest(
    x=recall[recall['lag']=='short'].groupby(['subject', 'coherence', 'lag', 'condition']).sum().reset_index()['word_count'],
    y=recall[recall['lag']=='long'].groupby(['subject', 'coherence', 'lag', 'condition']).sum().reset_index()['word_count'],
    paired=False
)

pg.print_table(ttest)

print()

     T    dof  alternative      p-val  CI95%              cohen-d    BF10    power
------  -----  -------------  -------  ---------------  ---------  ------  -------
-0.379    154  two-sided        0.705  [-15.53  10.53]      0.061   0.184    0.066




## Evaluating Temporal, Narrative Category, and Semantic Clusters in Free Recall

### Temporal Clustering

#### Lag-CRP with ALL conditions here.

In [31]:
fr.lag_crp(story_df).head()

Unnamed: 0,subject,lag,prob,actual,possible
0,604,-17.0,,0,0
1,604,-16.0,,0,0
2,604,-15.0,,0,0
3,604,-14.0,,0,0
4,604,-13.0,,0,0


#### Restricting to specific transitions (Only for visualization)

In [32]:
lagcrp_coherent = fr.lag_crp(story_df, test_key='coherence', test=lambda x, y: x=='coherent')
lagcrp_unrelated = fr.lag_crp(story_df, test_key='coherence', test=lambda x, y: x == 'unrelated')
lagcrp_mainplot = fr.lag_crp(story_df, test_key='coherence', test=lambda x, y: x == 'mainplot')
lagcrp_combined = pd.concat([lagcrp_coherent, lagcrp_unrelated, lagcrp_mainplot], keys=['coherent', 'unrelated', 'mainplot'], axis=0)
lagcrp_combined.index.set_names('coherence', level=0, inplace=True)
lagcrp_combined.reset_index(inplace=True)

### Lag Ranks by Coherence

In [33]:
lagrank_coherent = fr.lag_rank(story_df, test_key='coherence', test=lambda x, y: x == 'coherent')
lagrank_coherent.head()

  {'subject': subject, 'rank': np.nanmean(ranks)}, index=[subject]


Unnamed: 0,subject,rank
0,604,0.641026
1,605,0.607143
2,606,1.0
3,611,0.685897
4,612,0.96337


In [34]:
lagrank_unrelated = fr.lag_rank(story_df, test_key='coherence', test=lambda x, y: x == 'unrelated')
lagrank_unrelated.head()

  {'subject': subject, 'rank': np.nanmean(ranks)}, index=[subject]


Unnamed: 0,subject,rank
0,604,1.0
1,605,1.0
2,606,0.923077
3,611,0.903571
4,612,0.75


In [35]:
lagrank_mainplot = fr.lag_rank(story_df, test_key='coherence', test=lambda x, y: x == 'mainplot')
lagrank_mainplot.head()

Unnamed: 0,subject,rank
0,604,0.878108
1,605,0.668023
2,606,0.950265
3,611,0.727848
4,612,0.81713


In [36]:
lagrank_combined = pd.concat([lagrank_coherent, lagrank_unrelated, lagrank_mainplot], keys=['coherent', 'unrelated', 'mainplot'], axis=0)
lagrank_combined.index.set_names('coherence', level=0, inplace=True)
lagrank_combined.reset_index(inplace=True)
lagrank_combined.dropna(subset=['rank'], inplace=True)
lagrank_combined

Unnamed: 0,coherence,level_1,subject,rank
0,coherent,0,604,0.641026
1,coherent,1,605,0.607143
2,coherent,2,606,1.000000
3,coherent,3,611,0.685897
4,coherent,4,612,0.963370
...,...,...,...,...
112,mainplot,34,653,0.680774
113,mainplot,35,654,0.654762
114,mainplot,36,655,0.910119
115,mainplot,37,656,0.759207


In [38]:
aov = pg.anova(data=lagrank_combined, dv='rank', between='coherence', detailed=True)
print(aov)

      Source        SS   DF        MS         F    p-unc       np2
0  coherence  0.854690    2  0.427345  8.185682  0.00049  0.131633
1     Within  5.638289  108  0.052206       NaN      NaN       NaN


In [39]:
lagrank_combined

Unnamed: 0,coherence,level_1,subject,rank
0,coherent,0,604,0.641026
1,coherent,1,605,0.607143
2,coherent,2,606,1.000000
3,coherent,3,611,0.685897
4,coherent,4,612,0.963370
...,...,...,...,...
112,mainplot,34,653,0.680774
113,mainplot,35,654,0.654762
114,mainplot,36,655,0.910119
115,mainplot,37,656,0.759207


In [40]:
posthocs = lagrank_combined.pairwise_tukey(dv='rank', between='coherence').round(3)
posthocs.round(3)

Unnamed: 0,A,B,mean(A),mean(B),diff,se,T,p-tukey,hedges
0,coherent,mainplot,0.552,0.723,-0.17,0.052,-3.271,0.004,-0.729
1,coherent,unrelated,0.552,0.751,-0.198,0.054,-3.677,0.001,-0.729
2,mainplot,unrelated,0.723,0.751,-0.028,0.054,-0.522,0.861,-0.158


In [41]:
post_hoc = pg.pairwise_tests(
    data=lagrank_combined,
    dv='rank',
    within='coherence',
    subject='subject',
    padjust='bonf'
)

pg.print_table(post_hoc)


POST HOC TESTS

Contrast    A         B          Paired    Parametric         T     dof  alternative      p-unc    p-corr  p-adjust      BF10    hedges
----------  --------  ---------  --------  ------------  ------  ------  -------------  -------  --------  ----------  ------  --------
coherence   coherent  mainplot   True      True          -3.308  32.000  two-sided        0.002     0.007  bonf        15.332    -0.721
coherence   coherent  unrelated  True      True          -2.934  32.000  two-sided        0.006     0.018  bonf         6.618    -0.639
coherence   mainplot  unrelated  True      True          -0.237  32.000  two-sided        0.814     1.000  bonf         0.191    -0.049



In [42]:
temporal_cluster_coherence = pd.merge(recall.groupby(['subject','coherence'])['word_count'].sum().reset_index(), lagrank_combined, how='inner')
temporal_cluster_coherence = temporal_cluster_coherence.rename(columns={'rank':'temporal_rank'})
temporal_cluster_coherence['rank_type']='temporal'
temporal_cluster_coherence['chance_rank'] = 0.5
temporal_cluster_coherence.head()


Unnamed: 0,subject,coherence,word_count,level_1,temporal_rank,rank_type,chance_rank
0,604,coherent,383.0,0,0.641026,temporal,0.5
1,604,mainplot,742.0,0,0.878108,temporal,0.5
2,604,unrelated,227.0,0,1.0,temporal,0.5
3,605,coherent,111.0,1,0.607143,temporal,0.5
4,605,mainplot,408.0,1,0.668023,temporal,0.5


In [43]:
ttest = pg.ttest(
    x=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='unrelated']['temporal_rank'],
    y=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='unrelated']['chance_rank'],
    paired=True
)

pg.print_table(ttest)

print()

    T    dof  alternative      p-val  CI95%          cohen-d       BF10    power
-----  -----  -------------  -------  -----------  ---------  ---------  -------
6.595     33  two-sided        0.000  [0.17 0.33]      1.599  91080.000    1.000




In [44]:
ttest = pg.ttest(
    x=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='mainplot']['temporal_rank'],
    y=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='mainplot']['chance_rank'],
    paired=True
)

pg.print_table(ttest)

print()

     T    dof  alternative      p-val  CI95%          cohen-d             BF10    power
------  -----  -------------  -------  -----------  ---------  ---------------  -------
11.413     38  two-sided        0.000  [0.18 0.26]      2.585  97160000000.000    1.000




In [45]:
ttest = pg.ttest(
    x=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='coherent']['temporal_rank'],
    y=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='coherent']['chance_rank'],
    paired=True
)

pg.print_table(ttest)

print()

    T    dof  alternative      p-val  CI95%            cohen-d    BF10    power
-----  -----  -------------  -------  -------------  ---------  ------  -------
1.056     37  two-sided        0.298  [-0.05  0.15]      0.242   0.293    0.307




In [46]:
temporal_cluster_coherence.groupby(['coherence']).agg(['mean', 'sem'])

Unnamed: 0_level_0,subject,subject,word_count,word_count,level_1,level_1,temporal_rank,temporal_rank,chance_rank,chance_rank
Unnamed: 0_level_1,mean,sem,mean,sem,mean,sem,mean,sem,mean,sem
coherence,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
coherent,632.368421,2.606323,144.368421,12.71229,19.263158,1.854863,0.552321,0.04956,0.5,0.0
mainplot,632.025641,2.561652,366.589744,23.654244,19.0,1.825742,0.722674,0.01951,0.5,0.0
unrelated,631.823529,2.871729,99.617647,9.165736,18.852941,2.045007,0.750658,0.038008,0.5,0.0


In [47]:
corr = pg.corr(
    x=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='coherent']['temporal_rank'],
    y=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='coherent']['word_count'],
    method='pearson'
)

pg.print_table(corr)


  n      r  CI95%            p-val    BF10    power
---  -----  -------------  -------  ------  -------
 38  0.253  [-0.07  0.53]    0.125   0.628    0.340



In [48]:
corr = pg.corr(
    x=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='unrelated']['temporal_rank'],
    y=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='unrelated']['word_count'],
    method='pearson'
)

pg.print_table(corr)


  n      r  CI95%            p-val    BF10    power
---  -----  -------------  -------  ------  -------
 34  0.250  [-0.1   0.54]    0.154   0.566    0.301



In [49]:
corr = pg.corr(
    x=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='mainplot']['temporal_rank'],
    y=temporal_cluster_coherence[temporal_cluster_coherence['coherence']=='mainplot']['word_count'],
    method='pearson'
)

pg.print_table(corr)


  n      r  CI95%            p-val    BF10    power
---  -----  -------------  -------  ------  -------
 39  0.192  [-0.13  0.48]    0.242   0.386    0.218



### Lag Rank by Condition

In [50]:
lagrank_coherent_short = fr.lag_rank(story_df, test_key='condition', test=lambda x, y: x == 'coherent_short')
lagrank_coherent_long = fr.lag_rank(story_df, test_key='condition', test=lambda x, y: x == 'coherent_long')
lagrank_unrelated_short = fr.lag_rank(story_df, test_key='condition', test=lambda x, y: x == 'unrelated_short')
lagrank_unrelated_long = fr.lag_rank(story_df, test_key='condition', test=lambda x, y: x == 'unrelated_long')

  {'subject': subject, 'rank': np.nanmean(ranks)}, index=[subject]


In [51]:
lagrank_combined_condition = pd.concat([lagrank_coherent_short, lagrank_coherent_long, lagrank_unrelated_short, lagrank_unrelated_long, lagrank_mainplot], keys=['coherent_short', 'coherent_long', 'unrelated_short', 'unrelated_long', 'mainplot'], axis=0)
lagrank_combined_condition.index.set_names('condition', level=0, inplace=True)
lagrank_combined_condition.reset_index(inplace=True)
lagrank_combined_condition.dropna(subset=['rank'], inplace=True)
lagrank_combined_condition

Unnamed: 0,condition,level_1,subject,rank
0,coherent_short,0,604,1.000000
1,coherent_short,1,605,0.410714
2,coherent_short,2,606,1.000000
3,coherent_short,3,611,0.528846
4,coherent_short,4,612,0.945055
...,...,...,...,...
190,mainplot,34,653,0.680774
191,mainplot,35,654,0.654762
192,mainplot,36,655,0.910119
193,mainplot,37,656,0.759207


In [52]:
aov = pg.anova(data=lagrank_combined_condition, dv='rank', between='condition', detailed=True)
print(aov)

      Source         SS   DF        MS         F     p-unc       np2
0  condition   1.447021    4  0.361755  4.423872  0.002046  0.101293
1     Within  12.838431  157  0.081773       NaN       NaN       NaN


In [53]:
posthocs = lagrank_combined_condition.pairwise_tukey(dv='rank', between='condition').round(3)
posthocs.round(3)

Unnamed: 0,A,B,mean(A),mean(B),diff,se,T,p-tukey,hedges
0,coherent_long,coherent_short,0.551,0.562,-0.011,0.069,-0.157,1.0,-0.03
1,coherent_long,mainplot,0.551,0.723,-0.172,0.068,-2.541,0.087,-0.61
2,coherent_long,unrelated_long,0.551,0.752,-0.201,0.074,-2.709,0.057,-0.567
3,coherent_long,unrelated_short,0.551,0.77,-0.219,0.073,-2.986,0.027,-0.65
4,coherent_short,mainplot,0.562,0.723,-0.161,0.067,-2.418,0.116,-0.671
5,coherent_short,unrelated_long,0.562,0.752,-0.19,0.073,-2.596,0.076,-0.608
6,coherent_short,unrelated_short,0.562,0.77,-0.209,0.073,-2.876,0.037,-0.708
7,mainplot,unrelated_long,0.723,0.752,-0.029,0.072,-0.407,0.994,-0.138
8,mainplot,unrelated_short,0.723,0.77,-0.048,0.071,-0.671,0.962,-0.253
9,unrelated_long,unrelated_short,0.752,0.77,-0.018,0.077,-0.238,0.999,-0.067


##### Lag Rank

We can summarize the tendency to group together nearby items by running a lag rank analysis [PNK09] using lag_rank(). For each recall, this determines the absolute lag of all remaining items available for recall and then calculates their percentile rank. Then the rank of the actual transition made is taken, scaled to vary between 0 (furthest item chosen) and 1 (nearest item chosen). Chance clustering will be 0.5; clustering above that value is evidence of a temporal contiguity effect.

In [54]:
fr.lag_rank(story_df).head()

Unnamed: 0,subject,rank
0,604,0.844718
1,605,0.705047
2,606,0.958296
3,611,0.745201
4,612,0.838877


Compare temporal rank to chance.

In [55]:
temporal_cluster = pd.merge(recall.groupby(['subject'])['word_count'].sum().reset_index(), fr.lag_rank(story_df), how='inner')
temporal_cluster = temporal_cluster.rename(columns={'rank':'temporal_rank'})
temporal_cluster['rank_type']='temporal'
temporal_cluster['chance_rank'] = 0.5
temporal_cluster.head()


Unnamed: 0,subject,word_count,temporal_rank,rank_type,chance_rank
0,604,1352.0,0.844718,temporal,0.5
1,605,620.0,0.705047,temporal,0.5
2,606,533.0,0.958296,temporal,0.5
3,611,667.0,0.745201,temporal,0.5
4,612,1350.0,0.838877,temporal,0.5


In [56]:
print(temporal_cluster.agg(['mean', 'sem']))

         subject  word_count  temporal_rank  chance_rank
mean  632.025641  601.692308       0.674026          0.5
sem     2.561652   40.731787       0.023528          0.0


In [57]:
ttest = pg.ttest(
    x=temporal_cluster['temporal_rank'],
    y=temporal_cluster['chance_rank'],
    paired=True
)

pg.print_table(ttest)

print()

    T    dof  alternative      p-val  CI95%          cohen-d         BF10    power
-----  -----  -------------  -------  -----------  ---------  -----------  -------
7.397     38  two-sided        0.000  [0.13 0.22]      1.675  1758000.000    1.000




#### Correlations between Temporal Rank and Recall Performance

In [58]:
corr = pg.corr(
    x=temporal_cluster['temporal_rank'],
    y=temporal_cluster['word_count'],
    method='pearson'
)

pg.print_table(corr)


  n      r  CI95%          p-val    BF10    power
---  -----  -----------  -------  ------  -------
 39  0.350  [0.04 0.6 ]    0.029   1.991    0.601



#### Category Clustering

##### Category Clustering Based on Probability

In [59]:
fr.category_crp(story_df, category_key='coherence').head()

Unnamed: 0,subject,prob,actual,possible
0,604,0.428571,6,14
1,605,0.692308,9,13
2,606,0.444444,4,9
3,611,0.461538,6,13
4,612,0.285714,4,14


Compare category clustering to chance.

In [60]:
category_cluster = pd.merge(recall.groupby(['subject'])['word_count'].sum().reset_index(), fr.category_crp(story_df, category_key='coherence'), how='inner')
category_cluster = pd.merge(category_cluster, fr.category_clustering(story_df, category_key='coherence'), how='inner')
category_cluster = category_cluster.rename(columns={
    'prob':'category_prob',
    'arc':'category_arc',
    'lbc':'category_lbc'
    })
category_cluster['rank_type']='category'
category_cluster['chance_prob'] = 0.3 # because there are 3 categories
category_cluster['chance_arc'] = 0.0
category_cluster.head()


Unnamed: 0,subject,word_count,category_prob,actual,possible,category_lbc,category_arc,rank_type,chance_prob,chance_arc
0,604,1352.0,0.428571,6,14,1.294118,-0.046154,category,0.3,0.0
1,605,620.0,0.692308,9,13,4.882353,0.521277,category,0.3,0.0
2,606,533.0,0.444444,4,9,1.058824,-0.047619,category,0.3,0.0
3,611,667.0,0.461538,6,13,2.176471,0.186047,category,0.3,0.0
4,612,1350.0,0.285714,4,14,-0.705882,-0.307692,category,0.3,0.0


In [77]:
print(category_cluster.agg(['mean', 'sem']))

         subject  word_count  category_prob    actual   possible  \
mean  632.025641  601.692308       0.563865  6.102564  11.128205   
sem     2.561652   40.731787       0.030293  0.300236   0.324816   

      category_lbc  category_arc   chance_prob  chance_arc  
mean      2.678733      0.180294  3.000000e-01         0.0  
sem       0.339864      0.056365  1.801020e-17         0.0  


In [61]:
ttest = pg.ttest(
    x=category_cluster['category_prob'],
    y=category_cluster['chance_prob'],
    paired=True
)

pg.print_table(ttest)

print()

    T    dof  alternative      p-val  CI95%          cohen-d          BF10    power
-----  -----  -------------  -------  -----------  ---------  ------------  -------
8.710     38  two-sided        0.000  [0.2  0.33]      1.973  75390000.000    1.000




#### Correlations between Category Probability and Recall Performance

In [62]:
corr = pg.corr(
    x=category_cluster['category_prob'],
    y=category_cluster['word_count'],
    method='pearson'
)

pg.print_table(corr)


  n       r  CI95%            p-val    BF10    power
---  ------  -------------  -------  ------  -------
 39  -0.251  [-0.53  0.07]    0.123   0.628    0.344



#### Adjusted Ratio of Clustering

A number of measures have been developed to measure category clustering relative to that expected due to chance, under certain assumptions. Two such measures are list-based clustering (LBC) [SBW+02] and adjusted ratio of clustering (ARC) [RTB71].

In [63]:
ttest = pg.ttest(
    x=category_cluster['category_arc'],
    y=category_cluster['chance_arc'],
    paired=True
)

pg.print_table(ttest)

print()

    T    dof  alternative      p-val  CI95%          cohen-d    BF10    power
-----  -----  -------------  -------  -----------  ---------  ------  -------
3.199     38  two-sided        0.003  [0.07 0.29]      0.724  12.453    0.993




##### Correlations between ARC and Recall Performance

In [64]:
corr = pg.corr(
    x=category_cluster['category_arc'],
    y=category_cluster['word_count'],
    method='pearson'
)

pg.print_table(corr)


  n       r  CI95%            p-val    BF10    power
---  ------  -------------  -------  ------  -------
 39  -0.212  [-0.49  0.11]    0.196   0.447    0.256



#### Semantic Similarity Clustering

While the category CRP examines clustering based on semantic similarity at a coarse level (i.e., whether two items are in the same category or not), recall may also depend on more nuanced semantic relationships.

Models of semantic knowledge allow the semantic similarity between pairs of items to be quantified. If you have such a model defined for your stimulus pool, you can use the distance CRP analysis to examine how semantic similarity affects recall transitions [HK02, MP16].

Steps:
1. Obtain sentence embeddings of each encoding item from USE
2. Extract the item and embedding into an array. Calculate the correlation. This will result in a 26x26 matrix for each item.
3. Index it and add it to the dataframe (in our case, it is story_df).
4. Run distance_rank analyses using psifr.

In [65]:
encoded_story_embeddings = story[['item', 'detail']]
encoded_story_embeddings = encoded_story_embeddings.drop_duplicates(subset=['item'])
encoded_story_embeddings.reset_index(drop=True, inplace=True)
encoded_story_embeddings.head()

Unnamed: 0,item,detail
0,charles1A,It was Charles Bort’s big break: there was fin...
1,beatrice1A,"A tremor ran through his leg, as his phone rec..."
2,charles1B,"The newsroom was quiet, but Charles could make..."
3,melvin1A,Charles was interrupted by a video call from M...
4,charles1C,Twenty minutes later Charles chained the bike ...


In [66]:
story_embeddings = []

for i in range(len(encoded_story_embeddings)):
    item = encoded_story_embeddings.loc[i, 'item']
    detail = encoded_story_embeddings.loc[i, 'detail']
    embedded = np.array(embed([detail]))
    embedding = embedded.squeeze()

    curr_dict = {
        'item': item, 
        'detail': detail,
        'embedding': embedding
    }

    story_embeddings.append(curr_dict)
story_embeddings = pd.DataFrame(story_embeddings)
story_embeddings.head()

Unnamed: 0,item,detail,embedding
0,charles1A,It was Charles Bort’s big break: there was fin...,"[-0.025818096, 0.012939302, 0.0051612426, -0.0..."
1,beatrice1A,"A tremor ran through his leg, as his phone rec...","[-0.035725664, 0.037785117, 0.027241783, -0.04..."
2,charles1B,"The newsroom was quiet, but Charles could make...","[-0.05018931, 0.014588034, -0.001451634, -0.01..."
3,melvin1A,Charles was interrupted by a video call from M...,"[0.03513451, 0.0018453865, 0.038247604, -0.052..."
4,charles1C,Twenty minutes later Charles chained the bike ...,"[-0.05587792, 0.005138666, 0.016183615, -0.040..."


Generate correlation matrix

In [68]:
embeddings = np.stack(story_embeddings['embedding'].values)
correlations = np.inner(embeddings, embeddings)
norms = np.linalg.norm(embeddings, axis=1)
correlations /= np.outer(norms, norms)

In [69]:
item_indices = story_embeddings.index.values
item_indices

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25])

In [70]:
story_correlations = (
    story_embeddings['item'].values,
    correlations
)

# story_correlations

In [71]:
items, distances = story_correlations
story_df['item_index'] = fr.pool_index(story_df['item'], items)
np.fill_diagonal(distances, 0)
edges = np.percentile(squareform(distances), np.linspace(1, 99, 10))

In [72]:
fr.distance_crp(story_df, 'item_index', distances, edges).head()

Unnamed: 0,subject,center,bin,prob,actual,possible
0,604,0.273231,"(0.226, 0.321]",0.111111,2,18
1,604,0.339923,"(0.321, 0.359]",0.076923,1,13
2,604,0.370381,"(0.359, 0.381]",0.066667,1,15
3,604,0.395352,"(0.381, 0.409]",0.083333,1,12
4,604,0.426229,"(0.409, 0.443]",0.142857,2,14


In [74]:
fr.distance_crp(story_df, 'item_index', distances, edges).groupby(['bin']).agg(['mean', 'sem']).reset_index()

Unnamed: 0_level_0,bin,subject,subject,center,center,prob,prob,actual,actual,possible,possible
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sem,mean,sem,mean,sem,mean,sem,mean,sem
0,"(0.226, 0.321]",632.025641,2.561652,0.273231,0.0,0.064671,0.010702,0.871795,0.138332,13.538462,0.461538
1,"(0.321, 0.359]",632.025641,2.561652,0.339923,0.0,0.080609,0.014116,0.897436,0.150549,10.820513,0.413784
2,"(0.359, 0.381]",632.025641,2.561652,0.370381,0.0,0.067404,0.011964,0.897436,0.150549,12.871795,0.460563
3,"(0.381, 0.409]",632.025641,2.561652,0.395352,0.0,0.066341,0.01017,0.794872,0.111301,12.025641,0.347513
4,"(0.409, 0.443]",632.025641,2.561652,0.426229,0.0,0.081373,0.009384,1.205128,0.143126,14.153846,0.570932
5,"(0.443, 0.463]",632.025641,2.561652,0.452858,0.0,0.078127,0.012866,0.846154,0.135169,11.487179,0.46449
6,"(0.463, 0.513]",632.025641,2.561652,0.487974,0.0,0.123819,0.012535,1.564103,0.167521,12.384615,0.416617
7,"(0.513, 0.568]",632.025641,2.561652,0.540482,0.0,0.075712,0.009781,1.128205,0.152264,14.051282,0.412276
8,"(0.568, 0.718]",632.025641,2.561652,0.643,0.0,0.139309,0.013339,2.615385,0.248111,19.410256,0.526899


#### Semantic Similarity Rank

Similarly to the lag rank analysis of temporal clustering, we can summarize distance-based clustering (such as semantic clustering) with a single rank measure [PNK09]. The distance rank varies from 0 (the most-distant item is always recalled) to 1 (the closest item is always recalled), with chance clustering corresponding to 0.5. Given a matrix of item distances, we can calculate distance rank using distance_rank().

In [75]:
fr.distance_rank(story_df, 'item_index', distances).head()

Unnamed: 0,subject,rank
0,604,0.47485
1,605,0.416177
2,606,0.549398
3,611,0.449805
4,612,0.52203


Compare semantic similarity rank to chance.

In [76]:
semantic_cluster = pd.merge(recall.groupby(['subject'])['word_count'].sum().reset_index(), fr.distance_rank(story_df, 'item_index', distances), how='inner')
semantic_cluster = semantic_cluster.rename(columns={'rank':'semantic_rank'})
semantic_cluster['rank_type']='semantic'
semantic_cluster['chance_rank'] = 0.5
semantic_cluster.head()


Unnamed: 0,subject,word_count,semantic_rank,rank_type,chance_rank
0,604,1352.0,0.47485,semantic,0.5
1,605,620.0,0.416177,semantic,0.5
2,606,533.0,0.549398,semantic,0.5
3,611,667.0,0.449805,semantic,0.5
4,612,1350.0,0.52203,semantic,0.5


In [98]:
print(semantic_cluster.agg(['mean', 'sem']))

         subject  word_count  semantic_rank  chance_rank
mean  632.025641  601.692308       0.402229          0.5
sem     2.561652   40.731787       0.020385          0.0


In [77]:
ttest = pg.ttest(
    x=semantic_cluster['semantic_rank'],
    y=semantic_cluster['chance_rank'],
    paired=True
)

pg.print_table(ttest)

print()

     T    dof  alternative      p-val  CI95%            cohen-d     BF10    power
------  -----  -------------  -------  -------------  ---------  -------  -------
-4.796     38  two-sided        0.000  [-0.14 -0.06]      1.086  867.390    1.000




#### Correlations between Semantic Rank and Recall Performance

In [78]:
corr = pg.corr(
    x=semantic_cluster['semantic_rank'],
    y=semantic_cluster['word_count'],
    method='pearson'
)

pg.print_table(corr)


  n      r  CI95%            p-val    BF10    power
---  -----  -------------  -------  ------  -------
 39  0.124  [-0.2   0.42]    0.452   0.262    0.117



### Distance Between Paired Events

In [122]:
recall_drop_duplicates = recall.sort_values('response_order').drop_duplicates(
    subset=['subject', 'version', 'character', 'coherence', 'lag', 'condition', 'act'], keep='first')


recall_distance = recall_drop_duplicates.pivot_table(
    index=['subject', 'version', 'character', 'coherence', 'lag', 'condition'],
    columns='act',
    values=['response_order', 'word_count'],
    aggfunc='mean'
).reset_index()


recall_distance.columns = [f'{col[0]}_{col[1]}' if col[1] else col[0] for col in recall_distance.columns]


def exclusion(row):
    if pd.isnull(row['response_order_1']) or pd.isnull(row['response_order_2']) or pd.isnull(row['word_count_1']) or pd.isnull(row['word_count_2']):
        return 'exclude'
    else:
        return 'include'


recall_distance['exclusion'] = recall_distance.apply(exclusion, axis=1)

recall_distance['distance'] = recall_distance['response_order_2'].values - recall_distance['response_order_1'].values
recall_distance['abs_distance'] = abs(recall_distance['response_order_2'].values - recall_distance['response_order_1'].values)
recall_distance

recall_distance.head()

Unnamed: 0,subject,version,character,coherence,lag,condition,response_order_1,response_order_2,word_count_1,word_count_2,exclusion,distance,abs_distance
0,604,1,beatrice,unrelated,long,unrelated_long,3.0,,9.0,,exclude,,
1,604,1,charles,mainplot,mainplot,mainplot_mainplot,1.0,13.0,24.0,12.0,include,12.0,12.0
2,604,1,johnny,coherent,short,coherent_short,11.0,19.0,108.0,98.0,include,8.0,8.0
3,604,1,melvin,coherent,long,coherent_long,4.0,17.0,6.0,101.0,include,13.0,13.0
4,604,1,sandra,unrelated,short,unrelated_short,5.0,18.0,2.0,23.0,include,13.0,13.0


In [97]:
recall_distance = recall_distance[recall_distance['condition']!='mainplot_mainplot']
recall_distance = recall_distance[recall_distance['exclusion']!='exclude']
recall_distance.head()

Unnamed: 0,subject,version,character,coherence,lag,condition,response_order_1,response_order_2,word_count_1,word_count_2,exclusion,distance,abs_distance
2,604,1,johnny,coherent,short,coherent_short,11.0,19.0,108.0,98.0,include,8.0,8.0
3,604,1,melvin,coherent,long,coherent_long,4.0,17.0,6.0,101.0,include,13.0,13.0
4,604,1,sandra,unrelated,short,unrelated_short,5.0,18.0,2.0,23.0,include,13.0,13.0
8,605,2,melvin,coherent,long,coherent_long,5.0,15.0,35.0,16.0,include,10.0,10.0
9,605,2,sandra,coherent,short,coherent_short,4.0,14.0,41.0,19.0,include,10.0,10.0


In [99]:
subject = recall_distance['subject'].to_numpy()
coherence = recall_distance['coherence'].to_numpy()
lag = recall_distance['lag'].to_numpy()
condition = recall_distance['condition'].to_numpy()
distance = recall_distance['distance'].to_numpy()
abs_distance = recall_distance['abs_distance'].to_numpy()

In [100]:
%%R -i subject,coherence,lag,condition,distance,abs_distance

library(lme4)
library(lmerTest)
df = data.frame(subject, coherence, lag, condition, distance, abs_distance)
df$subject = as.factor(df$subject)
df$coherence = as.factor(df$coherence)
df$condition = as.factor(df$condition)
df$lag = as.factor(df$lag)
lmer_model = lmer(distance~coherence*lag+(1|subject),data=df)
summary(lmer_model)




Attaching package: ‘lmerTest’



    lmer



    step




Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: distance ~ coherence * lag + (1 | subject)
   Data: df

REML criterion at convergence: 459.9

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.19067 -0.40817 -0.09006  0.50364  2.12377 

Random effects:
 Groups   Name        Variance Std.Dev.
 subject  (Intercept) 16.44    4.054   
 Residual             10.21    3.196   
Number of obs: 81, groups:  subject, 36

Fixed effects:
                            Estimate Std. Error      df t value Pr(>|t|)    
(Intercept)                   5.4030     0.9298 58.9420   5.811 2.67e-07 ***
coherenceunrelated            1.9323     1.2743 48.4727   1.516    0.136    
lagshort                     -0.5474     0.8846 46.1861  -0.619    0.539    
coherenceunrelated:lagshort  -1.2330     1.7374 47.5670  -0.710    0.481    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) chrnc

In [101]:
%%R -i subject,coherence,lag,condition,distance,abs_distance

library(lme4)
library(lmerTest)
df = data.frame(subject, coherence, lag, condition, distance, abs_distance)
df$subject = as.factor(df$subject)
df$coherence = as.factor(df$coherence)
df$condition = as.factor(df$condition)
df$lag = as.factor(df$lag)
lmer_model = lmer(abs_distance~coherence*lag+(1|subject),data=df)
summary(lmer_model)

Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: abs_distance ~ coherence * lag + (1 | subject)
   Data: df

REML criterion at convergence: 449.2

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.3759 -0.4223 -0.1342  0.4305  2.3279 

Random effects:
 Groups   Name        Variance Std.Dev.
 subject  (Intercept) 14.070   3.751   
 Residual              8.956   2.993   
Number of obs: 81, groups:  subject, 36

Fixed effects:
                            Estimate Std. Error      df t value Pr(>|t|)    
(Intercept)                   5.5740     0.8650 59.7008   6.444 2.25e-08 ***
coherenceunrelated            2.4687     1.1925 49.1831   2.070   0.0437 *  
lagshort                     -0.4202     0.8280 46.8445  -0.507   0.6142    
coherenceunrelated:lagshort  -1.7054     1.6261 48.2630  -1.049   0.2995    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) chrncn lgsh

In [102]:
recall_distance['correct_distance'] = np.where(recall_distance['lag'] == 'short', 4,
                                  np.where(recall_distance['lag'] == 'long', 12, np.nan))

recall_distance.head()

Unnamed: 0,subject,version,character,coherence,lag,condition,response_order_1,response_order_2,word_count_1,word_count_2,exclusion,distance,abs_distance,correct_distance
2,604,1,johnny,coherent,short,coherent_short,11.0,19.0,108.0,98.0,include,8.0,8.0,4.0
3,604,1,melvin,coherent,long,coherent_long,4.0,17.0,6.0,101.0,include,13.0,13.0,12.0
4,604,1,sandra,unrelated,short,unrelated_short,5.0,18.0,2.0,23.0,include,13.0,13.0,4.0
8,605,2,melvin,coherent,long,coherent_long,5.0,15.0,35.0,16.0,include,10.0,10.0,12.0
9,605,2,sandra,coherent,short,coherent_short,4.0,14.0,41.0,19.0,include,10.0,10.0,4.0


In [103]:
unique_conditions = recall_distance['coherence'].unique()

for condition in unique_conditions:
    data_condition = recall_distance[recall_distance['coherence'] == condition]
    abs_distance = data_condition['abs_distance']
    correct_distance = data_condition['correct_distance']
    
    t_stat, p_value = stats.ttest_ind(abs_distance, correct_distance, equal_var=False)
    
    print(f"T-test for {condition}:")
    print(f"   T-statistic: {t_stat}")
    print(f"   P-value: {p_value}")
    if p_value < 0.05:
        print("   Result: Significant difference")
    else:
        print("   Result: No significant difference")
    print("-" * 30)


T-test for coherent:
   T-statistic: -3.1756063823498977
   P-value: 0.0019361446209116547
   Result: Significant difference
------------------------------
T-test for unrelated:
   T-statistic: -0.2612345789412017
   P-value: 0.7951703728359001
   Result: No significant difference
------------------------------


In [104]:
unique_conditions = recall_distance['coherence'].unique()

for condition in unique_conditions:
    data_condition = recall_distance[recall_distance['coherence'] == condition]
    abs_distance = data_condition['abs_distance']
    correct_distance = data_condition['correct_distance']
    
    ttest = pg.ttest(x = abs_distance, y = correct_distance)
    
    print(f"T-test for {condition}:")
    pg.print_table(ttest)


T-test for coherent:
     T    dof  alternative      p-val  CI95%            cohen-d    BF10    power
------  -----  -------------  -------  -------------  ---------  ------  -------
-3.176    114  two-sided        0.002  [-4.23 -0.98]      0.590  16.477    0.883

T-test for unrelated:
     T    dof  alternative      p-val  CI95%            cohen-d    BF10    power
------  -----  -------------  -------  -------------  ---------  ------  -------
-0.261     44  two-sided        0.795  [-3.03  2.34]      0.077   0.301    0.058



In [105]:
print('df = ' + str((len(recall_distance[recall_distance['coherence']=='coherent'])*2)-2))
print('df = ' + str((len(recall_distance[recall_distance['coherence']=='unrelated'])*2)-2))

df = 114
df = 44
