Before running this code, please install these libraries with the following commands: 

- pip install scipy
- pip install transformers
- pip install torch
- pip install PIL
- pip install nltk
- pip install os

and make sure that "torch" and "PIL" are installed. 

## BERT

In [24]:
from transformers import BertModel, BertTokenizer
from scipy.io import loadmat
import torch
import nltk
import os
import pandas as pd

nltk.download('stopwords')

#load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chiar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
file_df = pd.read_csv('./text_data/AliceChapterOne.csv')

file_df = file_df.drop(file_df[file_df['IsLexical'] < 1.0].index)

file_df
words = file_df['Word']

embeddings = []
for word in words:
  tokenized_text = tokenizer(word, return_tensors='pt', padding=True, truncation=True)
  with torch.no_grad():
      outputs = model(**tokenized_text)
  emb = torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy()
  #print(emb)
  embeddings.append(emb)

emb_list = []
for i, tensor in enumerate(embeddings):
  emb = tensor.flatten().tolist()
  emb_list.append(emb)

file_df['embeddings'] = emb_list
file_df

file_df.to_csv('./text_data/AliceChapterOne.csv', index=False)

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pandas as pd
import numpy as np
#The semantic similarity of each word to its preceding context is then defined by
#comparing (via a Pearson’s correlation) its 400-dimensional vector with the average of the vectors of all the preceding
#words in the corresponding sentence. And the ‘‘semantic dissimilarity’’ of the word is quantified as 1 minus this correlation
embeddingavg = []
embeddings = []
df = pd.read_csv('./text_data/AliceChapterOne.csv')
df_words=df
scores=[]

num_sentences = df['Sentence'].unique()
num_sentences = [x if not np.isnan(x) else 0 for x in num_sentences]

num_sentences = [int(element) for element in num_sentences]
for id, idx in enumerate(num_sentences): #for each sentence
  #selecting all words in sentence id
  sent_df=df_words[df_words['Sentence']==idx]  
  sent_df=sent_df.reset_index(drop=True)       

  for k,row2 in sent_df.iterrows(): #for each word, row in sentence df

      #condition for beginning of sentence
      if (k==0) and (idx==0):
          scores.append(0) 
          print('i is:',idx)
          continue

      #average of previouse sentence
      if  (k==0) and (idx > 0): 
          prev_sent_df=df_words[df_words['Sentence']==num_sentences[id-1]]
          embedding=prev_sent_df['embeddings'].reset_index(drop=True)
          embeddings=np.array([ast.literal_eval(embedding[j]) for j in range(len(embedding))])
          if len(embeddings) == 0:
            scores.append(np.nan)
            continue
          else:
            embeddingavg=np.nanmean(embeddings,axis=0)
            current_embedding = ast.literal_eval(sent_df['embeddings'][k])
            if np.isnan(embeddingavg).any() or np.isnan(current_embedding).any():
                scores.append(np.nan)
            else:
                dissimilarity = 1 - np.corrcoef(embeddingavg.reshape(1, -1), np.array(current_embedding).reshape(1, -1))[0,1]
                scores.append(dissimilarity)
            print('i is:',idx)
            continue

      #second word in trial
      tmp_df=sent_df[0:k]
      embedding=tmp_df['embeddings'].reset_index(drop=True)
      embeddings=np.array([ast.literal_eval(embedding[j]) for j in range(len(embedding))])
      if len(embeddings) == 0:
              scores.append(np.nan)
      else:
        embeddingavg=np.mean(embeddings,axis=0)
        current_embedding = ast.literal_eval(sent_df['embeddings'][k])
        if np.isnan(embeddingavg).any() or np.isnan(current_embedding).any():
            scores.append(np.nan)
        else:
            dissimilarity = 1 - np.corrcoef(embeddingavg.reshape(1, -1), np.array(current_embedding).reshape(1, -1))[0,1]
            scores.append(dissimilarity)
        print('i is:',idx)
        continue

scores
df['diff'] = scores

df = df.dropna(subset=['diff'])

df.to_csv('./text_data/AliceChapterOne.csv', index=False)


i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 3
i is: 3
i is: 3
i is: 3
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 5
i is: 5
i is: 5
i is: 5
i is: 5
i is: 5
i is: 5
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 7
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 9
i is: 9
i is: 9
i is: 9
i is: 9
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 12
i is: 12
i is: 12
i

## CLIP

#conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
#pip install ftfy regex tqdm
#pip install git+https://github.com/openai/CLIP.git

In [27]:
import torch
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")


url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

In [28]:
#removing stopwords defined as IsLexical = 0.0
import os
import pandas as pd

file_df = pd.read_csv('./text_data/AliceChapterOne.csv')
file_df = file_df.drop(file_df[file_df['IsLexical'] < 1.0].index)
file_df

Unnamed: 0,Word,Segment,onset,offset,Order,LogFreq,LogFreq_Prev,LogFreq_Next,SndPower,Length,Position,Sentence,IsLexical,NGRAM,RNN,CFG,embeddings,diff
0,Alice,1,0.046000,0.608721,1,8.65,0.00,14.56,3.621500e-07,0.562721,1,1,1.0,3.226499,3.126175,2.312348,"[0.11684151738882065, 0.014314591884613037, -0...",0.081541
1,beginning,1,0.784543,1.302929,3,10.69,14.56,16.35,3.686500e-09,0.518386,3,1,1.0,4.446766,4.100771,5.626722,"[-0.13216809928417206, -0.12346673756837845, -...",0.129593
2,very,1,1.616327,2.356749,6,13.28,13.79,9.86,4.072700e-09,0.740422,6,1,1.0,4.259243,4.461274,4.745532,"[0.20976144075393677, -0.0714065432548523, 0.0...",0.102155
3,tired,1,2.310749,2.845918,7,9.86,13.28,16.21,3.664100e-09,0.535169,7,1,1.0,2.649141,3.489757,4.749652,"[0.22826707363128662, 0.06488876789808273, 0.0...",0.090750
4,sitting,1,2.938712,3.350490,9,10.08,16.21,14.40,3.989200e-09,0.411778,9,1,1.0,3.898681,5.246072,3.542125,"[0.05619359016418457, 0.03932463750243187, -0....",0.049514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,generally,12,44.651659,45.220877,2145,10.82,14.10,10.77,2.697600e-03,0.569218,5,84,1.0,4.287546,7.153666,3.230874,"[0.11673595756292343, -0.07533806562423706, -0...",0.131076
914,happens,12,45.226353,45.672448,2146,10.77,10.82,13.76,7.081200e-04,0.446095,6,84,1.0,5.574428,6.356812,2.969568,"[0.5203867554664612, -0.11791258305311203, 0.1...",0.106358
915,one,12,45.896829,46.058972,2148,14.17,13.76,8.15,1.984200e-03,0.162143,8,84,1.0,1.380381,2.187682,0.725398,"[-0.1231246218085289, -0.2559150159358978, -0....",0.146470
916,eats,12,46.064448,46.322373,2149,8.15,14.17,8.74,2.473000e-05,0.257925,9,84,1.0,3.171368,3.941021,2.767965,"[0.2447713166475296, 0.22371430695056915, 0.01...",0.094643


In [29]:
import os
import pandas as pd

words = file_df['Word']

embeddings = []
for w in words:
    inputs = processor(text=w, images=image, return_tensors="pt")
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image # this is the image-text similarity score
    embeddings.append(outputs.text_embeds)

emb_list = []
for i, tensor in enumerate(embeddings):
    emb = tensor.flatten().tolist()
    emb_list.append(emb)

file_df['CLIP_embeddings'] = emb_list

file_df.to_csv('./text_data/AliceChapterOne.csv', index=False)



In [30]:
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pandas as pd
import numpy as np
#The semantic similarity of each word to its preceding context is then defined by
#comparing (via a Pearson’s correlation) its 400-dimensional vector with the average of the vectors of all the preceding
#words in the corresponding sentence. And the ‘‘semantic dissimilarity’’ of the word is quantified as 1 minus this correlation
embeddingavg = []
embeddings = []
df = pd.read_csv('./text_data/AliceChapterOne.csv')
df_words=df
scores=[]

num_sentences = df['Sentence'].unique()
num_sentences = [x if not np.isnan(x) else 0 for x in num_sentences]

num_sentences = [int(element) for element in num_sentences]
for id, idx in enumerate(num_sentences): #for each sentence
    #selecting all words in sentence id
    sent_df=df_words[df_words['Sentence']==idx] 
    sent_df=sent_df.reset_index(drop=True)         
    for k,row2 in sent_df.iterrows(): 

        #condition for beginning of sentence
        if (k==0) and (idx==0): 
            scores.append(0) 
            print('i is:',idx)
            continue

        #average of previouse sentence
        if  (k==0) and (idx > 0): 
            prev_sent_df=df_words[df_words['Sentence']==num_sentences[id-1]]
            embedding=prev_sent_df['CLIP_embeddings'].reset_index(drop=True)
            embeddings=np.array([ast.literal_eval(embedding[j]) for j in range(len(embedding))])
            if len(embeddings) == 0:
              scores.append(np.nan)
              continue
            else:
              embeddingavg=np.nanmean(embeddings,axis=0)
              current_embedding = ast.literal_eval(sent_df['CLIP_embeddings'][k])
              if np.isnan(embeddingavg).any() or np.isnan(current_embedding).any():
                  scores.append(np.nan)
              else:
                  dissimilarity = 1 - np.corrcoef(embeddingavg.reshape(1, -1), np.array(current_embedding).reshape(1, -1))[0,1]
                  scores.append(dissimilarity)
              print('i is:',idx)
              continue

        #second word in trial
        tmp_df=sent_df[0:k]
        embedding=tmp_df['CLIP_embeddings'].reset_index(drop=True)
        embeddings=np.array([ast.literal_eval(embedding[j]) for j in range(len(embedding))])
        if len(embeddings) == 0:
                scores.append(np.nan)
        else:
          embeddingavg=np.mean(embeddings,axis=0)
          current_embedding = ast.literal_eval(sent_df['CLIP_embeddings'][k])
          if np.isnan(embeddingavg).any() or np.isnan(current_embedding).any():
              scores.append(np.nan)
          else:
              dissimilarity = 1 - np.corrcoef(embeddingavg.reshape(1, -1), np.array(current_embedding).reshape(1, -1))[0,1]
              scores.append(dissimilarity)
          print('i is:',idx)
          continue

scores
df['diff_CLIP'] = scores

df = df.dropna(subset=['diff_CLIP'])

df.to_csv('./text_data/AliceChapterOne.csv', index=False)

i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 3
i is: 3
i is: 3
i is: 3
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 5
i is: 5
i is: 5
i is: 5
i is: 5
i is: 5
i is: 5
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 7
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 9
i is: 9
i is: 9
i is: 9
i is: 9
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 10
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 12
i is: 12
i is: 12
i