Before running this code, please install these libraries with the following commands: 

- pip install scipy
- pip install transformers
- pip install torch
- pip install PIL
- pip install nltk
- pip install os

and make sure that "torch" and "PIL" are installed. 


## BERT

In [1]:
from transformers import BertModel, BertTokenizer
from scipy.io import loadmat
import torch
import nltk
import os
import pandas as pd

nltk.download('stopwords')

#load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chiar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
directory_path = './text_data/'
for filename in os.listdir(directory_path):
  if filename.startswith('file') and filename.endswith('.csv'):
        file_path = os.path.join(directory_path, filename)
        print(f"Processing file: {file_path}")
        file_df = pd.read_csv(file_path)
        words = file_df['word']

        embeddings = []
        for word in words:
          tokenized_text = tokenizer(word, return_tensors='pt', padding=True, truncation=True)
          with torch.no_grad():
              outputs = model(**tokenized_text)
          emb = torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy()
          embeddings.append(emb)

        emb_list = []
        for i, tensor in enumerate(embeddings):
          emb = tensor.flatten().tolist()
          emb_list.append(emb)

        file_df['embeddings'] = emb_list
        file_df

        file_df.to_csv(str(os.path.join(directory_path, filename)), index=False)

Processing file: ./text_data/file10_diff.csv
Processing file: ./text_data/file11_diff.csv
Processing file: ./text_data/file12_diff.csv
Processing file: ./text_data/file13_diff.csv
Processing file: ./text_data/file14_diff.csv
Processing file: ./text_data/file15_diff.csv
Processing file: ./text_data/file16_diff.csv
Processing file: ./text_data/file17_diff.csv
Processing file: ./text_data/file18_diff.csv
Processing file: ./text_data/file19_diff.csv
Processing file: ./text_data/file1_diff.csv
Processing file: ./text_data/file20_diff.csv
Processing file: ./text_data/file21_diff.csv
Processing file: ./text_data/file22_diff.csv
Processing file: ./text_data/file23_diff.csv
Processing file: ./text_data/file24_diff.csv
Processing file: ./text_data/file25_diff.csv
Processing file: ./text_data/file26_diff.csv
Processing file: ./text_data/file27_diff.csv
Processing file: ./text_data/file28_diff.csv
Processing file: ./text_data/file29_diff.csv
Processing file: ./text_data/file2_diff.csv
Processing f

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pandas as pd
import numpy as np
#The semantic similarity of each word to its preceding context is then defined by
#comparing (via a Pearson’s correlation) its 400-dimensional vector with the average of the vectors of all the preceding
#words in the corresponding sentence. And the ‘‘semantic dissimilarity’’ of the word is quantified as 1 minus this correlation

directory_path = './text_data/'
for filename in os.listdir(directory_path):
  if filename.startswith('file') and filename.endswith('.csv'):
        file_path = os.path.join(directory_path, filename)
        embeddingavg = []
        embeddings = []
        df = pd.read_csv(file_path)
        df_words=df
        scores=[]

        num_sentences = df['sentence_id'].unique()
        num_sentences = [x if not np.isnan(x) else 0 for x in num_sentences]

        num_sentences = [int(element) for element in num_sentences]
        for id, idx in enumerate(num_sentences): #for each sentence
            #selecting all words in sentence id
            sent_df=df_words[df_words['sentence_id']==idx] 
            sent_df=sent_df.reset_index(drop=True)         

            for k,row2 in sent_df.iterrows(): #for each word, row in sentence df

                #condition for beginning of sentence
                if (k==0) and (idx==0): #beginning of trial 
                    scores.append(0) 
                    print('i is:',idx)
                    continue

                #average of previouse sentence
                if  (k==0) and (idx > 0):
                    prev_sent_df=df_words[df_words['sentence_id']==num_sentences[id-1]]
                    embedding=prev_sent_df['embeddings'].reset_index(drop=True)
                    embeddings=np.array([ast.literal_eval(embedding[j]) for j in range(len(embedding))])
                    if len(embeddings) == 0:
                        scores.append(np.nan)
                        continue
                    else:
                        embeddingavg=np.nanmean(embeddings,axis=0)
                        current_embedding = ast.literal_eval(sent_df['embeddings'][k])
                        if np.isnan(embeddingavg).any() or np.isnan(current_embedding).any():
                            scores.append(np.nan)
                        else:
                            dissimilarity = 1 - np.corrcoef(embeddingavg.reshape(1, -1), np.array(current_embedding).reshape(1, -1))[0,1]
                            scores.append(dissimilarity)
                        print('i is:',idx)
                        continue

                #second word in trial
                tmp_df=sent_df[0:k]
                embedding=tmp_df['embeddings'].reset_index(drop=True)
                embeddings=np.array([ast.literal_eval(embedding[j]) for j in range(len(embedding))])
                if len(embeddings) == 0:
                        scores.append(np.nan)
                else:
                    embeddingavg=np.mean(embeddings,axis=0)
                    current_embedding = ast.literal_eval(sent_df['embeddings'][k])
                    if np.isnan(embeddingavg).any() or np.isnan(current_embedding).any():
                        scores.append(np.nan)
                    else:
                        dissimilarity = 1 - np.corrcoef(embeddingavg.reshape(1, -1), np.array(current_embedding).reshape(1, -1))[0,1]
                        scores.append(dissimilarity)
                    print('i is:',idx)
                    continue

        df['diff'] = scores

        df = df.dropna(subset=['diff'])

        df.to_csv(str(os.path.join(directory_path, filename)), index=False)


i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 3
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 5
i is: 5
i is: 5
i is: 5
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 7
i is: 7
i is: 7
i is: 7
i is: 7
i is: 7
i is: 7
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 9
i is: 9
i is: 9
i is: 9
i is: 9
i is: 10
i is: 10
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 12
i is: 12
i is: 12
i is: 12
i is: 12
i is: 12
i is: 12
i is: 12
i is: 13
i is: 13
i is: 13
i is: 13
i is: 13
i is: 13
i is: 13
i is: 13
i is: 14
i is: 14
i is: 14
i is: 14
i is: 14
i is: 14
i is: 14
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 16
i is: 16
i is: 16
i is: 16
i is: 16
i

## CLIP

In [4]:
#conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
#pip install ftfy regex tqdm
#pip install git+https://github.com/openai/CLIP.git

In [1]:
import torch
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")


url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)


In [2]:
import os
import pandas as pd

directory_path = './text_data/'
for filename in os.listdir(directory_path):
  if filename.startswith('file') and filename.endswith('.csv'):
    file_path = os.path.join(directory_path, filename)
    print(f"Processing file: {file_path}")
    file_df = pd.read_csv(file_path)
    words = file_df['word']

  embeddings = []
  for w in words:
    inputs = processor(text=w, images=image, return_tensors="pt")
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image # this is the image-text similarity score
    embeddings.append(outputs.text_embeds)

  emb_list = []
  for i, tensor in enumerate(embeddings):
    emb = tensor.flatten().tolist()
    emb_list.append(emb)

  file_df['CLIP_embeddings'] = emb_list

  file_df.to_csv(str(os.path.join(directory_path, filename)), index=False)



Processing file: ./text_data/file10_diff.csv


KeyboardInterrupt: 

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pandas as pd
import numpy as np
#The semantic similarity of each word to its preceding context is then defined by
#comparing (via a Pearson’s correlation) its 400-dimensional vector with the average of the vectors of all the preceding
#words in the corresponding sentence. And the ‘‘semantic dissimilarity’’ of the word is quantified as 1 minus this correlation
directory_path = './text_data/'
for filename in os.listdir(directory_path):
    if filename.startswith('file') and filename.endswith('.csv'):
        file_path = os.path.join(directory_path, filename)
        print(f"Processing file: {file_path}")
        df = pd.read_csv(file_path)
            
    embeddingavg = []
    embeddings = []
    df_words=df
    scores=[]

    num_sentences = df['sentence_id'].unique()
    num_sentences = [x if not np.isnan(x) else 0 for x in num_sentences]

    num_sentences = [int(element) for element in num_sentences]
    
    for id, idx in enumerate(num_sentences): #for each sentence
        #selecting all words in sentence id
        sent_df=df_words[df_words['sentence_id']==idx] #df con tutte le parole nella stessa sentence
        sent_df=sent_df.reset_index(drop=True)         #by resetting the index i get to iterate through all words

        for k,row2 in sent_df.iterrows():

            #condition for beginning of sentence
            if (k==0) and (idx==0): #beginning of trial
                scores.append(0) 
                print('i is:',idx)
                continue

                #average of previouse sentence
            if  (k==0) and (idx > 0):
                prev_sent_df=df_words[df_words['sentence_id']==num_sentences[id-1]]
                embedding=prev_sent_df['CLIP_embeddings'].reset_index(drop=True)
                embeddings=np.array([ast.literal_eval(embedding[j]) for j in range(len(embedding))])
                if len(embeddings) == 0:
                    scores.append(np.nan)
                    continue
                else:
                    embeddingavg=np.nanmean(embeddings,axis=0)
                    current_embedding = ast.literal_eval(sent_df['CLIP_embeddings'][k])
                if np.isnan(embeddingavg).any() or np.isnan(current_embedding).any():
                        scores.append(np.nan)
                else:
                        dissimilarity = 1 - np.corrcoef(embeddingavg.reshape(1, -1), np.array(current_embedding).reshape(1, -1))[0,1]
                        scores.append(dissimilarity)
                print('i is:',idx)
                continue

            #second word in trial
            tmp_df=sent_df[0:k]
            embedding=tmp_df['CLIP_embeddings'].reset_index(drop=True)
            embeddings=np.array([ast.literal_eval(embedding[j]) for j in range(len(embedding))])
            if len(embeddings) == 0:
                    scores.append(np.nan)
            else:
                embeddingavg=np.mean(embeddings,axis=0)
                current_embedding = ast.literal_eval(sent_df['CLIP_embeddings'][k])
            if np.isnan(embeddingavg).any() or np.isnan(current_embedding).any():
                    scores.append(np.nan)
            else:
                    dissimilarity = 1 - np.corrcoef(embeddingavg.reshape(1, -1), np.array(current_embedding).reshape(1, -1))[0,1]
                    scores.append(dissimilarity)
            print('i is:',idx)
            continue

    scores
    df['CLIP_diff'] = scores

    df = df.dropna(subset=['CLIP_diff'])

    df.to_csv(filename, index=False)

Processing file: ./text_data/file10_diff.csv
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 0
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 1
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 2
i is: 3
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 4
i is: 5
i is: 5
i is: 5
i is: 5
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 6
i is: 7
i is: 7
i is: 7
i is: 7
i is: 7
i is: 7
i is: 7
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 8
i is: 9
i is: 9
i is: 9
i is: 9
i is: 9
i is: 10
i is: 10
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 11
i is: 12
i is: 12
i is: 12
i is: 12
i is: 12
i is: 12
i is: 12
i is: 12
i is: 13
i is: 13
i is: 13
i is: 13
i is: 13
i is: 13
i is: 13
i is: 13
i is: 14
i is: 14
i is: 14
i is: 14
i is: 14
i is: 14
i is: 14
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i is: 15
i