<a href="https://colab.research.google.com/github/abhilasha-kumar/Connector/blob/master/search-models/search_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning the git repository & installing libraries



In [None]:
# Fetch the project repository: later cells read data from Connector/connector-cogsci2021/data
# and import search_funcs from Connector/search-models.
!git clone https://github.com/abhilasha-kumar/Connector.git
# pybind11 is installed before graph-walker (presumably a build requirement of it -- confirm).
!pip install pybind11
!pip install graph-walker # fast random walk implementation from https://github.com/kerighan/graph-walker

# Importing embeddings, vocabulary, & functions file
We load the embeddings, the vocabulary, and all the search functions. The functions are predefined and stored in the `search-models` subdirectory of the GitHub repository, so we import them directly here.

In [None]:
# Load everything the notebook needs from the cloned repository: the GloVe and SWOW
# embeddings, the vocabulary, the boards, the behavioral clues and the target word pairs.
import pandas as pd
import json
import numpy as np
# All relative file names below are resolved against the repository's data folder.
%cd /content/Connector/connector-cogsci2021/data
# representations maps a model name ('glove', 'swow') to its embedding matrix.
representations = {}
# The CSVs are transposed after loading (presumably so that each row is one vocabulary word -- TODO confirm).
representations['glove'] = pd.read_csv("glove_embeddings.csv").transpose().values
representations['swow'] = pd.read_csv("swow_embeddings.csv").transpose().values
# The vocabulary column "Word" is renamed to "vocab_word", the name used by later cells.
vocab = pd.read_csv("vocab.csv").rename(columns={"Word": "vocab_word"})
print(f"embeddings are shaped:", representations['glove'].shape)
print(f"vocab is {len(vocab)} words")
# boards.json is indexed by board name (e.g. 'e1_board1_words') in later cells.
with open('boards.json', 'r') as json_file:
    boards = json.load(json_file)

## import empirical clues (cleaned)
expdata = pd.read_csv("final_board_clues_all.csv", encoding= 'unicode_escape')

## need to get similarity matrix of these words in this order to work with
target_df = pd.read_csv("connector_wordpairs_boards.csv")
# "wordpair" ("Word1-Word2") is the key used to join against candidates and clues later on.
target_df["wordpair"]= target_df["Word1"]+ "-"+target_df["Word2"]
print(target_df.head())

# Switch to the folder containing search_funcs so that the import below succeeds.
%cd /content/Connector/search-models
import search_funcs

# Running through full dataset

Having verified the functions, we will now run these functions on the full behavioral dataset. We will need to explore some parameters for this. 



We have 2 variables:
1.   Candidates (full/subset)
2.   Pragmatics (with/without)

We have the following search models that generate candidates:
1.   Union (RW)
2.   Intersection (RW)
3.   Predication (spreading activation)

We have the following models that can be run with/without candidates and with/without pragmatics:
1.   Speaker models
    - Target+Board
    - Pragmatic speaker
2.   Guesser models
    - Literal Guesser
    - Pragmatic Guesser







### Step 1: On full vocab

In [None]:
## create boards and merge with expdata
## one row per (experiment, trial list): E1 and E2 each have boards 1..10
board_numbers = list(range(1,11))
board_keys = (['e1_board' + str(i) + '_words' for i in board_numbers]
              + ['e2_board' + str(i) + '_words' for i in board_numbers])

combined_boards_df = pd.DataFrame({
    "Experiment": ["E1"] * 10 + ["E2"] * 10,
    "Board": ["TrialList" + str(i) for i in board_numbers] * 2,
    # each cell holds the full word list of that board, looked up in boards.json
    "boardwords": [boards[key] for key in board_keys],
    "boardnames": board_keys,
})

## attach the board information to every behavioral trial (left join keeps all trials)
expdata_new = expdata.merge(combined_boards_df, on=['Board', 'Experiment'], how='left')
expdata_new["wordpair"] = expdata_new["Word1"] + "-" + expdata_new["Word2"]

## pre-compute the word-pair combinations of every board once, keyed by board name
board_combos = {}
for board_name in boards.keys():
    board_combos[board_name] = search_funcs.RSA.compute_board_combos(board_name, boards)

#### Non-RSA method


In [None]:
## parameter pair per representation for the non-RSA (target+board) speaker
## NOTE(review): the meaning of the two numbers is not visible in this notebook; the RSA
## cells read the analogous pairs as (beta, cost) -- confirm the same applies here.
## The trailing numbers are kept from the original notes (presumably the fit's
## log-likelihood -- confirm).
board_optimal_params = dict([
    ('swow', (23.488850322875496, 1)), # -13204
    ('glove', (20.952928531665275, 1)), # -15774.814774380024)
    ('bert-sum', (19.983835225540847, 0.787924454045298)),
])

In [None]:
## clue scores of the non-RSA target+board speaker for the 'swow' and 'glove' representations;
## the full vocabulary (list(vocab.vocab_word)) is passed as the word list
cluescoredf = search_funcs.nonRSA.speaker_targetboard_cluescores(
    ['swow', 'glove'],
    board_optimal_params,
    board_combos,
    boards,
    list(vocab.vocab_word),
    vocab,
    representations,
    target_df,
    expdata_new,
)
cluescoredf.head()

#### Models with RSA

In [None]:
## parameter pair per representation for the RSA models; the candidate-based RSA loop
## further down unpacks each pair as (beta, cost)
rsa_optimal_params = {}
rsa_optimal_params['swow'] = (25.1522030761838, 0.03863169001849234)
rsa_optimal_params['glove'] = (22.336514544537227, 0.039)
rsa_optimal_params['bert-sum'] = (29.709602301411962, 0.031659060110267576) #-17533

In [None]:
## pragmatic (RSA) speaker over the full vocabulary (list(vocab.vocab_word) is the word list)
pragmaticspeakerdf = search_funcs.RSA.get_speaker_df(
    representations,
    combined_boards_df,
    rsa_optimal_params,
    list(vocab.vocab_word),
    vocab,
    expdata_new,
    board_combos,
    target_df,
    boards,
)
pragmaticspeakerdf.head()

In [None]:
## display the full-vocabulary pragmatic speaker results computed in the previous cell
## (that frame is called `pragmaticspeakerdf`; `pragmaticspeaker_df` is only created later,
## from a checkpoint CSV, so referring to it here fails on a fresh kernel)
pragmaticspeakerdf

## Step 2: Candidate generation (Union & Intersection)

In [None]:
## here we generate candidates for each of our wordpairs: stored in target_df
## for every pair we obtain the union and the intersection candidate sets
## from search_funcs.search.union_intersection

# keep n_walks fixed to a large number
n_walks = 1000
n_steps = 50

## should probably try a range of values for threshold
## NOTE: np.arange with a float step is sensitive to rounding at the end point;
## as written it yields the single threshold 0.2

# collect the per-pair frames and concatenate once at the end
# (growing a DataFrame with pd.concat inside the loop is quadratic)
candidate_frames = []

for modelname in ['glove']:
  # the similarity matrix depends only on the representation, so build it once per model
  sim_matrix = search_funcs.search.create_similarity_matrix(representations[modelname])
  for threshold in np.arange(0.2, 0.3, 0.1):
    print(f"threshold is {threshold}")
    # the graph has to be (re)built for every threshold; without this the loop below
    # relies on a `Graph` left over from an earlier kernel session
    Graph = search_funcs.search.create_graph(sim_matrix, threshold)
    print(f"graph has been created")
    for index, row in target_df.iterrows():
      w1 = row["Word1"]
      w2 = row["Word2"]
      print(f"for {w1} and {w2}")
      union_df, int_df = search_funcs.search.union_intersection(w1,w2, n_steps, n_walks, vocab, Graph)
      print(f"union/int calculation complete!")

      # tag both result frames with the settings that produced them
      for walk_df, walk_type in ((union_df, "union"), (int_df, "intersection")):
        walk_df["Word1"] = w1
        walk_df["Word2"] = w2
        walk_df["representation"] = modelname
        walk_df["threshold"] = threshold
        walk_df["n_walks"] = n_walks
        walk_df["type"] = walk_type
        candidate_frames.append(walk_df)

# pd.concat raises on an empty list, so fall back to an empty frame
candidates_df = pd.concat(candidate_frames) if candidate_frames else pd.DataFrame()

      

In [None]:
## reload previously saved candidates from Google Drive
## (Drive has to be mounted at /content/drive before this cell runs)
# parentfolder is defined here because this is the first cell that needs it;
# it is the same folder the later cells use
parentfolder = "/content/drive/My Drive/search-models/"
candidates_df = pd.read_csv(parentfolder+'swow_candidates.csv')
candidates_df

In [None]:
## remove candidates that are simply one of the two target words themselves
## (.copy() gives an independent frame, so adding the "wordpair" column below neither
## triggers pandas' SettingWithCopyWarning nor writes into a view of candidates_df)
df_filtered = candidates_df[(candidates_df['vocab_word'] != candidates_df["Word1"]) & (candidates_df['vocab_word'] != candidates_df["Word2"])].copy()

df_filtered["wordpair"] = df_filtered["Word1"] + "-"+df_filtered["Word2"]
## group by number of steps in the RW and union/intersection
## -> one row per (wordpair, type, n_steps) with all its candidates joined by ','
clong = df_filtered.groupby(['wordpair', 'type', 'n_steps'], as_index=False)['vocab_word'].agg(','.join)
## keep the candidates as a python list as well
clong['clue_list'] = clong['vocab_word'].str.split(',')
## bring in the columns of target_df for each wordpair (inner join on "wordpair")
clong = clong.merge(target_df, on = "wordpair")
clong

### RSA

In [None]:
## (beta, cost) pair per representation for the candidate-based RSA run
## NOTE: this re-binds rsa_optimal_params, replacing the dictionary defined for the
## full-vocabulary RSA models (the 'glove' pair differs)
swow_params = (25.1522030761838, 0.03863169001849234)
glove_params = (82.83019661384789, 0.9997249702731884)
bertsum_params = (29.709602301411962, 0.031659060110267576) #-17533

rsa_optimal_params = {
    'swow' : swow_params,
    'glove' : glove_params,
    'bert-sum' : bertsum_params,
}

In [None]:
# Google Drive folder that holds the intermediate result files of this notebook.
parentfolder = "/content/drive/My Drive/search-models/"
# Reload the pragmatic-speaker probabilities saved so far; the loop in the next cell
# appends new rows to this frame and writes it back to the same file.
pragmaticspeaker_df = pd.read_csv(parentfolder+'candidates_RSAprobs.csv')


In [None]:
## need to obtain list of candidates for each board separately

## pragmaticspeaker_df is the checkpoint loaded from candidates_RSAprobs.csv;
## this cell appends to it and re-saves it after every processed row
modelname = 'swow'

## rows of clong before this position are skipped (they are expected to be in the
## checkpoint already); set to 0 to process every row
clong_resume_row = 1278

beta, cost = rsa_optimal_params[modelname]

## the wordpair order of a board does not change, so compute it once per board
board_wordpairs = {}

for index, row in clong[clong_resume_row:].iterrows():
  boardname = row["boardnames"]
  cluelist = row["clue_list"]
  wordpair = row["wordpair"]
  # use modelname (not a hard-coded 'swow') so that the representation always matches
  # the beta/cost picked above
  clue_probs = search_funcs.RSA.pragmatic_speaker(boardname, beta, cost, representations, modelname, cluelist, vocab, boards)
  ## obtain the probs for the specific wordpair
  if boardname not in board_wordpairs:
    board_wordpairs[boardname] = list(search_funcs.RSA.compute_board_combos(boardname,boards)["wordpair"])
  mainscores = clue_probs[board_wordpairs[boardname].index(wordpair)]

  # candidates and their scores are stored as strings so that they survive the CSV round trip
  clue_board_df = pd.DataFrame({
    'Model': [modelname],
    "boardnames": [boardname],
    "type": [row["type"]],
    "n_steps": [row["n_steps"]],
    "wordpair": [wordpair],
    "cluelist": [','.join(cluelist)],
    "clue_score": [str(np.round(mainscores,10).tolist())],
  })

  pragmaticspeaker_df = pd.concat([pragmaticspeaker_df, clue_board_df])
  # checkpoint after every row so an interrupted session loses at most one row
  pragmaticspeaker_df.to_csv(parentfolder+'candidates_RSAprobs.csv', index = False)

#### obtaining probabilities for data

Now that we have the candidate-level probabilities for the pragmatic speaker, we compute the probabilities for the behavioral data.

In [None]:
## look up, for every empirical clue, the pragmatic speaker score it received in each
## candidate set (type x n_steps) that contains it
import os

rsa_probs_path = parentfolder+'finalexpdata_RSA_speaker.csv'

## rsa_probs accumulates across (possibly interrupted) sessions: continue from the saved
## checkpoint if there is one, otherwise start from an empty frame.
## (starting from an empty frame unconditionally would overwrite the checkpoint below)
if os.path.exists(rsa_probs_path):
  rsa_probs = pd.read_csv(rsa_probs_path)
else:
  rsa_probs = pd.DataFrame()

## rows of expdata before this position are skipped (they are expected to be in the
## checkpoint already); set to 0 to process every row
expdata_resume_row = 518

for index, row in expdata[expdata_resume_row:].iterrows():
  # expdata writes pairs as "w1 - w2"; the candidate tables use "w1-w2"
  wordpair = row["wordpair"].replace(" - ", "-")
  clue = row["Clue1"]
  # every candidate set (type x n_steps) of this wordpair in which the clue occurs
  clue_df = df_filtered[(df_filtered["vocab_word"] == clue) & (df_filtered["wordpair"] == wordpair)]
  for i, j in clue_df.iterrows():
    ctype = j["type"]
    # named clue_n_steps so that the random-walk setting n_steps is not overwritten
    clue_n_steps = j["n_steps"]
    clueprobs_df = pragmaticspeaker_df[(pragmaticspeaker_df["wordpair"]== wordpair) & (pragmaticspeaker_df["type"] == ctype) & (pragmaticspeaker_df["n_steps"]==clue_n_steps)]
    if(len(clueprobs_df)>0):
      # cluelist is a ','-joined string and clue_score a stringified list ("[a, b, ...]")
      # in the same order, so the clue's position selects its score
      clue_index = clueprobs_df.cluelist.iloc[0].split(',').index(clue)
      clue_score = clueprobs_df.clue_score.iloc[0][1:-1].split(', ')[clue_index]

      clue_board_df = pd.DataFrame({
        'alpha': ["RSA"],
        "type": [ctype],
        "n_steps": [clue_n_steps],
        "wordpair": [wordpair],
        "Clue1": [clue],
        "clue_score": [clue_score],
      })

      rsa_probs = pd.concat([rsa_probs, clue_board_df])
      # checkpoint after every appended row
      rsa_probs.to_csv(rsa_probs_path, index = False)

In [None]:
# Final write of the accumulated RSA speaker probabilities for the behavioral data
# (the loop in the previous cell also writes this file after every appended row).
rsa_probs.to_csv(parentfolder+'finalexpdata_RSA_speaker.csv', index = False)

# Comparing with online candidates study

In [None]:
# Google Drive folder that holds the input/result files for this comparison.
parentfolder = "/content/drive/My Drive/search-models/"
# Coded responses of the online candidates study; the next cell reads its
# wordpair_id, Level and clueOption1..clueOption8 columns.
online = pd.read_csv(parentfolder+'online_new_coded.csv')

In [None]:
## reshape the online study from wide (one column per clue option) to long, then collect
## the distinct clues of every (wordpair, clue option, level) into one row
id_columns = ["wordpair_id", "Level"]
option_columns = ["clueOption1", "clueOption2", "clueOption3", "clueOption4",
                  "clueOption5", "clueOption6", "clueOption7", "clueOption8"]

main_online = (
    online[id_columns + option_columns]
    .melt(id_vars=['wordpair_id', 'Level'], value_vars=option_columns)
    .dropna()            # options that were left empty
    .drop_duplicates()   # the same clue given repeatedly for the same pair/option/level
    .groupby(['wordpair_id', 'variable', 'Level'], as_index=False)['value']
    .agg(','.join)
)
## keep the clues as a python list as well
main_online['clue_list'] = main_online['value'].str.split(',')

In [None]:
## compare the human-generated candidates of the online study with the model candidates:
## one output row per (online row, model candidate set of that wordpair)

# rows are collected in a list and turned into a DataFrame once at the end
# (growing a DataFrame with pd.concat inside the loop is quadratic)
overlap_records = []

for index, row in main_online.iterrows():
  wordpair = row["wordpair_id"]
  clue_list = row["clue_list"]
  human_clues = set(clue_list)
  # find all candidates for that wordpair in clong
  wp_candidates = clong[clong["wordpair"] == wordpair]
  if(len(wp_candidates) == 0):
    # no match: retry with the two words swapped.
    # partition never raises, unlike unpacking split("-") on an id that does not contain
    # exactly one hyphen; such an id simply finds no candidates
    w1, _, w2 = wordpair.partition("-")
    wordpair = w2 + "-" + w1
    wp_candidates = clong[clong["wordpair"] == wordpair]

  for i, j in wp_candidates.iterrows():
    candidate_list = j["clue_list"]
    # sorted so that the saved file is identical from run to run (set order is not)
    intersection = sorted(human_clues.intersection(candidate_list))

    overlap_records.append({
      'wordpair': wordpair,
      "Level": row["Level"],
      "candidate_type": row["variable"],
      "type": j["type"],
      "n_steps": j["n_steps"],
      "n_model_candidates": len(candidate_list),
      "n_human_candidates": len(clue_list),
      "n_intersection": len(intersection),
      "intersection": str(intersection),
    })

common_candidates = pd.DataFrame(overlap_records)

common_candidates.to_csv(parentfolder+'common_candidates.csv', index = False)

# Writing files to CSV

In [None]:
# Mount Google Drive so that the "/content/drive/My Drive/..." paths used in this notebook resolve.
# NOTE(review): earlier cells already read from and write to Drive, so on a fresh runtime
# this cell has to be executed before them.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
# Save the human-vs-model candidate overlap table to Drive
# (same target file that the comparison cell writes).
parentfolder = "/content/drive/My Drive/search-models/"
common_candidates.to_csv(parentfolder+'common_candidates.csv', index = False)