<a href="https://colab.research.google.com/github/abhilasha-kumar/Connector/blob/master/search-models/search_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning git repository

In [1]:
!git clone https://github.com/abhilasha-kumar/Connector.git

Cloning into 'Connector'...
remote: Enumerating objects: 1029, done.[K
remote: Counting objects: 100% (127/127), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 1029 (delta 64), reused 66 (delta 27), pack-reused 902[K
Receiving objects: 100% (1029/1029), 76.21 MiB | 18.32 MiB/s, done.
Resolving deltas: 100% (514/514), done.
Checking out files: 100% (218/218), done.


# Importing embeddings & vocabulary

In [4]:
# import glove embeddings
import pandas as pd
import json
import numpy as np
%cd /content/Connector/connector-cogsci2021/data
representations = {}
representations['glove'] = pd.read_csv("glove_embeddings.csv").transpose().values
vocab = pd.read_csv("vocab.csv").rename(columns={"Word": "vocab_word"})
print(f"embeddings are shaped:", representations['glove'].shape)
print(f"vocab is {len(vocab)} words")
with open('boards.json', 'r') as json_file:
    boards = json.load(json_file)

/content/Connector/connector-cogsci2021/data
embeddings are shaped: (12218, 300)
vocab is 12218 words


# Importing search & RSA functions

These functions have been predefined and are stored in the search-models subdirectory in the github repository, so we directly load them here.

In [5]:
%cd /content/Connector/search-models
import search_funcs

/content/Connector/search-models


# Constructing similarity matrix & Graph

In [6]:
threshold = 0.4
sim_matrix_glove = search_funcs.search.create_similarity_matrix(representations['glove'])
Graph = search_funcs.search.create_graph(sim_matrix_glove, threshold)
board_combos = {board_name : search_funcs.RSA.compute_board_combos(board_name,boards) for board_name in boards.keys()}

# Running an example

In [8]:
target = 'exam-algebra'
w1, w2 = target.split(sep = "-")
n_steps = 5
n_walks = 50
# computing union and intersection of independent walks

u, i = search_funcs.search.union_intersection(w1,w2, n_steps, n_walks, vocab, Graph)

union_candidates = list(u.vocab_word)
int_candidates = list(i.vocab_word)

print(f"{len(u)} items in union: {union_candidates}")
print(f"{len(i)} items in intersection: {int_candidates}")

clue = 'mathematics'
n = 5
wordpairlist = search_funcs.RSA.get_wordpair_list(board_combos, 'e1_board1_words')
target_index = wordpairlist.index(target)
clue_index = list(vocab["vocab_word"]).index(clue)

print(f"for wordpair {target} and clue {clue}")

## compute predictions on FULL vocab

print(f"ON FULL VOCAB")

a = search_funcs.RSA.literal_guesser('e1_board1_words', representations, list(vocab.vocab_word) , vocab, boards)[:,clue_index]
y = search_funcs.RSA.pragmatic_speaker('e1_board1_words', 18.858, 0.004, representations,list(vocab.vocab_word) , vocab, boards)

print("literal guesser prediction is:", wordpairlist[np.argmax(a)])
top = y[target_index,:].argsort()[-n:][::-1].tolist()
top_words = [list(vocab["vocab_word"])[x] for x in top]
print(f"top {n} prag speaker predictions are:", top_words)

print(f"ON CANDIDATES")

b_union = search_funcs.RSA.literal_guesser('e1_board1_words', representations, union_candidates, vocab, boards)[:, union_candidates.index(clue)]
c_union = search_funcs.RSA.pragmatic_speaker('e1_board1_words', 18.858, 0.004, representations, union_candidates, vocab, boards)

b_int = search_funcs.RSA.literal_guesser('e1_board1_words', representations, int_candidates, vocab, boards)[:, int_candidates.index(clue)]
c_int = search_funcs.RSA.pragmatic_speaker('e1_board1_words', 18.858, 0.004, representations, int_candidates, vocab, boards)

print("GUESSER candidate UNION prediction is:", wordpairlist[np.argmax(b_union)])
print("GUESSER candidate INTERSECTION prediction is:", wordpairlist[np.argmax(b_int)])

top = c_union[target_index,:].argsort()[-n:][::-1].tolist()
top_words = [list(u.vocab_word)[x] for x in top]
print(f"top {n} SPEAKER candidate UNION are:", top_words)

top = c_int[target_index,:].argsort()[-n:][::-1].tolist()
top_words = [list(i.vocab_word)[x] for x in top]
print(f"top {n} prag SPEAKER candidate INTERSECTION are:", top_words)

print(f"SIMPLE UNION/INTERSECTION")
print(f"considering top {n} nodes visited by union and intersection...")
print(f"highly visited nodes in the union: {union_candidates[:n]}")
print(f"highly visited nodes in the intersection: {int_candidates[:n]}")



306 items in union: ['mathematics', 'examination', 'depend', 'study', 'fraction', 'reduce', 'formal', 'owe', 'patient', 'part of', 'particular', 'particle', 'parameter', 'absence', 'phenomenon', 'organize', 'opportunity', 'Olympic', 'occasion', 'obtain', 'observe', 'otherwise', 'politics', 'point', 'prepare', 'project', 'proficient', 'profession', 'procedure', 'private', 'prevent', 'predicate', 'polygon', 'precisely', 'precise', 'praise', 'postulate', 'possible', 'position', 'noun', 'normally', 'purpose', 'liquid', 'math', 'mandatory', 'male', 'main', 'logical', 'logic', 'litigation', 'linear', 'next', 'like', 'levels', 'legislation', 'lecture', 'learning', 'late', 'larger', 'mathematical', 'matrix', 'may', 'meaning', 'new', 'necessary', 'narrow', 'multiply', 'multiplication', 'multiple', 'much', 'month', 'money', 'middle', 'method', 'merit', 'mechanic', 'measurement', 'means', 'proof', 'quantum', 'quantity', 'understand', 'type', 'turn', 'trivial', 'trigonometry', 'traditional', 'top'

# Running through full dataset

Having verified the functions, we will now run these functions on the full behavioral dataset. We will need to explore some parameters for this. 



We have 2 variables:
1.   Candidates (full/subset)
2.   Pragmatics (with/without)

We have the following search models that generate candidates:
1.   Union (RW)
2.   Intersection (RW)
3.   Predication (spreading activation)

We have the following models that can be run with/without candidates and with/without pragmatics:
1.   Speaker models
    - Target+Board
    - Pragmatic speaker
2.  Guesser models 
  - Literal Guesser
  - Pragmatic Guesser







In [9]:
## import empirical clues (cleaned)
%cd /content/Connector/connector-cogsci2021/data
expdata = pd.read_csv("final_board_clues_all.csv", encoding= 'unicode_escape')

## now we run the functions on this dataset of clues

/content/Connector/connector-cogsci2021/data


In [13]:
## create boards and merge with expdata
combined_boards_df = pd.DataFrame(columns=['Experiment', 'Board','boardwords'])
combined_boards_df["Experiment"]  = ["E1"] * 10 + ["E2"] * 10
combined_boards_df["Board"] = ["TrialList" + str(i) for i in range(1,11)] * 2
combined_boards_df["boardnames"] = (['e1_board' + str(i) + '_words' for i in range(1,11)] 
                                  + ['e2_board' + str(i) + '_words' for i in range(1,11)])
combined_boards_df["boardwords"] = [boards[n] for n in combined_boards_df["boardnames"]]
print(combined_boards_df.head())

## need to get similarity matrix of these words in this order to work with
target_df = pd.read_csv("connector_wordpairs_boards.csv")
target_df["wordpair"]= target_df["Word1"]+ "-"+target_df["Word2"]
print(target_df.head())

  Experiment  ...       boardnames
0         E1  ...  e1_board1_words
1         E1  ...  e1_board2_words
2         E1  ...  e1_board3_words
3         E1  ...  e1_board4_words
4         E1  ...  e1_board5_words

[5 rows x 4 columns]
     Word1     Word2 Experiment        boardnames         wordpair
0     void     couch         E1   e1_board1_words       void-couch
1   giggle  abnormal         E1   e1_board1_words  giggle-abnormal
2     exam   algebra         E1   e1_board1_words     exam-algebra
3      tea      bean         E1  e1_board10_words         tea-bean
4  tourist    comedy         E1  e1_board10_words   tourist-comedy


In [None]:
## we want to run models with and without RSA
# models (C: #candidates):
# Random walk (steps = size(vocab) until all nodes, n_times, threshold): 
# Graph cutoff: only remove negative
# Union
# Intersection
# Run the RW until all nodes are visited * n_times = n_times lists of ordered nodes
# Where did the word appear (index of word) in the list of n_times RWs
# Reorder list based on order of visitation
# Order vs. times_visited - which is more salient?
# Predication (asymmetric) (m, k, t = 5)
# with/without RSA
# Cost could incorporate visited/index
# Old models? 
# [Target, target+board]: special case of candidates = vocab
# RSA [full vocabulary]: special case of candidates = vocab


## Models with RSA

In [15]:
rsa_optimal_params = {
    'swow' : (25.1522030761838, 0.03863169001849234),
    'glove' : (22.336514544537227, 0.039),
    'bert-sum' : (29.709602301411962, 0.031659060110267576), #-17533
}

board_optimal_params = {
    'swow' : (23.488850322875496, 1), # -13204
    'glove' : (20.952928531665275, 1), # -15774.814774380024)
    'bert-sum' : (19.983835225540847, 0.787924454045298),
}

In [14]:
def get_speaker_scores(group, speaker_word_pairs, y, y_sorted) :
    speaker_prob = []
    speaker_rank = []
    for index, row in group.iterrows():
        clue1 = row["Clue1"]
        wordpair = str(row["wordpair"]).replace(" ", "")
        wordpair_index = speaker_word_pairs.index(wordpair)
        w1_index, w2_index = [list(vocab["vocab_word"]).index(word) for word in wordpair.split('-')]
        
        # find index of clue
        if clue1 in list(vocab["vocab_word"]):
            clue_index = list(vocab["vocab_word"]).index(clue1)
            clue_probs = y[wordpair_index, clue_index]
            clue_rank = np.nonzero(y_sorted==clue_index)[1][wordpair_index]
        else:
            clue_rank = "NA"
            clue_probs = "NA"

        speaker_prob.append(clue_probs)
        speaker_rank.append(clue_rank)
    return speaker_prob, speaker_rank



In [17]:
speakerprobs_df = pd.DataFrame(columns=['representation', 'Experiment','Board', "Word1", "Word2", "Clue1", "clueCount", "wordpair", "prag_speaker_probs"])
for representation in representations.keys() :
    for index, row in combined_boards_df.iterrows():
        board = row["boardwords"]
        boardname = row["boardnames"]
        wordpairlist = search_funcs.RSA.get_wordpair_list(board_combos, boardname)
        speaker_word_pairs = target_df[(target_df["boardnames"] == row["boardnames"]) & 
                                       (target_df["Experiment"] == row["Experiment"])]["wordpair"]
        speaker_word_pairs = list(speaker_word_pairs)
        speaker_df_new = pd.DataFrame({'wordpair': speaker_word_pairs})
        params = rsa_optimal_params[representation]
        speaker_model = search_funcs.RSA.pragmatic_speaker(boardname, params[0], params[1],representations, list(vocab.vocab_word), vocab, boards)

        ## this is created at the BOARD level
        y = np.array([speaker_model[wordpairlist.index(wordpair)] for wordpair in speaker_word_pairs])
        y_sorted = np.argsort(-y)

        ## so y has 3 vectors of clue probabilities (the 3 pairs on this board)
        ## now we need to go into expdata and score the probabilities for those specific clues
        expdata_board = expdata[(expdata["Board"] == row["Board"]) & (expdata["Experiment"] == row["Experiment"])]
        speaker_prob, speaker_rank = get_speaker_scores(expdata_board, speaker_word_pairs, y, y_sorted)
        expdata_board.loc[:,"representation"] = representation
        expdata_board.loc[:,"prag_speaker_probs"] = speaker_prob
        expdata_board.loc[:,"prag_speaker_rank"] = speaker_rank
        speakerprobs_df = pd.concat([speakerprobs_df, expdata_board])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [18]:
speakerprobs_df

Unnamed: 0,representation,Experiment,Board,Word1,Word2,Clue1,clueCount,wordpair,prag_speaker_probs,prag_speaker_rank
0,glove,E1,TrialList1,exam,algebra,calculus,1,exam - algebra,0.002025,11
1,glove,E1,TrialList1,exam,algebra,equation,1,exam - algebra,0.000493,147
2,glove,E1,TrialList1,exam,algebra,knowledge,1,exam - algebra,0.000239,474
3,glove,E1,TrialList1,exam,algebra,math,22,exam - algebra,0.003153,5
4,glove,E1,TrialList1,exam,algebra,school,2,exam - algebra,0.001181,31
...,...,...,...,...,...,...,...,...,...,...
580,glove,E2,TrialList10,garage,bone,storage,5,garage - bone,0.000303,400
581,glove,E2,TrialList10,garage,bone,structure,1,garage - bone,0.000186,991
582,glove,E2,TrialList10,garage,bone,tool,1,garage - bone,0.000198,889
583,glove,E2,TrialList10,garage,bone,trash,2,garage - bone,0.000418,214


In [None]:
## for writing files
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
parentfolder = "/content/drive/My Drive/search-models/"