<a href="https://colab.research.google.com/github/abhilasha-kumar/Connector/blob/master/search-models/search_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cloning git repository

In [1]:
# Fetch the Connector repository into ./Connector (relative to the current working directory);
# later cells read its data files and import its search-models code.
# NOTE(review): re-running this cell when ./Connector already exists makes git report an error.
!git clone https://github.com/abhilasha-kumar/Connector.git

Cloning into 'Connector'...
remote: Enumerating objects: 1029, done.
remote: Counting objects: 100% (127/127), done.
remote: Compressing objects: 100% (99/99), done.
remote: Total 1029 (delta 64), reused 66 (delta 27), pack-reused 902
Receiving objects: 100% (1029/1029), 76.21 MiB | 18.32 MiB/s, done.
Resolving deltas: 100% (514/514), done.
Checking out files: 100% (218/218), done.


# Importing embeddings & vocabulary

In [4]:
# import glove embeddings
import pandas as pd
import json
import numpy as np
%cd /content/Connector/connector-cogsci2021/data
representations = {}
representations['glove'] = pd.read_csv("glove_embeddings.csv").transpose().values
vocab = pd.read_csv("vocab.csv").rename(columns={"Word": "vocab_word"})
print(f"embeddings are shaped:", representations['glove'].shape)
print(f"vocab is {len(vocab)} words")
with open('boards.json', 'r') as json_file:
    boards = json.load(json_file)

/content/Connector/connector-cogsci2021/data
embeddings are shaped: (12218, 300)
vocab is 12218 words


# Importing search & RSA functions

These functions are predefined in the `search-models` subdirectory of the GitHub repository, so we import them directly here.

In [5]:
# Switch into the repository's search-models directory before importing `search_funcs`.
# NOTE(review): the import presumably resolves from the working directory — confirm that
# no installed package of the same name shadows it.
%cd /content/Connector/search-models
import search_funcs

/content/Connector/search-models


# Constructing similarity matrix & Graph

In [6]:
# Cutoff handed to create_graph together with the similarity matrix.
# NOTE(review): presumably the minimum similarity for an edge — confirm in search_funcs.search.
threshold = 0.4

# Similarity matrix computed from the GloVe embeddings, then the graph built from it.
sim_matrix_glove = search_funcs.search.create_similarity_matrix(
    representations['glove']
)
Graph = search_funcs.search.create_graph(sim_matrix_glove, threshold)

# One compute_board_combos result per board in `boards`, keyed by board name.
board_combos = {
    board_name: search_funcs.RSA.compute_board_combos(board_name, boards)
    for board_name in boards
}

# Running an example

In [8]:
# Worked example: run the walk-based search and the RSA guesser/speaker models for a single
# word pair and a single clue, on the full vocabulary and on walk-derived candidate sets.

# --- example inputs ---
target = 'exam-algebra'            # word pair, written as "<w1>-<w2>"
w1, w2 = target.split(sep="-")
n_steps = 5                        # forwarded to union_intersection
n_walks = 50                       # forwarded to union_intersection
clue = 'mathematics'
n = 5                              # how many top-ranked items to report

# Values that were previously repeated as literals throughout this cell.
example_board = 'e1_board1_words'
# Forwarded positionally as the 2nd and 3rd arguments of pragmatic_speaker.
# NOTE(review): their meaning is defined in search_funcs.RSA — confirm before tuning.
speaker_params = (18.858, 0.004)

# --- candidate sets: union and intersection of independent walks ---
# NOTE(review): no random seed is set in this cell; if the walks are stochastic the
# candidate sets will differ between runs.
u, i = search_funcs.search.union_intersection(w1, w2, n_steps, n_walks, vocab, Graph)

union_candidates = list(u.vocab_word)
int_candidates = list(i.vocab_word)

print(f"{len(u)} items in union: {union_candidates}")
print(f"{len(i)} items in intersection: {int_candidates}")

wordpairlist = search_funcs.RSA.get_wordpair_list(board_combos, example_board)
target_index = wordpairlist.index(target)

# Build the vocabulary list once; it is reused for look-ups and as the candidate list.
vocab_words = list(vocab["vocab_word"])
clue_index = vocab_words.index(clue)

print(f"for wordpair {target} and clue {clue}")


def _guesser_scores_for_clue(candidates):
    """Return the literal-guesser column for `clue` over `candidates`.

    The guesser output is indexed as [word pair, candidate], so the returned
    column holds one score per entry of `wordpairlist`.

    Returns None when `clue` is not among `candidates` (e.g. the walks never
    reached it), instead of letting `list.index` raise a bare ValueError.
    """
    if clue not in candidates:
        return None
    guesser = search_funcs.RSA.literal_guesser(
        example_board, representations, candidates, vocab, boards
    )
    return guesser[:, candidates.index(clue)]


def _print_guesser_prediction(label, scores):
    """Print the best-scoring word pair for `scores`, or a notice when there are none."""
    if scores is None:
        print(f"GUESSER candidate {label}: clue '{clue}' is not among the candidates, no prediction")
    else:
        print(f"GUESSER candidate {label} prediction is:", wordpairlist[np.argmax(scores)])


## compute predictions on FULL vocab

print("ON FULL VOCAB")

a = search_funcs.RSA.literal_guesser(example_board, representations, vocab_words, vocab, boards)[:, clue_index]
y = search_funcs.RSA.pragmatic_speaker(example_board, *speaker_params, representations, vocab_words, vocab, boards)

print("literal guesser prediction is:", wordpairlist[np.argmax(a)])
# argsort is ascending: take the last n indices and reverse them to get best-first order
top = y[target_index, :].argsort()[-n:][::-1].tolist()
top_words = [vocab_words[x] for x in top]
print(f"top {n} prag speaker predictions are:", top_words)

## compute predictions on the walk-derived candidate sets

print("ON CANDIDATES")

b_union = _guesser_scores_for_clue(union_candidates)
c_union = search_funcs.RSA.pragmatic_speaker(example_board, *speaker_params, representations, union_candidates, vocab, boards)

b_int = _guesser_scores_for_clue(int_candidates)
c_int = search_funcs.RSA.pragmatic_speaker(example_board, *speaker_params, representations, int_candidates, vocab, boards)

_print_guesser_prediction("UNION", b_union)
_print_guesser_prediction("INTERSECTION", b_int)

# Speaker columns follow the order of the candidate list that was passed in.
top = c_union[target_index, :].argsort()[-n:][::-1].tolist()
top_words = [union_candidates[x] for x in top]
print(f"top {n} SPEAKER candidate UNION are:", top_words)

top = c_int[target_index, :].argsort()[-n:][::-1].tolist()
top_words = [int_candidates[x] for x in top]
print(f"top {n} prag SPEAKER candidate INTERSECTION are:", top_words)

## baseline: simply take the first n entries of each candidate list

print("SIMPLE UNION/INTERSECTION")
print(f"considering top {n} nodes visited by union and intersection...")
print(f"highly visited nodes in the union: {union_candidates[:n]}")
print(f"highly visited nodes in the intersection: {int_candidates[:n]}")



306 items in union: ['mathematics', 'examination', 'depend', 'study', 'fraction', 'reduce', 'formal', 'owe', 'patient', 'part of', 'particular', 'particle', 'parameter', 'absence', 'phenomenon', 'organize', 'opportunity', 'Olympic', 'occasion', 'obtain', 'observe', 'otherwise', 'politics', 'point', 'prepare', 'project', 'proficient', 'profession', 'procedure', 'private', 'prevent', 'predicate', 'polygon', 'precisely', 'precise', 'praise', 'postulate', 'possible', 'position', 'noun', 'normally', 'purpose', 'liquid', 'math', 'mandatory', 'male', 'main', 'logical', 'logic', 'litigation', 'linear', 'next', 'like', 'levels', 'legislation', 'lecture', 'learning', 'late', 'larger', 'mathematical', 'matrix', 'may', 'meaning', 'new', 'necessary', 'narrow', 'multiply', 'multiplication', 'multiple', 'much', 'month', 'money', 'middle', 'method', 'merit', 'mechanic', 'measurement', 'means', 'proof', 'quantum', 'quantity', 'understand', 'type', 'turn', 'trivial', 'trigonometry', 'traditional', 'top'

# Running through full dataset

Having verified the functions on a single example, we now run them on the full behavioral dataset. Doing so requires exploring some parameters.


In [9]:
## import empirical clues (cleaned)
%cd /content/Connector/connector-cogsci2021/data
expdata = pd.read_csv("final_board_clues_all.csv", encoding= 'unicode_escape')

## now we run the functions on this dataset of clues

/content/Connector/connector-cogsci2021/data


In [None]:
## for writing files: mount Google Drive inside the Colab runtime
from google.colab import drive
# force_remount=True asks Colab to mount again even if Drive is already mounted at this path
drive.mount('/content/drive',force_remount=True)
# Base Drive folder for output files.
# NOTE(review): presumably this folder must already exist in Drive — confirm before writing.
parentfolder = "/content/drive/My Drive/search-models/"