In [1]:
import math
from os import path

import numpy as np
from pandas import DataFrame, read_csv
from scipy.spatial import KDTree
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

In [None]:
def parse_dataframe(df):
    """Split a data frame into per-column coordinates and word lists.

    The first column of ``df`` is ignored. For every remaining column:

    * the column label is read as comma-separated integers and turned
      into a numpy array (the coordinate);
    * each cell is split on single quotes and its first quoted substring
      is kept (the word).

    Returns:
        A ``(coords, words)`` tuple of two lists with one entry per
        processed column, in column order.

    Raises:
        ValueError: if a label piece is not an integer literal.
        IndexError: if a cell contains no single quote.
    """
    coords, words = [], []
    for label in df.columns[1:]:
        # The label itself encodes the coordinate, e.g. "1,2,3".
        coords.append(np.array(list(map(int, label.split(',')))))
        # Index 1 of a split on "'" is the first quoted substring.
        words.append([cell.split("'")[1] for cell in df[label]])
    return coords, words

In [None]:
# SET SIMILARITY MEASURES

def jaccard_distance(wordlist1, wordlist2):
    """Return the Jaccard index of two word collections.

    Both inputs are converted to sets, so duplicates and ordering are
    ignored. The result is ``|A & B| / |A | B|``.

    NOTE: despite the function name this is a *similarity*, not a
    distance: identical sets give 1.0 and disjoint sets give 0.0. The
    return value is kept as-is so existing callers see the same numbers.

    Args:
        wordlist1: iterable of hashable items.
        wordlist2: iterable of hashable items.

    Returns:
        A float in [0, 1]. Two empty inputs are treated as identical and
        yield 1.0 instead of raising ZeroDivisionError.
    """
    wordset1 = set(wordlist1)
    wordset2 = set(wordlist2)
    union_size = len(wordset1 | wordset2)
    if union_size == 0:
        # Both inputs are empty: nothing differs, and 0/0 must be avoided.
        return 1.0
    return float(len(wordset1 & wordset2)) / union_size

def sorensen_dice_distance(wordlist1, wordlist2):
    """Return the Sorensen-Dice coefficient of two collections.

    Two input styles are supported:

    * Word lists (any iterables of hashable items): both are converted
      to sets and the result is ``2 * |A & B| / (|A| + |B|)``, in line
      with ``jaccard_distance``.
    * Numeric or boolean numpy vectors (the only form the previous
      implementation could handle): the element-wise formula
      ``2 * sum(a AND b) / (sum(a) + sum(b))`` is used unchanged.

    NOTE: despite the function name this is a *similarity*: identical
    inputs give 1.0 and disjoint inputs give 0.0.

    Returns:
        A float. When the denominator is zero (both inputs empty or
        all-zero) the inputs are treated as identical and 1.0 is returned
        instead of nan or a ZeroDivisionError.
    """
    def is_indicator_vector(values):
        # Numeric/boolean ndarrays keep the original vector semantics.
        return isinstance(values, np.ndarray) and values.dtype.kind in 'biuf'

    if is_indicator_vector(wordlist1) and is_indicator_vector(wordlist2):
        shared = np.logical_and(wordlist1, wordlist2).sum()
        total = wordlist1.sum() + wordlist2.sum()
    else:
        wordset1 = set(wordlist1)
        wordset2 = set(wordlist2)
        shared = len(wordset1 & wordset2)
        total = len(wordset1) + len(wordset2)

    if total == 0:
        return 1.0
    return 2. * shared / total

# RANKED-LIST SIMILARITY MEASURES

def dcg(wordlist, score=0.0):
    """Accumulate a discounted cumulative gain over ``wordlist``.

    Every element is converted with ``float()`` and used as the gain of
    its position; the gain at zero-based position ``i`` is divided by
    ``log(i + 2)`` (natural logarithm). Requires the file-level
    ``import math``, which was previously missing and made every call
    fail with NameError on a fresh kernel.

    The textbook definition uses log base 2. The base only rescales the
    result by a constant, which cancels in the ratio taken by ``ndcg``.

    Args:
        wordlist: iterable of values accepted by ``float()``.
        score: starting value the gains are added to (default 0.0).

    Returns:
        ``score`` plus the sum of the discounted gains; ``score`` itself
        for an empty input.

    Raises:
        ValueError / TypeError: if an element cannot be converted by
            ``float()``.
    """
    for ind, word in enumerate(wordlist):
        # ind + 2 keeps the first discount at log(2) rather than log(1) == 0.
        score += float(word) / math.log(ind + 2)
    return score

def ndcg(wordlist1, wordlist2):
    """Return the DCG of ``wordlist2`` divided by the DCG of ``wordlist1``.

    ``wordlist1`` is used as the reference (denominator). It is not
    re-sorted into an ideal order here, so the ratio is only bounded by
    1 if the caller already passes the ideal ranking as ``wordlist1``.

    Returns:
        The ratio as a float, or 0.0 when the reference DCG is zero
        (empty or all-zero reference) instead of raising
        ZeroDivisionError.
    """
    reference_dcg = dcg(wordlist1)
    if reference_dcg == 0:
        # No gain in the reference: the ratio is undefined, report 0.
        return 0.0
    return dcg(wordlist2) / reference_dcg

def precision_at_k(wordlist1, wordlist2, k=None, precision=0.0, relevant=0.0):
    """Fraction of the first ``k`` positions where both lists agree.

    Position ``i`` counts as a hit when ``wordlist2[i] == wordlist1[i]``.
    The previous version compared against an undefined name
    (``reference``) and therefore always raised NameError.

    Args:
        wordlist1: reference ranking.
        wordlist2: ranking being evaluated.
        k: number of leading positions to inspect. ``None`` (default)
            means the full length of ``wordlist2``.
        precision: unused; kept only so existing call signatures work.
        relevant: starting hit count (default 0.0).

    Returns:
        ``relevant / k`` as a float. With ``k=None`` and an empty
        ``wordlist2`` the result is 0.0.

    Raises:
        ValueError: if an explicit ``k`` is not positive.
    """
    if k is None:
        k = len(wordlist2)
        if k == 0:
            return 0.0
    elif k <= 0:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # zip stops at the shorter sequence, so a reference shorter than k
    # contributes misses instead of raising IndexError.
    for expected, value in zip(wordlist1, wordlist2[:k]):
        if value == expected:
            relevant += 1.0
    return relevant / k

### 

def compare_wordlists(wordlist1, wordlist2, metric='jaccard', k=None):
    """Score two word lists with the metric selected by name.

    Args:
        wordlist1: first (reference) list.
        wordlist2: second list.
        metric: ``'jaccard'``, ``'ndcg'`` or ``'p@k'``.
        k: cut-off used only by ``'p@k'``; ``None`` (default) means the
            full length of ``wordlist2``. Previously no cut-off was
            passed at all, so ``'p@k'`` always failed with TypeError.

    Returns:
        Whatever the selected metric function returns.

    Raises:
        ValueError: if ``metric`` is not one of the supported names
            (previously ``None`` was returned silently).
    """
    if metric == 'jaccard':
        return jaccard_distance(wordlist1, wordlist2)
    elif metric == 'ndcg':
        return ndcg(wordlist1, wordlist2)
    elif metric == 'p@k':
        cutoff = len(wordlist2) if k is None else k
        return precision_at_k(wordlist1, wordlist2, cutoff)
    raise ValueError("unknown metric: %r" % (metric,))

In [None]:
def compare_coordinates(coord1, coord2, metric='euclidean'):
    """Distance between two coordinate vectors via scipy's ``pdist``.

    The two vectors are handed to ``pdist`` as a two-row collection, so
    the result is the condensed distance array for that single pair,
    i.e. an array holding one value rather than a bare scalar.

    Args:
        coord1: first coordinate vector.
        coord2: second coordinate vector, same length as ``coord1``.
        metric: any metric name accepted by ``pdist``.
    """
    point_pair = (coord1, coord2)
    return pdist(point_pair, metric=metric)

In [None]:
# Read data/output/brain_map.csv and split it into one coordinate array and
# one word list per data column (see parse_dataframe).
coords, words = parse_dataframe(read_csv(path.join('data', 'output', 'brain_map.csv')))

In [None]:
def calculate_distances(coords, words, curr_ind=0):
    """Compare one reference location against all other locations.

    The entry at ``curr_ind`` is the reference. Every entry whose
    coordinate equals the reference coordinate element-wise (the
    reference itself included) is skipped. For each remaining entry the
    coordinate distance and the word-list score relative to the
    reference are recorded.

    Returns:
        ``(coord_distances, word_distances)``: two parallel lists. Each
        coordinate entry is whatever ``compare_coordinates`` returns and
        each word entry whatever ``compare_wordlists`` returns with its
        default metric.
    """
    anchor_coord = coords[curr_ind]
    anchor_words = words[curr_ind]
    coord_distances, word_distances = [], []
    for position in range(len(coords)):
        candidate = coords[position]
        if not (candidate == anchor_coord).all():
            coord_distances.append(compare_coordinates(anchor_coord, candidate))
            word_distances.append(compare_wordlists(anchor_words, words[position]))
    return coord_distances, word_distances

In [None]:
# Spearman rank correlation between coordinate distance and word-list score,
# computed once per reference location.
spearmans = []

for curr_ind in range(len(coords)):
    # curr_ind must be forwarded; without it every iteration silently reused
    # reference location 0 and produced the same correlation len(coords) times.
    spearmans.append(spearmanr(*calculate_distances(coords, words, curr_ind)).correlation)

print(np.mean(np.array(spearmans)))
print(np.median(np.array(spearmans)))

In [None]:
# Build a KD-tree over the parsed coordinates.
# NOTE(review): presumably for nearest-neighbour lookups in later cells; no
# query against `tree` is visible in this part of the notebook.
tree = KDTree(coords)