In [1]:
from pandas import DataFrame, read_csv
from csv import reader
from numpy import mean, array, zeros, errstate
from collections import defaultdict
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from os import path

Load the raw, messy corpus provided by the Laboratory of Neurolinguistics and transform it into a tidy dataset

In [2]:
# Read the raw file through csv.reader with its default (comma) delimiter, so
# every row comes back as a list of comma-separated fragments.
with open(path.join('data', 'data.csv'), mode='r') as infile:
    raw_rows = list(reader(infile))

# Glue the fragments of each row back together and split on tabs instead.
# NOTE(review): fragments are joined with '' rather than ',', so any comma that
# separated two fragments is dropped from the data - confirm this is intended.
corpus = [''.join(fragments).split('\t') for fragments in raw_rows]

# The first row holds the column names; the remaining rows are the records.
header = corpus[0]
records = corpus[1:]
df = DataFrame(records, columns=header)

Number of unique words in the corpus

In [3]:
# Distinct values of the 'word.id' column; the cell displays how many there are.
unique_word_ids = df['word.id'].unique()
len(unique_word_ids)

801

Aggregate corpus data by unique words

In [4]:
# Columns collected for every lemma. The order of this list is the column
# order of the aggregated frame.
aggregated_columns = [
    'average.accuracy',
    'IA_DWELL_TIME',
    'IA_FIRST_FIXATION_DURATION',
    'IA_FIRST_RUN_DWELL_TIME',
    'IA_FIRST_RUN_FIXATION_COUNT',
    'IA_FIXATION_COUNT',
    'IA_LEGAL',
    'IA_REGRESSION_IN',
    'IA_REGRESSION_OUT_FULL',
    'IA_REGRESSION_PATH_DURATION',
    'IA_SECOND_RUN_DWELL_TIME',
    'ao',
    'IA_SELECTIVE_REGRESSION_PATH_DURATION',
    'IA_SKIP',
    'IA_SPILLOVER',
    'landing',
    'dir',
    'fixated.letter',
    'one_fix',
    'twoplus_fix',
]

# Every column uses the same reducer: the string values belonging to one lemma
# are concatenated into a single ', '-separated string.
df = (
    df.groupby('Lemma')
      .agg({name: ', '.join for name in aggregated_columns})
      .reset_index()
)

Serialize transformed dataset

In [5]:
# Write the aggregated frame to a CSV file in the current working directory.
words_csv = 'data_words.csv'
df.to_csv(words_csv)

Calculate mean value of aggregated values for each word in the dataset

In [6]:
def _mean_of_joined(cell):
    """Average the numeric tokens of one aggregated cell.

    Args:
        cell: string of comma-separated values in which missing
            observations are spelled 'NA'.

    Returns:
        The mean of the tokens that are not 'NA', or None when no token is
        left after discarding 'NA'.

    Raises:
        ValueError: if a remaining token cannot be parsed as a float.
    """
    tokens = (token.strip() for token in cell.split(','))
    numbers = [float(token) for token in tokens if token != 'NA']
    # Averaging an empty list only yields NaN plus a RuntimeWarning, so the
    # missing value is reported explicitly instead.
    return mean(numbers) if numbers else None


def _mean_or_unchanged(cell):
    """Return the mean of `cell`, or `cell` itself when it is not numeric."""
    try:
        return _mean_of_joined(cell)
    except ValueError:  # e.g. the column of words
        return cell


# Replace each column as a whole instead of writing through df[column][i]:
# chained assignment may write to a temporary copy and leave df untouched.
for column in df.columns:
    df[column] = [_mean_or_unchanged(cell) for cell in df[column]]

# Rows in which some column has no valid observation at all are discarded.
df = df.dropna()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Make a dictionary of obtained embeddings

In [7]:
# Map every lemma to the array of values that follow the first column of its
# row in df.
eye_embeddings = defaultdict()
eye_embeddings.update(
    (row['Lemma'], array(row[1:].values)) for _, row in df.iterrows()
)

Load and process the datasets of human judgements (word-pair similarity) used to evaluate the embeddings

In [8]:
def load_sim_dataset(name, verbose=False):
    """Load a word-pair dataset and keep only the pairs that have embeddings.

    Args:
        name: base name of the CSV file inside the 'data' directory
            (without the '.csv' extension).
        verbose: when True, print the percentage of rows that were dropped
            because a word is missing from `eye_embeddings`.

    Returns:
        DataFrame without rows containing missing values, restricted to rows
        whose 'word1' and 'word2' are both keys of `eye_embeddings`, with a
        fresh 0..n-1 index.
    """
    df = read_csv(path.join('data', '{}.csv'.format(name))).dropna()
    old_len = len(df)
    # Collect all uncovered rows first and drop them in a single call, rather
    # than dropping in place, one row at a time, while iterating the frame.
    uncovered = [
        label
        for label, first, second in zip(df.index, df['word1'], df['word2'])
        if first not in eye_embeddings or second not in eye_embeddings
    ]
    df = df.drop(index=uncovered)
    if verbose:
        # An empty dataset has nothing to drop; avoid dividing by zero.
        dropped = (old_len - len(df)) / old_len * 100 if old_len else 0.0
        print('Percent of dropped = {:2.1f}%'.format(dropped))
    return df.reset_index(drop=True)

In [9]:
def make_sims_dataset(dataset):
    """Compute the cosine similarity of the embeddings of every word pair.

    Args:
        dataset: DataFrame with 'word1' and 'word2' columns; every word must
            be a key of `eye_embeddings`.

    Returns:
        float32 array with one value per row of `dataset`, in row order,
        equal to 1 minus the cosine distance between the two embeddings.

    Raises:
        KeyError: if a word has no entry in `eye_embeddings`.
    """
    sims = zeros(shape=len(dataset), dtype='float32')
    word_pairs = zip(dataset['word1'], dataset['word2'])
    # Fill by row position, not by index label, so the result does not depend
    # on the frame carrying a 0..n-1 index.
    for position, (first, second) in enumerate(word_pairs):
        sims[position] = 1 - cosine(eye_embeddings[first], eye_embeddings[second])
    return sims

In [10]:
# (file name without extension, title printed above the result) per dataset.
benchmarks = (
    ('hj', 'HJ: Human Judgements of Word Pairs'),
    ('rt', 'RT: Synonyms and Hypernyms from the Thesaurus RuThes'),
    ('ae2', 'AE: Cognitive Associations from the Sociation.org Experiment'),
)

for name, printed_name in benchmarks:
    print(printed_name)
    dataset = load_sim_dataset(name, verbose=True)
    eye_sims = make_sims_dataset(dataset)
    # Rank correlation between the embedding similarities and the 'sim' column.
    correlation = spearmanr(eye_sims, dataset['sim'])
    print('Correlation: {}'.format(correlation))

HJ: Human Judgements of Word Pairs
Percent of dropped = 97.2%
Correlation: SpearmanrResult(correlation=0.1425321252592967, pvalue=0.67590154052978346)
RT: Synonyms and Hypernyms from the Thesaurus RuThes
Percent of dropped = 99.9%
Correlation: SpearmanrResult(correlation=0.057346295562877694, pvalue=0.55179510954640909)
AE: Cognitive Associations from the Sociation.org Experiment
Percent of dropped = 99.7%
Correlation: SpearmanrResult(correlation=-0.13052904375047683, pvalue=0.023288976503718622)
