In [1]:
from pandas import DataFrame, read_csv, concat, options
from csv import reader
from numpy import mean, array, zeros, errstate, seterr, isfinite, random
from collections import defaultdict
from scipy.spatial.distance import cosine
from scipy.spatial import KDTree
from scipy.stats import spearmanr
from os import path
from gensim.models import KeyedVectors, Word2Vec
from pymorphy2 import MorphAnalyzer
from itertools import product
import string

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import normalize, MinMaxScaler

from nltk.tag import pos_tag, map_tag

translator = str.maketrans('', '', string.punctuation)
morph = MorphAnalyzer()
seterr(all='raise')

import numpy as np

options.mode.chained_assignment = None

Serialize transformed dataset

In [2]:
rsc = 'rsc'
provo = 'provo'
geco = 'geco'

eye_tracking_datasets = [rsc, provo, geco]
eye_tracking_data = {}

for dataset in eye_tracking_datasets:
    eye_tracking_data[dataset] = read_csv(path.join('data', 'eye-tracking', '{}.csv'.format(dataset)))

Loading and processing dataset of human judgements of embeddings

In [3]:
datasets = ['men',
            'mc-30',
            'rg-65',
            'rw',
            'yp-130',
            'verb-143',
            'mturk-771',
            'mturk-287',
            'simlex',
            'wordsim353-similarity',
            'wordsim353-relatedness',
             'simverb-3500',
             'semeval17']

languages = ['en', 'ru']

In [4]:
word_similarity = defaultdict(lambda: {})

for dataset in datasets:
    for language in languages:
        if language == 'ru' and dataset not in [ 'wordsim353-similarity', 'wordsim353-relatedness', 'simlex', 'mc-30', 'rg-65']:
            continue
        word_similarity[language][dataset] = read_csv(path.join('data', 'word-similarity', language, '{}.csv'.format(dataset))) 

In [5]:
word2vec_ru_ruscorpora = KeyedVectors.load_word2vec_format(path.join('data', 'models', 'ruwikiruscorpora-superbigrams_skipgram_300_2_2018.vec'))
word2vec_ru_araneum = KeyedVectors.load_word2vec_format(path.join('data', 'models', 'araneum_upos_skipgram_300_2_2018.vec'))

In [6]:
word2vec_en_googlenews = KeyedVectors.load_word2vec_format(path.join('data',  'models', 'googlenews.bin'), binary=True)
word2vec_en_bnc = KeyedVectors.load_word2vec_format(path.join('data',  'models', 'bnc.vec'))

# Functions

In [7]:
def add_pos_tag(word, lang='ru'):
    if lang == 'en':
        word = str(word)
        tagged = [(w, map_tag('en-ptb', 'universal', tag)) for w, tag in pos_tag([word])][0]
        return '{}_{}'.format(tagged[0], tagged[1])
    tag = str(morph.parse(word)[0].tag.POS)
    if tag == 'ADJF':
        tag = 'ADJ'
    elif tag == 'INFN':
        tag = 'VERB'
    if word == 'объем': 
        tag = 'NOUN'
    if word == 'струя':
        tag = 'NOUN'
    if word == 'чай':
        tag = 'NOUN'
    if word == 'два':
        word = 'двадцать'
        tag = 'NUM'
    return '{}_{}'.format(word, tag)

In [8]:
def load_sim_dataset(df_, embeddings, rusvectores=False, lang='ru'):
    df = df_.copy()
    old_len = len(df)
    for i, m in df.iterrows():
        if not rusvectores:
            if m['word1'] not in embeddings or m['word2'] not in embeddings:
                df.drop(i, inplace=True)
        else:
            if add_pos_tag(m['word1'], lang) not in embeddings or add_pos_tag(m['word2'], lang) not in embeddings:
                df.drop(i, inplace=True)
    return df.reset_index(drop=True), (old_len - len(df))/old_len*100, len(df)

In [9]:
def make_sims_dataset(dataset, embeddings, rusvectores=False, lang='en'):
    sims = zeros(shape=len(dataset), dtype='float32')
    for i, m in dataset.iterrows():
        if not rusvectores:
            sims[i] = 1 - cosine(embeddings[m['word1']], embeddings[m['word2']])
        else:
            sims[i] = 1 - cosine(embeddings[add_pos_tag(m['word1'], lang)], embeddings[add_pos_tag(m['word2'], lang)])
    return sims

In [10]:
def make_word2vec_dataset(dataset, model):
    sims = zeros(shape=len(dataset), dtype='float32')
    for i, m in dataset.iterrows():
        sims[i] = 1 - cosine(model[add_pos_tag(m['word1'])], model[add_pos_tag(m['word2'])])
    return sims

In [11]:
def make_word2vec_eye_dataset(dataset, model, lang='ru', pos_tags=False):
    pairs = list(product(dataset, repeat=2))
    sims_w2v = zeros(shape=len(pairs), dtype='float32')
    sims_eye = zeros(shape=len(pairs), dtype='float32')
    for i in enumerate(pairs):
        try:
            if lang == 'en':
                if pos_tags:
                    sims_w2v[i[0]] = 1 - cosine(model[add_pos_tag(i[1][0], lang)], model[add_pos_tag(i[1][1], lang)])
                    sims_eye[i[0]] = 1 - cosine(dataset[i[1][0]], dataset[i[1][1]])
                else:
                    sims_w2v[i[0]] = 1 - cosine(model[i[1][0]], model[i[1][1]])
                    sims_eye[i[0]] = 1 - cosine(dataset[i[1][0]], dataset[i[1][1]])
            else: # add pos tags
                sims_w2v[i[0]] = 1 - cosine(model[add_pos_tag(i[1][0])], model[add_pos_tag(i[1][1])])
                sims_eye[i[0]] = 1 - cosine(dataset[i[1][0]], dataset[i[1][1]])
        except KeyError:
            continue
    return sims_w2v, sims_eye

# Correlation with word similarity

In [13]:
results_ru = {'RSC': {}, 'Ruswikicorpora': {}, 'Aranea': {}}
dropped_ru = {'RSC': {}, 'Ruswikicorpora': {}, 'Aranea': {}}

eye_embeddings_rsc = {}

for i, k in eye_tracking_data[rsc].iloc[:,2:len(eye_tracking_data[rsc].columns)].iterrows():
    normalized = eye_tracking_data[rsc].loc[:,'average.accuracy':].apply(lambda x: normalize(x[:,np.newaxis], axis=0).ravel())
    x = normalized.values 
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    x = DataFrame(x_scaled)
    eye_embeddings_rsc[eye_tracking_data[rsc]['word'][i]] = array(x.loc[i].values, dtype=float)

for dataset_ in word_similarity['ru']:
    for result in results_ru.keys():
        if result == 'Ruswikicorpora':
            model = word2vec_ru_ruscorpora
            pos_tags = True
        elif result == 'Aranea':
            model = word2vec_ru_araneum
            pos_tags = True
        else:
            model = eye_embeddings_rsc
            pos_tags = False
        try:
            _dataset, percentage, amount = load_sim_dataset(word_similarity['ru'][dataset_], model, pos_tags)
            dropped_ru[result][dataset_] = '{:0.2f}%, {}'.format(percentage, amount)
            dataset = make_sims_dataset(_dataset, model, pos_tags, 'ru')
        except ZeroDivisionError:
            results_ru[result][dataset_] = 0
            continue
        results_ru[result][dataset_] = round(spearmanr(dataset, _dataset.similarity)[0], 2), round(spearmanr(dataset, _dataset.similarity)[1], 2)

In [14]:
DataFrame(results_ru)

Unnamed: 0,Aranea,RSC,Ruswikicorpora
mc-30,"(0.78, 0.0)","(nan, nan)","(0.82, 0.0)"
rg-65,"(0.76, 0.0)","(nan, nan)","(0.8, 0.0)"
simlex,"(0.39, 0.0)","(0.24, 0.29)","(0.28, 0.0)"
wordsim353-relatedness,"(0.61, 0.0)","(0.05, 0.91)","(0.57, 0.0)"
wordsim353-similarity,"(0.8, 0.0)","(-0.1, 0.87)","(0.75, 0.0)"


In [15]:
DataFrame(dropped_ru)

Unnamed: 0,Aranea,RSC,Ruswikicorpora
mc-30,"0.00%, 30","96.67%, 1","0.00%, 30"
rg-65,"0.00%, 65","98.46%, 1","0.00%, 65"
simlex,"1.97%, 946","97.82%, 21","1.55%, 950"
wordsim353-relatedness,"10.80%, 223","97.20%, 7","4.80%, 238"
wordsim353-similarity,"11.88%, 178","97.52%, 5","5.94%, 190"


In [16]:
results_en_geco = {'gaze vectors geco': {}, 'Googlenews': {}, 'BNC': {}}
dropped_en_geco = {'gaze vectors geco': {}, 'Googlenews': {}, 'BNC': {}}

eye_embeddings_geco = {}

for i, k in eye_tracking_data[geco].iloc[:,2:len(eye_tracking_data[geco].columns)].iterrows():
    normalized = eye_tracking_data[geco].loc[:,'TRIAL_TOTAL_READING_TIME':].apply(lambda x: normalize(x[:,np.newaxis], axis=0).ravel())
    x = normalized.values 
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    x = DataFrame(x_scaled)
    eye_embeddings_geco[eye_tracking_data[geco]['word'][i]] = array(x.loc[i].values, dtype=float)

for dataset_ in word_similarity['en']:
    for result in results_en_geco.keys():
        if result == 'Googlenews':
            model = word2vec_en_googlenews
            pos_tags = False
        elif result == 'BNC':
            model = word2vec_en_bnc
            pos_tags = True
        else:
            model = eye_embeddings_geco
            pos_tags = False
        try:
            _dataset, percentage, amount = load_sim_dataset(word_similarity['en'][dataset_], model, pos_tags, 'en')
            dropped_en_geco[result][dataset_] = '{:0.2f}%, {}'.format(percentage, amount)
            dataset = make_sims_dataset(_dataset, model, pos_tags, 'en')
        except ZeroDivisionError:
            results_en_geco[result][dataset_] = 0
            continue
        results_en_geco[result][dataset_] = round(spearmanr(dataset, _dataset.similarity)[0], 2), round(spearmanr(dataset, _dataset.similarity)[1], 2)

In [17]:
DataFrame(results_en_geco)

Unnamed: 0,BNC,Googlenews,gaze vectors geco
mc-30,"(0.82, 0.0)","(0.79, 0.0)","(nan, nan)"
men,"(0.76, 0.0)","(0.77, 0.0)","(-0.19, 0.4)"
mturk-287,"(0.75, 0.0)","(0.68, 0.0)","(nan, nan)"
mturk-771,"(0.69, 0.0)","(0.67, 0.0)","(0.14, 0.76)"
rg-65,"(0.8, 0.0)","(0.76, 0.0)","(nan, nan)"
rw,"(0.35, 0.0)","(0.53, 0.0)","(-0.4, 0.6)"
semeval17,"(0.74, 0.0)","(0.72, 0.0)","(1.0, 0.0)"
simlex,"(0.25, 0.0)","(0.35, 0.0)","(-0.28, 0.09)"
simverb-3500,"(0.18, 0.0)","(0.36, 0.0)","(0.06, 0.33)"
verb-143,"(0.24, 0.09)","(0.57, 0.0)","(nan, nan)"


In [18]:
DataFrame(dropped_en_geco)

Unnamed: 0,BNC,Googlenews,gaze vectors geco
mc-30,"0.00%, 30","0.00%, 30","100.00%, 0"
men,"14.60%, 2562","1.80%, 2946","99.27%, 22"
mturk-287,"48.43%, 148","4.18%, 275","99.65%, 1"
mturk-771,"5.97%, 725","0.00%, 771","99.09%, 7"
rg-65,"0.00%, 65","0.00%, 65","100.00%, 0"
rw,"69.17%, 627","10.28%, 1825","99.80%, 4"
semeval17,"40.00%, 300","25.00%, 375","99.40%, 3"
simlex,"26.13%, 738","0.00%, 999","96.30%, 37"
simverb-3500,"51.57%, 1695","0.00%, 3500","93.31%, 234"
verb-143,"60.00%, 52","3.08%, 126","100.00%, 0"


In [19]:
results_en_provo = {'gaze vectors provo': {}, 'Googlenews': {}, 'BNC': {}}
dropped_en_provo = {'gaze vectors provo': {}, 'Googlenews': {}, 'BNC': {}}

eye_embeddings_provo = {}

for i, k in eye_tracking_data[provo].iloc[:,2:len(eye_tracking_data[provo].columns)].iterrows():
    normalized = eye_tracking_data[provo].loc[:,'IA_DWELL_TIME':].apply(lambda x: normalize(x[:,np.newaxis], axis=0).ravel())
    x = normalized.values 
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    x = DataFrame(x_scaled)
    eye_embeddings_provo[eye_tracking_data[provo]['word'][i]] = array(x.loc[i].values, dtype=float)

for dataset_ in word_similarity['en']:
    for result in results_en_provo.keys():
        if result == 'Googlenews':
            model = word2vec_en_googlenews
            pos_tags = False
        elif result == 'BNC':
            model = word2vec_en_bnc
            pos_tags = True
        else:
            model = eye_embeddings_provo
            pos_tags = False
        try:
            _dataset, percentage, amount = load_sim_dataset(word_similarity['en'][dataset_], model, pos_tags, 'en')
            dropped_en_provo[result][dataset_] = '{:0.2f}%, {}'.format(percentage, amount)
            dataset = make_sims_dataset(_dataset, model, pos_tags, 'en')
        except ZeroDivisionError:
            results_en_provo[result][dataset_] = 0
            continue
        results_en_provo[result][dataset_] = round(spearmanr(dataset, _dataset.similarity)[0], 2), round(spearmanr(dataset, _dataset.similarity)[1], 2)

In [20]:
DataFrame(results_en_provo)

Unnamed: 0,BNC,Googlenews,gaze vectors provo
mc-30,"(0.82, 0.0)","(0.79, 0.0)","(nan, nan)"
men,"(0.76, 0.0)","(0.77, 0.0)","(-0.09, 0.43)"
mturk-287,"(0.75, 0.0)","(0.68, 0.0)","(-1.0, nan)"
mturk-771,"(0.69, 0.0)","(0.67, 0.0)","(0.19, 0.44)"
rg-65,"(0.8, 0.0)","(0.76, 0.0)","(nan, nan)"
rw,"(0.35, 0.0)","(0.53, 0.0)","(-1.0, nan)"
semeval17,"(0.74, 0.0)","(0.72, 0.0)","(0.77, 0.07)"
simlex,"(0.25, 0.0)","(0.35, 0.0)","(-0.09, 0.58)"
simverb-3500,"(0.18, 0.0)","(0.36, 0.0)","(0.01, 0.92)"
verb-143,"(0.24, 0.09)","(0.57, 0.0)","(nan, nan)"


In [21]:
DataFrame(dropped_en_provo)

Unnamed: 0,BNC,Googlenews,gaze vectors provo
mc-30,"0.00%, 30","0.00%, 30","100.00%, 0"
men,"14.60%, 2562","1.80%, 2946","97.43%, 77"
mturk-287,"48.43%, 148","4.18%, 275","99.30%, 2"
mturk-771,"5.97%, 725","0.00%, 771","97.54%, 19"
rg-65,"0.00%, 65","0.00%, 65","100.00%, 0"
rw,"69.17%, 627","10.28%, 1825","99.90%, 2"
semeval17,"40.00%, 300","25.00%, 375","98.80%, 6"
simlex,"26.13%, 738","0.00%, 999","95.90%, 41"
simverb-3500,"51.57%, 1695","0.00%, 3500","97.86%, 75"
verb-143,"60.00%, 52","3.08%, 126","100.00%, 0"


# Correlation with word embeddings

In [22]:
correlations_ru = defaultdict(lambda: {})
                
correlations_ru['word2vec_ru_araneum']['rsc'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_rsc, word2vec_ru_araneum))
correlations_ru['word2vec_ru_ruscorpora']['rsc'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_rsc, word2vec_ru_ruscorpora))

In [23]:
DataFrame(correlations_ru)

Unnamed: 0,word2vec_ru_araneum,word2vec_ru_ruscorpora
rsc,"(0.649707653801, 0.0)","(0.634759933137, 0.0)"


In [24]:
correlations_en = defaultdict(lambda: {})
                
correlations_en['Googlenews']['geco'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_geco, word2vec_en_googlenews, 'en'))
correlations_en['BNC']['geco'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_geco, word2vec_en_bnc, 'en', True))
correlations_en['Googlenews']['provo'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_provo, word2vec_en_googlenews, 'en'))
correlations_en['BNC']['provo'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_provo, word2vec_en_bnc, 'en', True))

In [25]:
DataFrame(correlations_en)

Unnamed: 0,BNC,Googlenews
geco,"(0.994503589483, 0.0)","(0.14823886411, 0.0)"
provo,"(0.969495089554, 0.0)","(0.230083271838, 0.0)"


# Find nearest neighbors

In [26]:
def compare_nearest_neighbours(eye_embeddings, dis_embeddings_1, dis_embeddings_2, word, pos_tags=False, lang='ru', k=2):
    vectors = []
    keys = []
    for key, vec in eye_embeddings.items():
        vectors.append(vec)
        keys.append(key) 
    tree = KDTree(vectors)
    distances = tree.query(eye_embeddings[word], k=k)
    eye_words = []
    for key in distances[1]:
        eye_words.append(keys[key])
    dis_words_1 = []
    dis_words_2 = []
    if lang == 'en':
        dis_words_1.append([word[0] for word in dis_embeddings_1.most_similar(word, topn=k)])
        dis_words_2.append([word[0].split('_')[0] for word in dis_embeddings_2.most_similar(add_pos_tag(word, lang), topn=k)])
    else:
        if pos_tags:
            dis_words_1.append([word[0].split('_')[0] for word in dis_embeddings_1.most_similar(add_pos_tag(word, lang), topn=k)])
            dis_words_2.append([word[0].split('_')[0] for word in dis_embeddings_2.most_similar(add_pos_tag(word, lang), topn=k)])
        else:
            dis_words_1.append([word[0] for word in dis_embeddings_1.most_similar(word, topn=k)])
            dis_words_2.append([word[0] for word in dis_embeddings_2.most_similar(word, topn=k)])
    return (eye_words[1], dis_words_1[0][1], dis_words_2[0][1])

In [27]:
word2vec_ru_ruscorpora.most_similar(add_pos_tag('специя'))

[('пряность_NOUN', 0.7660060524940491),
 ('различный::специя_ADJNOUN', 0.7247569561004639),
 ('другой::специя_ADJNOUN', 0.7135683298110962),
 ('смесь::специя_NOUNNOUN', 0.701219916343689),
 ('другой::пряность_ADJNOUN', 0.6963601112365723),
 ('черный::перец_ADJNOUN', 0.6822734475135803),
 ('приправа_NOUN', 0.6816896200180054),
 ('цедра_NOUN', 0.6775134205818176),
 ('кинза_NOUN', 0.6630920171737671),
 ('корица_NOUN', 0.6608870029449463)]

In [28]:
result_ru = {'target':[], 'rsc': [], 'Ruscorpora': [], 'Aranea':[]}

for word in random.choice(list(eye_embeddings_rsc.keys()), 10):
    while(True):
        try:
            result = compare_nearest_neighbours(eye_embeddings_rsc, word2vec_ru_ruscorpora, word2vec_ru_araneum, word, True)
            break
        except KeyError:
            word = random.choice(list(eye_embeddings_rsc.keys()))
    result_ru['rsc'].append(result[0])
    result_ru['Ruscorpora'].append(result[1])
    result_ru['Aranea'].append(result[2])
    result_ru['target'].append(word)
DataFrame(result_ru)

Unnamed: 0,Aranea,Ruscorpora,rsc,target
0,низкий,невысокий,можно,высокий
1,обоняние,тонкий::чутье,сук,нюх
2,тащиться,пойти,расти,идти
3,вознамериваться,решать::тогда,шлейф,решать
4,австралийский,японский,треугольный,американский
5,большущий,большой,суметь,огромный
6,детектив,знаменитый::сыщик,ожог,сыщик
7,подбородок,морщинистый::щека,ресторан,щека
8,тырить,плутовать,министр,воровать
9,пожевать,прожевывать,ресница,жевать


In [29]:
result_en = {'target':[], 'GECO': [], 'Googlenews': [], 'BNC':[]}

for word in random.choice(list(eye_embeddings_geco.keys()), 10):
    while(True):
        try:
            result = compare_nearest_neighbours(eye_embeddings_geco, word2vec_en_googlenews, word2vec_en_bnc, word, True, 'en')
            break
        except KeyError:
            word = random.choice(list(eye_embeddings_geco.keys()))
    result_en['GECO'].append(result[0])
    result_en['Googlenews'].append(result[1])
    result_en['BNC'].append(result[2])
    result_en['target'].append(word)
DataFrame(result_en)

Unnamed: 0,BNC,GECO,Googlenews,target
0,yamaichi,staying,secuirty,security
1,achievement,filed,sucess,success
2,tynecastle,hed,moving,move
3,Forrestal,dont,thestate,state
4,Cia,evidences,intel,intelligence
5,restriction,warned,imposes,impose
6,criss,call,crossed,cross
7,comment,gentleman,replies,reply
8,claymore,battering,broadsword,scimitar
9,oot::noo,came,raced,ran


In [30]:
result_en = {'target':[], 'provo': [], 'Googlenews': [], 'BNC':[]}

for word in random.choice(list(eye_embeddings_provo.keys()), 10):
    while(True):
        try:
            result = compare_nearest_neighbours(eye_embeddings_provo, word2vec_en_googlenews, word2vec_en_bnc, word, True, 'en')
            break
        except KeyError:
            word = random.choice(list(eye_embeddings_provo.keys()))
    result_en['provo'].append(result[0])
    result_en['Googlenews'].append(result[1])
    result_en['BNC'].append(result[2])
    result_en['target'].append(word)
DataFrame(result_en)

Unnamed: 0,BNC,Googlenews,provo,target
0,anyway,got,owls,get
1,unbarred,stubby_tail,treat,tail
2,bail,custoday,estimated,custody
3,horror-struck,mysteriously,listened,suddenly
4,american,americans,steroid,american
5,efficient,efficient,forming,effective
6,hard-muscled,felt,use,feel
7,enormously,immensely,forming,greatly
8,Creutzfeldt-jakob,infection,achieve,disease
9,five-storey,Exume_tried,shell,block


# Regression model

In [31]:
models_ru = [(word2vec_ru_araneum, 'Aranea'),
         (word2vec_ru_ruscorpora, 'Ruscorpora')]

results_regression_ru = defaultdict(lambda: {})

for model, model_name in models_ru:
    for feature in eye_tracking_data[rsc].columns[2:]:
        X = []
        y = []
        dst = eye_tracking_data[rsc][['word',feature]]
        for i, k in dst.iterrows():
            if add_pos_tag(k[0]) in model.vocab:
                y.append(k[1])
                X.append(model[add_pos_tag(k[0])])   
        X_train, X_test = train_test_split(X, test_size=1/4)
        y_train, y_test = train_test_split(y, test_size=1/4)
        clf = Ridge(alpha=1.0)
        clf.fit(X_train, y_train) 
        predictions = clf.predict(X_test)
        results_regression_ru[model_name][feature] = round(mean_squared_error(predictions, y_test), 2)

In [32]:
russian_regression = DataFrame({'Aranea': results_regression_ru['Aranea'], 'Ruscorpora':results_regression_ru['Ruscorpora']})

In [33]:
russian_regression.Ruscorpora.sort_values()[:10]

twoplus_fix                    0.01
IA_REGRESSION_IN               0.01
landing                        0.01
one_fix                        0.01
IA_REGRESSION_OUT_FULL         0.02
dir                            0.02
IA_SKIP                        0.02
IA_FIRST_RUN_FIXATION_COUNT    0.03
average.accuracy               0.04
IA_LEGAL                       0.09
Name: Ruscorpora, dtype: float64

In [34]:
russian_regression.Aranea.sort_values()[:10]

twoplus_fix                    0.01
IA_REGRESSION_IN               0.01
IA_REGRESSION_OUT_FULL         0.01
landing                        0.01
dir                            0.02
IA_SKIP                        0.02
one_fix                        0.02
IA_FIRST_RUN_FIXATION_COUNT    0.03
average.accuracy               0.04
IA_LEGAL                       0.09
Name: Aranea, dtype: float64

In [35]:
models_en = [(word2vec_en_googlenews, 'Googlenews'),
             (word2vec_en_bnc, 'BNC')]

results_regression_en = defaultdict(lambda: {})

for model, model_name in models_en:
    for feature in eye_tracking_data[geco].columns[2:]:
        X = []
        y = []
        dst = eye_tracking_data[geco][['word',feature]]
        for i, k in dst.iterrows():
            if model_name != 'Googlenews':
                if add_pos_tag(k[0], 'en') in model.vocab:
                    y.append(k[1])
                    X.append(model[add_pos_tag(k[0], 'en')]) 
            else:
                if k[0] in model.vocab:
                    y.append(k[1])
                    X.append(model[k[0]])
        X_train, X_test = train_test_split(X, test_size=1/4)
        y_train, y_test = train_test_split(y, test_size=1/4)
        clf = Ridge(alpha=1.0)
        clf.fit(X_train, y_train) 
        predictions = clf.predict(X_test)
        results_regression_en[model_name][feature] = mean_squared_error(predictions, y_test)

In [36]:
geco_regression = DataFrame({'Googlenews':results_regression_en['Googlenews'], 'BNC':results_regression_en['BNC']})

In [37]:
geco_regression.BNC.sort_values()[:10]

WORD_THIRD_RUN_FIXATION_%         1.750988e-07
WORD_SECOND_RUN_FIXATION_%        3.380094e-06
WORD_FIRST_RUN_FIXATION_%         1.403519e-05
WORD_FIXATION_%                   2.339347e-05
WORD_TOTAL_READING_TIME_%         2.536115e-05
WORD_THIRD_RUN_FIXATION_COUNT     2.387210e-03
WORD_SECOND_RUN_FIXATION_COUNT    2.026677e-02
WORD_SKIP                         2.625370e-02
WORD_FIRST_FIX_PROGRESSIVE        3.220510e-02
WORD_THIRD_FIXATION_RUN           3.576946e-02
Name: BNC, dtype: float64

In [38]:
geco_regression.Googlenews.sort_values()[:10]

WORD_THIRD_RUN_FIXATION_%         5.429686e-07
WORD_SECOND_RUN_FIXATION_%        4.403462e-06
WORD_FIRST_RUN_FIXATION_%         1.461357e-05
WORD_FIXATION_%                   2.941411e-05
WORD_TOTAL_READING_TIME_%         4.257606e-05
WORD_THIRD_RUN_FIXATION_COUNT     4.354629e-03
WORD_FIRST_FIX_PROGRESSIVE        3.284470e-02
WORD_SKIP                         3.551326e-02
WORD_SECOND_RUN_FIXATION_COUNT    3.793351e-02
WORD_THIRD_FIXATION_RUN           4.760168e-02
Name: Googlenews, dtype: float64

In [39]:
models_en = [(word2vec_en_googlenews, 'Googlenews'),
             (word2vec_en_bnc, 'BNC')]

results_regression_en = defaultdict(lambda: {})

for model, model_name in models_en:
    for feature in eye_tracking_data[provo].columns[2:]:
        X = []
        y = []
        dst = eye_tracking_data[provo][['word',feature]]
        for i, k in dst.iterrows():
            if model_name != 'Googlenews':
                if add_pos_tag(k[0], 'en') in model.vocab:
                    y.append(k[1])
                    X.append(model[add_pos_tag(k[0], 'en')]) 
            else:
                if k[0] in model.vocab:
                    y.append(k[1])
                    X.append(model[k[0]])
        X_train, X_test = train_test_split(X, test_size=1/4)
        y_train, y_test = train_test_split(y, test_size=1/4)
        clf = Ridge(alpha=1.0)
        clf.fit(X_train, y_train) 
        predictions = clf.predict(X_test)
        results_regression_en[model_name][feature] = mean_squared_error(predictions, y_test)

In [40]:
provo_regression = DataFrame({'Googlenews':results_regression_en['Googlenews'], 'BNC':results_regression_en['BNC']})

In [41]:
geco_regression.BNC.sort_values()[:10]

WORD_THIRD_RUN_FIXATION_%         1.750988e-07
WORD_SECOND_RUN_FIXATION_%        3.380094e-06
WORD_FIRST_RUN_FIXATION_%         1.403519e-05
WORD_FIXATION_%                   2.339347e-05
WORD_TOTAL_READING_TIME_%         2.536115e-05
WORD_THIRD_RUN_FIXATION_COUNT     2.387210e-03
WORD_SECOND_RUN_FIXATION_COUNT    2.026677e-02
WORD_SKIP                         2.625370e-02
WORD_FIRST_FIX_PROGRESSIVE        3.220510e-02
WORD_THIRD_FIXATION_RUN           3.576946e-02
Name: BNC, dtype: float64

In [42]:
geco_regression.Googlenews.sort_values()[:10]

WORD_THIRD_RUN_FIXATION_%         5.429686e-07
WORD_SECOND_RUN_FIXATION_%        4.403462e-06
WORD_FIRST_RUN_FIXATION_%         1.461357e-05
WORD_FIXATION_%                   2.941411e-05
WORD_TOTAL_READING_TIME_%         4.257606e-05
WORD_THIRD_RUN_FIXATION_COUNT     4.354629e-03
WORD_FIRST_FIX_PROGRESSIVE        3.284470e-02
WORD_SKIP                         3.551326e-02
WORD_SECOND_RUN_FIXATION_COUNT    3.793351e-02
WORD_THIRD_FIXATION_RUN           4.760168e-02
Name: Googlenews, dtype: float64