In [407]:
from pandas import DataFrame, read_csv, concat, options
from csv import reader
from numpy import mean, array, zeros, errstate, seterr, isfinite, random
from collections import defaultdict
from scipy.spatial.distance import cosine
from scipy.spatial import KDTree
from scipy.stats import spearmanr
from os import path
from gensim.models import KeyedVectors, Word2Vec
from pymorphy2 import MorphAnalyzer
from itertools import product
import string

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

from nltk.tag import pos_tag, map_tag

translator = str.maketrans('', '', string.punctuation)
morph = MorphAnalyzer()
seterr(all='raise')

options.mode.chained_assignment = None

Serialize transformed dataset

In [2]:
rsc = 'rsc'
provo = 'provo'
geco = 'geco'

eye_tracking_datasets = [rsc, provo, geco]
eye_tracking_data = {}

for dataset in eye_tracking_datasets:
    eye_tracking_data[dataset] = read_csv(path.join('data', 'eye-tracking', '{}.csv'.format(dataset)))

Loading and processing dataset of human judgements of embeddings

In [134]:
datasets = ['men',
            'mc-30',
            'rg-65',
            'rw',
            'yp-130',
            'verb-143',
            'mturk-771',
            'mturk-287',
            'simlex',
            'wordsim353-similarity',
            'wordsim353-relatedness',
             # 'simverb-3500',
             'semeval17']

languages = ['en', 'ru']

In [371]:
word_similarity = defaultdict(lambda: {})

for dataset in datasets:
    for language in languages:
        if language == 'ru' and dataset not in [ 'wordsim353-similarity', 'wordsim353-relatedness', 'simlex', 'mc-30', 'rg-65']:
            continue
        word_similarity[language][dataset] = read_csv(path.join('data', 'word-similarity', language, '{}.csv'.format(dataset))) 

In [329]:
word2vec_ru_ruscorpora = KeyedVectors.load_word2vec_format(path.join('data', 'models', 'ruwikiruscorpora-superbigrams_skipgram_300_2_2018.vec'))
word2vec_ru_araneum = KeyedVectors.load_word2vec_format(path.join('data', 'models', 'araneum_upos_skipgram_300_2_2018.vec'))

In [334]:
word2vec_en_googlenews = KeyedVectors.load_word2vec_format(path.join('data',  'models', 'googlenews.bin'), binary=True)
word2vec_en_bnc = KeyedVectors.load_word2vec_format(path.join('data',  'models', 'bnc.vec'))

# Functions

In [383]:
def add_pos_tag(word, lang='ru'):
    if lang == 'en':
        word = str(word)
        tagged = [(w, map_tag('en-ptb', 'universal', tag)) for w, tag in pos_tag([word])][0]
        return '{}_{}'.format(tagged[0], tagged[1])
    tag = str(morph.parse(word)[0].tag.POS)
    if tag == 'ADJF':
        tag = 'ADJ'
    elif tag == 'INFN':
        tag = 'VERB'
    if word == 'объем': 
        tag = 'NOUN'
    if word == 'струя':
        tag = 'NOUN'
    if word == 'чай':
        tag = 'NOUN'
    if word == 'два':
        word = 'двадцать'
        tag = 'NUM'
    return '{}_{}'.format(word, tag)

In [374]:
def load_sim_dataset(df_, embeddings, rusvectores=False, lang='ru'):
    df = df_.copy()
    old_len = len(df)
    for i, m in df.iterrows():
        if not rusvectores:
            if not m['word1'] in embeddings or not m['word2'] in embeddings:
                df.drop(i, inplace=True)
        else:
            if not add_pos_tag(m['word1'], lang) in embeddings or not add_pos_tag(m['word2'], lang) in embeddings:
                df.drop(i, inplace=True)
    return df.reset_index(drop=True), (old_len - len(df))/old_len*100, len(df)

In [376]:
def make_sims_dataset(dataset, embeddings, rusvectores=False, lang='en'):
    sims = zeros(shape=len(dataset), dtype='float32')
    for i, m in dataset.iterrows():
        if not rusvectores:
            sims[i] = 1 - cosine(embeddings[m['word1']], embeddings[m['word2']])
        else:
            sims[i] = 1 - cosine(embeddings[add_pos_tag(m['word1'], lang)], embeddings[add_pos_tag(m['word2'], lang)])
    return sims

In [252]:
def make_sims_dataset_2(dataset, embeddings, rusvectores=False):
    sims = zeros(shape=len(dataset), dtype='float32')
    for i, m in dataset.iterrows():
        sims[i] = embeddings[m['word1']] + embeddings[m['word2']]
    return sims

In [69]:
def make_word2vec_dataset(dataset, model):
    sims = zeros(shape=len(dataset), dtype='float32')
    for i, m in dataset.iterrows():
        sims[i] = 1 - cosine(model[add_pos_tag(m['word1'])], model[add_pos_tag(m['word2'])])
    return sims

In [401]:
def make_word2vec_eye_dataset(dataset, model, lang='ru', pos_tags=False):
    pairs = list(product(dataset, repeat=2))
    sims_w2v = zeros(shape=len(pairs), dtype='float32')
    sims_eye = zeros(shape=len(pairs), dtype='float32')
    for i in enumerate(pairs):
        try:
            if lang == 'en':
                if pos_tags:
                    sims_w2v[i[0]] = 1 - cosine(model[add_pos_tag(i[1][0], lang)], model[add_pos_tag(i[1][1], lang)])
                    sims_eye[i[0]] = 1 - cosine(dataset[i[1][0]], dataset[i[1][1]])
                else:
                    sims_w2v[i[0]] = 1 - cosine(model[i[1][0]], model[i[1][1]])
                    sims_eye[i[0]] = 1 - cosine(dataset[i[1][0]], dataset[i[1][1]])
            else: # add pos tags
                sims_w2v[i[0]] = 1 - cosine(model[add_pos_tag(i[1][0])], model[add_pos_tag(i[1][1])])
                sims_eye[i[0]] = 1 - cosine(dataset[i[1][0]], dataset[i[1][1]])
        except KeyError:
            continue
    return sims_w2v, sims_eye

# Correlation with word similarity

In [140]:
results_ru = {'RSC': {}, 'Ruswikicorpora': {}, 'Aranea': {}}
dropped_ru = {'RSC': {}, 'Ruswikicorpora': {}, 'Aranea': {}}

eye_embeddings_rsc = {}

for i, k in eye_tracking_data[rsc].iloc[:,1:len(eye_tracking_data[rsc].columns)].iterrows():
    eye_embeddings_rsc[k['word']] = array(k[1:].values, dtype=float)

for dataset_ in word_similarity['ru']:
    for result in results_ru.keys():
        if result == 'Ruswikicorpora':
            model = word2vec_ru_ruscorpora
            pos_tags = True
        elif result == 'Aranea':
            model = word2vec_ru_araneum
            pos_tags = True
        else:
            model = eye_embeddings_rsc
            pos_tags = False
        try:
            _dataset, percentage, amount = load_sim_dataset(word_similarity['ru'][dataset_], model, pos_tags)
            dropped_ru[result][dataset_] = '{:0.2f}%, {}'.format(percentage, amount)
            dataset = make_sims_dataset(_dataset, model, pos_tags)
        except ZeroDivisionError:
            results_ru[result][dataset_] = 0
            continue
        results_ru[result][dataset_] = spearmanr(dataset, _dataset.similarity)[0]

In [141]:
DataFrame(results_ru)

Unnamed: 0,Aranea,RSC,Ruswikicorpora
mc-30,0.78457,,0.824044
men,0.675703,0.183928,0.632679
mturk-287,0.582777,,0.616388
mturk-771,0.538461,0.788377,0.501846
rg-65,0.763959,,0.795813
rw,0.456829,0.894427,0.399639
semeval17,0.664529,1.0,0.629193
simlex,0.390513,0.042263,0.277433
verb-143,0.364818,,0.134122
wordsim353-relatedness,0.612668,0.836364,0.568158


In [142]:
DataFrame(dropped_ru)

Unnamed: 0,Aranea,RSC,Ruswikicorpora
mc-30,"0.00%, 30","96.67%, 1","0.00%, 30"
men,"16.80%, 2496","98.67%, 40","9.30%, 2721"
mturk-287,"34.49%, 188","99.65%, 1","11.85%, 253"
mturk-771,"13.36%, 668","98.31%, 13","8.56%, 705"
rg-65,"0.00%, 65","98.46%, 1","0.00%, 65"
rw,"36.63%, 1289","99.75%, 5","29.15%, 1441"
semeval17,"37.40%, 313","99.40%, 3","33.40%, 333"
simlex,"1.97%, 946","97.82%, 21","1.55%, 950"
verb-143,"50.77%, 64","100.00%, 0","9.23%, 118"
wordsim353-relatedness,"10.80%, 223","97.20%, 7","4.80%, 238"


In [394]:
results_en_geco = {'gaze vectors geco': {}, 'Googlenews': {}, 'BNC': {}}
dropped_en_geco = {'gaze vectors geco': {}, 'Googlenews': {}, 'BNC': {}}

eye_embeddings_geco = {}

for i, k in eye_tracking_data[geco].iloc[:,1:len(eye_tracking_data[geco].columns)].iterrows():
    eye_embeddings_geco[k['word']] = array(k[1:].values, dtype=float)

for dataset_ in word_similarity['en']:
    for result in results_en_geco.keys():
        if result == 'Googlenews':
            model = word2vec_en_googlenews
            pos_tags = False
        elif result == 'BNC':
            model = word2vec_en_bnc
            pos_tags = True
        else:
            model = eye_embeddings_geco
            pos_tags = False
        try:
            _dataset, percentage, amount = load_sim_dataset(word_similarity['en'][dataset_], model, pos_tags, 'en')
            dropped_en_geco[result][dataset_] = '{:0.2f}%, {}'.format(percentage, amount)
            dataset = make_sims_dataset(_dataset, model, pos_tags, 'en')
        except ZeroDivisionError:
            results_en_geco[result][dataset_] = 0
            continue
        results_en_geco[result][dataset_] = spearmanr(dataset, _dataset.similarity)[0]

In [395]:
DataFrame(results_en_geco)

Unnamed: 0,BNC,Googlenews,gaze vectors geco
mc-30,0.822207,0.788607,
men,0.764628,0.770806,0.074074
mturk-287,0.751452,0.683969,
mturk-771,0.690091,0.67131,0.678571
rg-65,0.804056,0.760783,
rw,0.34546,0.53421,-0.4
semeval17,0.741103,0.724397,0.5
simlex,0.254586,0.349608,-0.114828
verb-143,0.240806,0.569509,
wordsim353-relatedness,,0.635451,-0.4


In [396]:
DataFrame(dropped_en_geco)

Unnamed: 0,BNC,Googlenews,gaze vectors geco
mc-30,"0.00%, 30","0.00%, 30","100.00%, 0"
men,"14.60%, 2562","1.80%, 2946","99.27%, 22"
mturk-287,"48.43%, 148","4.18%, 275","99.65%, 1"
mturk-771,"5.97%, 725","0.00%, 771","99.09%, 7"
rg-65,"0.00%, 65","0.00%, 65","100.00%, 0"
rw,"69.17%, 627","10.28%, 1825","99.80%, 4"
semeval17,"40.00%, 300","25.00%, 375","99.40%, 3"
simlex,"26.13%, 738","0.00%, 999","96.30%, 37"
verb-143,"60.00%, 52","3.08%, 126","100.00%, 0"
wordsim353-relatedness,"9.09%, 230","0.40%, 252","98.42%, 4"


In [397]:
results_en_provo = {'gaze vectors provo': {}, 'Googlenews': {}, 'BNC': {}}
dropped_en_provo = {'gaze vectors provo': {}, 'Googlenews': {}, 'BNC': {}}

eye_embeddings_provo = {}

for i, k in eye_tracking_data[provo].iloc[:,1:len(eye_tracking_data[provo].columns)].iterrows():
    eye_embeddings_provo[k['word']] = array(k[1:].values, dtype=float)

for dataset_ in word_similarity['en']:
    for result in results_en_provo.keys():
        if result == 'Googlenews':
            model = word2vec_en_googlenews
            pos_tags = False
        elif result == 'BNC':
            model = word2vec_en_bnc
            pos_tags = True
        else:
            model = eye_embeddings_provo
            pos_tags = False
        try:
            _dataset, percentage, amount = load_sim_dataset(word_similarity['en'][dataset_], model, pos_tags, 'en')
            dropped_en_provo[result][dataset_] = '{:0.2f}%, {}'.format(percentage, amount)
            dataset = make_sims_dataset(_dataset, model, pos_tags, 'en')
        except ZeroDivisionError:
            results_en_provo[result][dataset_] = 0
            continue
        results_en_provo[result][dataset_] = spearmanr(dataset, _dataset.similarity)[0]

In [398]:
DataFrame(results_en_provo)

Unnamed: 0,BNC,Googlenews,gaze vectors provo
mc-30,0.822207,0.788607,
men,0.764628,0.770806,-0.052094
mturk-287,0.751452,0.683969,-1.0
mturk-771,0.690091,0.67131,0.25362
rg-65,0.804056,0.760783,
rw,0.34546,0.53421,-1.0
semeval17,0.741103,0.724397,0.6
simlex,0.254586,0.349608,-0.117817
verb-143,0.240806,0.569509,
wordsim353-relatedness,,0.635451,0.045662


In [399]:
DataFrame(dropped_en_provo)

Unnamed: 0,BNC,Googlenews,gaze vectors provo
mc-30,"0.00%, 30","0.00%, 30","100.00%, 0"
men,"14.60%, 2562","1.80%, 2946","97.43%, 77"
mturk-287,"48.43%, 148","4.18%, 275","99.30%, 2"
mturk-771,"5.97%, 725","0.00%, 771","97.54%, 19"
rg-65,"0.00%, 65","0.00%, 65","100.00%, 0"
rw,"69.17%, 627","10.28%, 1825","99.90%, 2"
semeval17,"40.00%, 300","25.00%, 375","98.80%, 6"
simlex,"26.13%, 738","0.00%, 999","95.90%, 41"
verb-143,"60.00%, 52","3.08%, 126","100.00%, 0"
wordsim353-relatedness,"9.09%, 230","0.40%, 252","95.65%, 11"


# Correlation with word embeddings

In [402]:
correlations_ru = defaultdict(lambda: {})
                
correlations_ru['word2vec_ru_araneum']['rsc'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_rsc, word2vec_ru_araneum))[0]
correlations_ru['word2vec_ru_ruscorpora']['rsc'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_rsc, word2vec_ru_ruscorpora))[0]

In [403]:
DataFrame(correlations_ru)

Unnamed: 0,word2vec_ru_araneum,word2vec_ru_ruscorpora
rsc,0.64417,0.64232


In [404]:
correlations_en = defaultdict(lambda: {})
                
correlations_en['Googlenews']['geco'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_geco, word2vec_en_googlenews, 'en'))[0]
correlations_en['BNC']['geco'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_geco, word2vec_en_bnc, 'en', True))[0]
correlations_en['Googlenews']['provo'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_provo, word2vec_en_googlenews, 'en'))[0]
correlations_en['BNC']['provo'] = spearmanr(*make_word2vec_eye_dataset(eye_embeddings_provo, word2vec_en_bnc, 'en', True))[0]

In [405]:
DataFrame(correlations_en)

Unnamed: 0,BNC,Googlenews
geco,0.994496,0.118627
provo,0.969176,0.237111


# Find nearest neighbors

In [437]:
def compare_nearest_neighbours(eye_embeddings, dis_embeddings_1, dis_embeddings_2, word, pos_tags=False, lang='ru', k=2):
    vectors = []
    keys = []
    for key, vec in eye_embeddings.items():
        vectors.append(vec)
        keys.append(key) 
    tree = KDTree(vectors)
    distances = tree.query(eye_embeddings[word], k=k)
    eye_words = []
    for key in distances[1]:
        eye_words.append(keys[key])
    dis_words_1 = []
    dis_words_2 = []
    if lang == 'en':
        dis_words_1.append([word[0] for word in dis_embeddings_1.most_similar(word, topn=k)])
        dis_words_2.append([word[0].split('_')[0] for word in dis_embeddings_2.most_similar(add_pos_tag(word, lang), topn=k)])
    else:
        if pos_tags:
            dis_words_1.append([word[0].split('_')[0] for word in dis_embeddings_1.most_similar(add_pos_tag(word, lang), topn=k)])
            dis_words_2.append([word[0].split('_')[0] for word in dis_embeddings_2.most_similar(add_pos_tag(word, lang), topn=k)])
        else:
            dis_words_1.append([word[0] for word in dis_embeddings_1.most_similar(word, topn=k)])
            dis_words_2.append([word[0] for word in dis_embeddings_2.most_similar(word, topn=k)])
    return (eye_words[1], dis_words_1[0][1], dis_words_2[0][1])

In [435]:
result_ru = {'target':[], 'rsc': [], 'Ruscorpora': [], 'Aranea':[]}

for word in random.choice(list(eye_embeddings_rsc.keys()), 10):
    while(True):
        try:
            result = compare_nearest_neighbours(eye_embeddings_rsc, word2vec_ru_ruscorpora, word2vec_ru_araneum, word, True)
            break
        except KeyError:
            word = random.choice(list(eye_embeddings_rsc.keys()))
    result_ru['rsc'].append(result[0])
    result_ru['Ruscorpora'].append(result[1])
    result_ru['Aranea'].append(result[2])
    result_ru['target'].append(word)
DataFrame(result_ru)

Unnamed: 0,Aranea,Ruscorpora,rsc,target
0,краевой,суровый::край,хотя,край
1,лисица,герб::лис,лист,лис
2,вынимать,вынимать,досье,доставать
3,написаный,сочинять,несчастный,написать
4,б,ю,л,его
5,социум,провинциальный::общество,объяснять,общество
6,т,с,е,от
7,кинообраз,образ::относиться,обретать,образ
8,сопоставляться,анализ,соринка,сопоставление
9,ворос,вопрос::бытие,водяной,вопрос


In [444]:
result_en = {'target':[], 'GECO': [], 'Googlenews': [], 'BNC':[]}

for word in random.choice(list(eye_embeddings_geco.keys()), 10):
    while(True):
        try:
            result = compare_nearest_neighbours(eye_embeddings_geco, word2vec_en_googlenews, word2vec_en_bnc, word, True, 'en')
            break
        except KeyError:
            word = random.choice(list(eye_embeddings_geco.keys()))
    result_en['GECO'].append(result[0])
    result_en['Googlenews'].append(result[1])
    result_en['BNC'].append(result[2])
    result_en['target'].append(word)
DataFrame(result_en)

Unnamed: 0,BNC,GECO,Googlenews,target
0,bizzy,down,off,away
1,usage,in,utilize,use
2,frank,hed,postage_stamps,stamp
3,greenside::bunker,thinking,shooting,shot
4,yamaichi,flickering,secuirty,security
5,scruff,door,held,hold
6,Gargrave,besieged,vicinity,near
7,pinpoint,dim,yellowish_glow,light
8,dietician,reply,pharmacists,pharmacist
9,Marie::Enge,trying,boy,man


In [446]:
result_en = {'target':[], 'provo': [], 'Googlenews': [], 'BNC':[]}

for word in random.choice(list(eye_embeddings_provo.keys()), 10):
    while(True):
        try:
            result = compare_nearest_neighbours(eye_embeddings_provo, word2vec_en_googlenews, word2vec_en_bnc, word, True, 'en')
            break
        except KeyError:
            word = random.choice(list(eye_embeddings_provo.keys()))
    result_en['provo'].append(result[0])
    result_en['Googlenews'].append(result[1])
    result_en['BNC'].append(result[2])
    result_en['target'].append(word)
DataFrame(result_en)

Unnamed: 0,BNC,Googlenews,provo,target
0,changing,changing,lord's,change
1,lewisian::gneiss,rockers,spine,rock
2,difficult,easiest,these,easy
3,silent,deafening_silence,station,silence
4,androcentric,zygote,recognized,conception
5,painting,painter,enact,artist
6,Azumah::Nelson,crown,us,title
7,generally,widely,entering,commonly
8,manic-depressive,depression_anxiety,history,depression
9,distinct,Separate,raising,separate


# Regression model

In [449]:
models_ru = [(word2vec_ru_araneum, 'Aranea'),
         (word2vec_ru_ruscorpora, 'Ruscorpora')]

results_regression_ru = defaultdict(lambda: {})

for model, model_name in models_ru:
    for feature in eye_tracking_data[rsc].columns[2:]:
        X = []
        y = []
        dst = eye_tracking_data[rsc][['word',feature]]
        for i, k in dst.iterrows():
            if add_pos_tag(k[0]) in model.vocab:
                y.append(k[1])
                X.append(model[add_pos_tag(k[0])])   
        X_train, X_test = train_test_split(X, test_size=1/4)
        y_train, y_test = train_test_split(y, test_size=1/4)
        clf = Ridge(alpha=1.0)
        clf.fit(X_train, y_train) 
        predictions = clf.predict(X_test)
        results_regression_ru[model_name][feature] = round(mean_squared_error(predictions, y_test), 2)

In [450]:
DataFrame([results_regression_ru['Aranea'], results_regression_ru['Ruscorpora']]).transpose()

Unnamed: 0,0,1
IA_DWELL_TIME,9860.84,7656.75
IA_FIRST_FIXATION_DURATION,340.45,370.55
IA_FIRST_RUN_DWELL_TIME,1948.41,1696.79
IA_FIRST_RUN_FIXATION_COUNT,0.03,0.02
IA_FIXATION_COUNT,0.15,0.13
IA_LEGAL,0.11,0.1
IA_REGRESSION_IN,0.01,0.01
IA_REGRESSION_OUT_FULL,0.01,0.01
IA_REGRESSION_PATH_DURATION,16909.83,12976.56
IA_SECOND_RUN_DWELL_TIME,1484.53,2282.12


In [467]:
models_en = [(word2vec_en_googlenews, 'Googlenews'),
             (word2vec_en_bnc, 'BNC')]

results_regression_en = defaultdict(lambda: {})

for model, model_name in models_en:
    for feature in eye_tracking_data[geco].columns[2:]:
        X = []
        y = []
        dst = eye_tracking_data[geco][['word',feature]]
        for i, k in dst.iterrows():
            if model_name != 'Googlenews':
                if add_pos_tag(k[0], 'en') in model.vocab:
                    y.append(k[1])
                    X.append(model[add_pos_tag(k[0], 'en')]) 
            else:
                if k[0] in model.vocab:
                    y.append(k[1])
                    X.append(model[k[0]])
        X_train, X_test = train_test_split(X, test_size=1/4)
        y_train, y_test = train_test_split(y, test_size=1/4)
        clf = Ridge(alpha=1.0)
        clf.fit(X_train, y_train) 
        predictions = clf.predict(X_test)
        results_regression_en[model_name][feature] = mean_squared_error(predictions, y_test)

In [468]:
DataFrame([results_regression_en['Googlenews'], results_regression_en['BNC']]).transpose()

Unnamed: 0,0,1
TRIAL_TOTAL_READING_TIME,8225781.0,9618600.0
WORD_AVERAGE_FIX_PUPIL_SIZE,130725.4,98594.31
WORD_FIRST_FIXATION_DURATION,2241.518,1741.121
WORD_FIRST_FIXATION_INDEX,532.8916,282.0872
WORD_FIRST_FIXATION_RUN_INDEX,365.2881,285.6555
WORD_FIRST_FIXATION_TIME,42447860.0,24214830.0
WORD_FIRST_FIXATION_VISITED_WORD_COUNT,246.8631,226.6181
WORD_FIRST_FIXATION_X,42840.33,35954.58
WORD_FIRST_FIXATION_Y,16563.4,10180.34
WORD_FIRST_FIX_PROGRESSIVE,0.03714544,0.03007783


In [469]:
models_en = [(word2vec_en_googlenews, 'Googlenews'),
             (word2vec_en_bnc, 'BNC')]

results_regression_en = defaultdict(lambda: {})

for model, model_name in models_en:
    for feature in eye_tracking_data[provo].columns[2:]:
        X = []
        y = []
        dst = eye_tracking_data[provo][['word',feature]]
        for i, k in dst.iterrows():
            if model_name != 'Googlenews':
                if add_pos_tag(k[0], 'en') in model.vocab:
                    y.append(k[1])
                    X.append(model[add_pos_tag(k[0], 'en')]) 
            else:
                if k[0] in model.vocab:
                    y.append(k[1])
                    X.append(model[k[0]])
        X_train, X_test = train_test_split(X, test_size=1/4)
        y_train, y_test = train_test_split(y, test_size=1/4)
        clf = Ridge(alpha=1.0)
        clf.fit(X_train, y_train) 
        predictions = clf.predict(X_test)
        results_regression_en[model_name][feature] = mean_squared_error(predictions, y_test)

In [470]:
DataFrame([results_regression_en['Googlenews'], results_regression_en['BNC']]).transpose()

Unnamed: 0,0,1
IA_DWELL_TIME,9759.041,6781.117
IA_FIRST_FIXATION_DURATION,575.9891,510.0147
IA_FIRST_FIXATION_TIME,15538320.0,12794070.0
IA_FIRST_FIXATION_VISITED_IA_COUNT,128.0929,85.89793
IA_FIRST_FIXATION_X,146563.5,109048.5
IA_FIRST_FIXATION_Y,6531.971,6113.736
IA_FIRST_FIX_PROGRESSIVE,0.01911574,0.01651642
IA_FIRST_RUN_DWELL_TIME,3493.17,2122.07
IA_FIRST_RUN_END_TIME,15227600.0,12507160.0
IA_FIRST_RUN_FIXATION_COUNT,0.04267923,0.02770007
