In [1]:
import ujson
import numpy as np

from glob import glob
from itertools import islice

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [70]:
def read_xy(pattern, drop_features=('avg_word_len',)):
    """Lazily yield ``(x, y)`` pairs from JSON-lines files matching ``pattern``.

    Each non-blank line of every matching file is parsed as one JSON object
    that must contain an ``'x'`` entry (a dict of features) and a ``'y'``
    entry (the target).

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the input files.
    drop_features : iterable of str, optional
        Feature keys removed from each ``x`` dict before it is yielded.
        Keys that are absent from a row are ignored. Defaults to
        ``('avg_word_len',)``.

    Yields
    ------
    tuple
        ``(row['x'], row['y'])`` with the dropped keys removed from ``x``.

    Raises
    ------
    KeyError
        If a parsed row has no ``'x'`` or ``'y'`` entry.
    """
    # glob() returns matches in arbitrary, filesystem-dependent order; sort so
    # that callers taking only a prefix of the stream get the same rows on
    # every run and on every machine.
    for path in sorted(glob(pattern)):
        with open(path, encoding='utf-8') as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing empty lines instead of
                    # failing inside the JSON parser.
                    continue
                row = ujson.loads(line)
                features = row['x']
                for key in drop_features:
                    # pop() with a default: a row lacking the key is fine.
                    features.pop(key, None)
                yield features, row['y']

In [71]:
# Lazy stream of (x, y) pairs for the dev split; nothing is read from disk
# until the generator is consumed.
# NOTE(review): machine-specific absolute path -- adjust for your checkout.
DEV_GLOB = '/Users/dclure/Projects/sent-order/data/xy-dev.json/*.json'
iter_dev = read_xy(DEV_GLOB)

In [72]:
# Pull at most N_DEV pairs off the dev stream and unzip them into parallel
# tuples: x_dev holds the feature dicts, y_dev the targets.
# An empty stream makes the two-name unpacking raise ValueError.
N_DEV = 200000
x_dev, y_dev = zip(*islice(iter_dev, N_DEV))

In [73]:
# Lazy stream of (x, y) pairs for the test split; nothing is read from disk
# until the generator is consumed.
# NOTE(review): machine-specific absolute path -- adjust for your checkout.
TEST_GLOB = '/Users/dclure/Projects/sent-order/data/xy-test.json/*.json'
iter_test = read_xy(TEST_GLOB)

In [74]:
# Pull at most N_TEST pairs off the test stream and unzip them into parallel
# tuples: x_test holds the feature dicts, y_test the targets.
# An empty stream makes the two-name unpacking raise ValueError.
N_TEST = 50000
x_test, y_test = zip(*islice(iter_test, N_TEST))

In [75]:
# Vectorizer (default settings) used below to encode the per-row feature
# dicts as a numeric matrix; it is fitted on the dev split only.
dv = DictVectorizer()

In [76]:
# Fit the vectorizer on the dev feature dicts and encode them in one step.
# NOTE: this rebinds x_dev from the tuple of dicts to the encoded matrix, so
# the cell is not idempotent -- re-run the dev loading cell before re-running
# this one, otherwise fit_transform receives the already-encoded matrix.
x_dev = dv.fit_transform(x_dev)

In [77]:
# Sanity check: display the encoded dev matrix (shape, dtype, sparsity).
x_dev

<200000x6062 sparse matrix of type '<class 'numpy.float64'>'
	with 13833417 stored elements in Compressed Sparse Row format>

In [78]:
# Unfitted LinearRegression estimator with default parameters.
model = LinearRegression()

In [79]:
# Train on the encoded dev features against the dev targets. `fit` is what
# later cells call .predict() on and read .coef_ from.
# NOTE(review): presumably the same object as `model` (scikit-learn's fit()
# conventionally returns the estimator itself) -- confirm.
fit = model.fit(x_dev, y_dev)

In [80]:
# Encode the test feature dicts with the vectorizer already fitted on dev
# (transform only, no refit).
# NOTE: rebinds x_test from the tuple of dicts to the encoded matrix, so the
# cell is not idempotent -- re-run the test loading cell before re-running it.
x_test = dv.transform(x_test)

In [81]:
# Predicted targets for the encoded test rows.
y_test_pred = fit.predict(x_test)

In [82]:
# R^2 of the predictions against the held-out test targets (true values
# first, predictions second); displayed as the cell output.
r2_score(y_test, y_test_pred)

0.31155738897730334

In [83]:
# Feature names, indexed by column of the encoded matrix (and therefore by
# position in fit.coef_).
# DictVectorizer.get_feature_names() is deprecated in newer scikit-learn
# releases and later removed in favour of get_feature_names_out(); prefer the
# new method when present and fall back for older installations.
if hasattr(dv, 'get_feature_names_out'):
    names = dv.get_feature_names_out()
else:
    names = dv.get_feature_names()

In [84]:
# Coefficient indices sorted by coefficient value.
# bidx: ascending (most negative coefficients first).
# eidx: descending (most positive first). It is simply bidx reversed along
# axis 0, so reuse the existing ordering instead of running argsort twice.
bidx = fit.coef_.argsort()
eidx = bidx[::-1]

In [85]:
# Print the 100 features with the most negative coefficients,
# one "coefficient name" pair per line.
for i in bidx[:100]:
    weight, feature = fit.coef_[i], names[i]
    print(weight, feature)

-0.57870565611 _lemma3_so_-_call
-0.337775598174 _lemma2_monte_carlo
-0.337307281525 _lemma2_of_freedom
-0.307914364007 _lemma3_in_agreement_with
-0.301032002614 _lemma3_monte_-_carlo
-0.246654113832 _lemma2_with_respect
-0.216465180662 _lemma2_,_due
-0.20596629136 _lemma3_with_the_help
-0.199269716571 _lemma3_in_addition_to
-0.194646204049 _lemma1_rev
-0.18266064169 _lemma2_in_term
-0.179966330132 _lemma2_this_note
-0.173352544878 _lemma2_this_paper
-0.166188165635 _lemma2_this_article
-0.159650532729 _lemma1_\it
-0.15765020171 _lemma3_phase_diagram_of
-0.155786472617 _lemma2_-PRON-_study
-0.153201489091 _lemma2_-PRON-_present
-0.152643959117 _lemma2_and_thus
-0.151273395952 _lemma3_a_sample_of
-0.148412621313 _lemma2_-PRON-_consider
-0.147211252119 _lemma3_and_the_result
-0.144066777019 _lemma3_-_the_-
-0.144060972876 _lemma3_be_consider_.
-0.143404080285 _lemma2_-PRON-_report
-0.140539899074 _lemma3_be_argue_that
-0.138778777033 _lemma3_the_context_of
-0.13767086257 _lemma2_out_that

In [86]:
# Print the 100 features with the most positive coefficients,
# one "coefficient name" pair per line.
for i in eidx[:100]:
    weight, feature = fit.coef_[i], names[i]
    print(weight, feature)

0.521413652883 _lemma2_-_call
0.34297957958 _lemma1_carlo
0.327359208141 _lemma1_finally
0.319736879556 _lemma3_with_respect_to
0.284507634319 _lemma2_in_agreement
0.282593478006 _lemma3_degree_of_freedom
0.24403173715 _lemma1_furthermore
0.231327216922 _lemma1_conclude
0.22904624464 _lemma1_also
0.22670358571 _lemma3_in_term_of
0.21627444905 _lemma3_,_-PRON-_report
0.211809787404 _lemma2_&_gt
0.204731940122 _lemma1_further
0.196455485593 _lemma3_here_-PRON-_report
0.195602892863 _lemma3_,_due_to
0.186936765224 _lemma1_implication
0.185341925567 _lemma1_illustrate
0.180076967864 _lemma1_moreover
0.177623375734 _lemma1_conclusion
0.177115603986 _lemma3_,_-PRON-_present
0.17603946766 _lemma2_these_result
0.175240520395 _lemma3_the_help_of
0.174134983734 _lemma3_of_this_paper
0.165436457601 _lemma2_in_addition
0.161271922033 _lemma1_finding
0.159578504665 _lemma2_as_a
0.157404822436 _lemma2_conjurer_present
0.157145340925 _lemma3_the_sense_of
0.156841405677 _lemma3_-PRON-_propose_that
0.1