In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from marc_embeddings import lc, marc
import itertools

# Sampling configuration: scan the first P records of the dump and keep every
# STEP-th one, which yields about N records.
P = 1000  # number of leading records to scan
N = 1000  # target number of records to keep

# itertools.islice() rejects a step of 0, which the former int(P/N) produced
# whenever N > P. Use integer division (no float round-trip) and clamp the
# stride to at least 1, so asking for more records than are scanned simply
# keeps all P of them. For P >= N > 0 the stride is unchanged.
STEP = max(1, P // N)

records = list(
    itertools.islice(
        lc.load_from_xml('include/marc/BooksAll.2014.part01.xml'),
        0,
        P,
        STEP,
    )
)
print("Loaded %i records." % len(records))

# MARC fields to vectorize; each entry gets its own branch in the
# FeatureUnion built below.
FIELDS = marc.select(
    marc.TitleRelated.TITLE_STATEMENT,
    marc.MainEntry.PERSONAL_NAME,
)

# Hashing vectorizer over unigrams and bigrams with English stop words removed.
# The same instance is reused by every per-field branch.
hv = HashingVectorizer(stop_words='english', ngram_range=(1, 2))
# Created but not wired into the pipeline: the tf-idf step is commented out.
tfidf = TfidfTransformer()

# One (name, pipeline) pair per selected field:
# read the field -> flatten -> hash the resulting text.
field_branches = [
    (
        field,
        Pipeline([
            ('read xml', lc.LCTransformer([field])),
            ('flatten', lc.FlattenTransformer()),
            ('hash, tf-idf', Pipeline([
                ('hash', hv),
                # tf-idf step currently disabled: ('tf-idf', tfidf)
            ])),
        ]),
    )
    for field in FIELDS
]

# FeatureUnion runs every branch and combines their outputs into one matrix.
vectorizer = Pipeline([
    ('fields', FeatureUnion(field_branches)),
])

X = vectorizer.fit_transform(records)
print('Done.')

Loaded 1000 records.
Done.


In [8]:
# Print the stored entries of the first row of X, one "(row, col)  value"
# line per entry (presumably the feature vector of records[0] -- confirm).
# NOTE(review): the negative values look like HashingVectorizer's signed
# hashing (alternate_sign defaults to True), and column indices above 2**20
# suggest each field occupies its own hashed block -- confirm against the
# scikit-learn defaults in use.
print(X[0])

  (0, 114817)	-0.14907119849998599
  (0, 126699)	0.14907119849998599
  (0, 153918)	0.14907119849998599
  (0, 168408)	0.29814239699997197
  (0, 195258)	-0.14907119849998599
  (0, 269373)	-0.14907119849998599
  (0, 306380)	-0.14907119849998599
  (0, 318027)	0.29814239699997197
  (0, 334785)	-0.14907119849998599
  (0, 342790)	0.29814239699997197
  (0, 495083)	0.14907119849998599
  (0, 541244)	0.14907119849998599
  (0, 546876)	0.14907119849998599
  (0, 585123)	0.14907119849998599
  (0, 732722)	-0.4472135954999579
  (0, 733462)	-0.14907119849998599
  (0, 741852)	-0.29814239699997197
  (0, 790116)	0.14907119849998599
  (0, 877461)	0.14907119849998599
  (0, 882038)	0.14907119849998599
  (0, 905244)	-0.14907119849998599
  (0, 942212)	0.14907119849998599
  (0, 943745)	-0.14907119849998599
  (0, 978383)	-0.14907119849998599
  (0, 1002839)	0.14907119849998599
  :	:
  (0, 1395180)	-0.10369516947304253
  (0, 1431232)	-0.10369516947304253
  (0, 1470615)	-0.10369516947304253
  (0, 1475309)	-0.1036951