In [47]:
# Start by loading some data.

from marc_embeddings.zephir import load_from_api

# Fetch three sample records by identifier. Call the imported function
# directly: only `load_from_api` is imported above, so the name `zephir` is
# not bound by this cell and `zephir.load_from_api(...)` would raise a
# NameError on a freshly started kernel.
marc_data = [
    load_from_api('txu.059173018498868'),
    load_from_api('gri.ark:/13960/t91875m5j'),
    load_from_api('mdp.39015022391018')
]

# Show what kind of object the loader hands back for each record.
print(type(marc_data[0]))

GET http://d2d-zephir-stg.cdlib.org/api/item/txu.059173018498868.json
GET http://d2d-zephir-stg.cdlib.org/api/item/gri.ark:/13960/t91875m5j.json
GET http://d2d-zephir-stg.cdlib.org/api/item/mdp.39015022391018.json


<class 'marc_embeddings.zephir.ZephirRecord'>


In [48]:
# Extract MARC fields 245 and 100 from every record, returned as a
# pandas dataframe (requested with `dataframe=True`).

from marc_embeddings.zephir import ZephirTransformer

dataframe = ZephirTransformer(
    ['245', '100'],
    dataframe=True,
).transform(marc_data)

# Confirm the return type, then render the table itself.
print(type(dataframe))
display(dataframe)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,245,100
txu.059173018498868,The aftermath of sovereignty: West Indian pers...,"Lowenthal, David."
gri.ark:/13960/t91875m5j,As sepulturas do Espinheiro / por Anselmo Braa...,"Braamcamp Freire, Anselmo, 1849-1921."
mdp.39015022391018,"Chemical information systems, edited by Janet ...","Ash, Janet E."


In [49]:
# Same extraction without `dataframe=True`: the result is a plain list of
# lists of strings (one inner list per record, one string per field).

data = ZephirTransformer(
    ['245', '100'],
).transform(marc_data)
print(data)

[['The aftermath of sovereignty: West Indian perspectives. Edited and introduced by David Lowenthal and Lambros Comitas.', 'Lowenthal, David.'], ['As sepulturas do Espinheiro / por Anselmo Braamcamp Freire.', 'Braamcamp Freire, Anselmo, 1849-1921.'], ['Chemical information systems, edited by Janet E. Ash [and] Ernest Hyde.', 'Ash, Janet E.']]


In [50]:
# Count the words that occur in the 245 field.

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

data = ZephirTransformer(['245']).transform(marc_data)

# This is a list of lists of strings: one single-element list per record.
print(data)

# This won't work, because CountVectorizer expects a list of string documents,
# not a list of lists of string documents. The failure is the point of this
# cell, so catch the specific error and show its message instead of letting
# an uncaught traceback stop a "Run All".
try:
    counts = CountVectorizer().fit_transform(data)
except AttributeError as error:
    print('{}: {}'.format(type(error).__name__, error))

[['The aftermath of sovereignty: West Indian perspectives. Edited and introduced by David Lowenthal and Lambros Comitas.'], ['As sepulturas do Espinheiro / por Anselmo Braamcamp Freire.'], ['Chemical information systems, edited by Janet E. Ash [and] Ernest Hyde.']]


AttributeError: 'list' object has no attribute 'lower'

In [51]:
from marc_embeddings.zephir import FlattenTransformer

# Pass the nested lists through FlattenTransformer first so that
# CountVectorizer receives a list of string documents and can build the
# word-count matrix. This call is expected to succeed, so it is not wrapped
# in try/except: any error should surface with its full traceback.
counts = CountVectorizer().fit_transform(FlattenTransformer().transform(data))
print(type(counts))

<class 'scipy.sparse.csr.csr_matrix'>


In [52]:
# See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
# particularly the "Advantages of the CSR format" and "Disadvantages of the CSR format" sections.

# Print the matrix shape, then its dense form so the individual counts
# are visible (one output line group per value).
print(counts.shape, counts.todense(), sep='\n')

(3, 30)
[[1 2 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 0 0 1 0 1 1]
 [0 0 1 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0]
 [0 1 0 0 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0]]


In [53]:
# Finally, why not normalize these word counts?

# See http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
from sklearn.feature_extraction.text import TfidfTransformer

normalized = TfidfTransformer().fit_transform(counts)

# Report the result's type, its shape, and the dense values, one per line.
print(type(normalized), normalized.shape, normalized.todense(), sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
(3, 30)
[[ 0.25424316  0.38671695  0.          0.          0.          0.
   0.19335847  0.          0.25424316  0.25424316  0.          0.19335847
   0.          0.          0.          0.          0.25424316  0.
   0.25424316  0.          0.25424316  0.25424316  0.25424316  0.25424316
   0.          0.          0.25424316  0.          0.25424316  0.25424316]
 [ 0.          0.          0.35355339  0.35355339  0.          0.35355339
   0.          0.          0.          0.          0.35355339  0.          0.
   0.35355339  0.35355339  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.35355339  0.35355339
   0.          0.          0.          0.        ]
 [ 0.          0.25732238  0.          0.          0.338348    0.
   0.25732238  0.338348    0.          0.          0.          0.25732238
   0.338348    0.          0.          0.338348    0.          0.338348    0.
   0.338348    0.         

In [54]:
# Every step above exposes 'fit' and/or 'transform', so the whole chain can
# be bundled into one scikit-learn Pipeline object.
from sklearn.pipeline import Pipeline

# Each entry is a (step name, estimator) pair; steps run in the order listed.
pipeline = Pipeline(steps=[
    ('Extract the MARC 245 data.', ZephirTransformer(['245'])),
    ('Remove extra [].', FlattenTransformer()),
    ('Count distinct words.', CountVectorizer()),
    ('Normalize for word frequency, inverse document frequency.', TfidfTransformer()),
])

# Show the assembled pipeline, then its steps keyed by name.
display(pipeline, pipeline.named_steps)

Pipeline(memory=None,
     steps=[('Extract the MARC 245 data.', ZephirTransformer(dataframe=None, selection=['245'])), ('Remove extra [].', FlattenTransformer()), ('Count distinct words.', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='conte...cument frequency.', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))])

{'Count distinct words.': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'Extract the MARC 245 data.': ZephirTransformer(dataframe=None, selection=['245']),
 'Normalize for word frequency, inverse document frequency.': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True),
 'Remove extra [].': FlattenTransformer()}

In [55]:
# Fit every step on the loaded records, then transform those same records.
# `fit` returns the pipeline itself, so the two calls can be chained; the
# dense result is the cell's displayed value.
pipeline.fit(marc_data).transform(marc_data).todense()

matrix([[ 0.25424316,  0.38671695,  0.        ,  0.        ,  0.        ,
          0.        ,  0.19335847,  0.        ,  0.25424316,  0.25424316,
          0.        ,  0.19335847,  0.        ,  0.        ,  0.        ,
          0.        ,  0.25424316,  0.        ,  0.25424316,  0.        ,
          0.25424316,  0.25424316,  0.25424316,  0.25424316,  0.        ,
          0.        ,  0.25424316,  0.        ,  0.25424316,  0.25424316],
        [ 0.        ,  0.        ,  0.35355339,  0.35355339,  0.        ,
          0.35355339,  0.        ,  0.        ,  0.        ,  0.        ,
          0.35355339,  0.        ,  0.        ,  0.35355339,  0.35355339,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.35355339,
          0.35355339,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.25732238,  0.        ,  0.        ,  0.338348  ,
          0.        ,  0.25732238,  

In [56]:
# Next step: read scikit-learn's tutorial on pipelines and parameter optimization (grid search).
# http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html