# Metadata exploration

This notebook explores the metadata files pulled from the Open Library.

In [59]:
import pymongo
import xmltodict
from pathlib import Path
from tqdm import tqdm
import re

## Get the path to the metadata files

In [25]:
# Directory containing the metadata files (a relative path, resolved against the working directory)
p = Path("../meta/")

In [29]:
# Materialise every *.txt entry directly under the metadata directory into a list
file_list = list(p.glob("*.txt"))

In [30]:
# Number of metadata files matched by the glob
len(file_list)

869172

## Start with a small slice of your files

In [43]:
# Helper that loads a single XML metadata file
def xml_reader(fid):
    """Parse the XML file at ``fid`` and return its ``metadata`` entry.

    The file is opened in binary mode and passed to ``xmltodict.parse``.

    Parameters
    ----------
    fid : path-like
        Location of the XML file to read.

    Returns
    -------
    The value stored under the ``'metadata'`` key of the parsed document,
    or ``None`` when that key is absent.
    """
    with open(fid, mode="rb") as xml_file:
        return xmltodict.parse(xml_file).get('metadata')

In [44]:
# Parse only the first `file_limit` files so the pipeline can be tried out quickly.
meta_data = []
file_limit = 1000

# No explicit `total`: tqdm takes it from the length of the slice, so the
# progress bar stays correct even when fewer than `file_limit` files exist.
for meta_file in tqdm(file_list[:file_limit]):
    meta_data.append(xml_reader(meta_file))

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 853.20it/s]


## Clean the data for insertion into the database

In [78]:
# Data cleaning: drop empty records and remove '.' from every top-level key
# (presumably because dotted field names are a problem for MongoDB - confirm).
meta_dict = [
    {key.replace('.', ''): value for key, value in record.items()}
    for record in meta_data
    if record
]

In [80]:
# Preview only the first few cleaned records; echoing the whole list floods the output
meta_dict[:3]

[{'language': ['Ukrainian', 'English'],
  'scanningcenter': 'alberta',
  'sponsor': 'University of Alberta Libraries',
  'contributor': 'Canadian Institute of Ukrainian Studies',
  'date': '1993',
  'subject': ['Ukrainian literature', 'Ukrainian literature'],
  'title': 'Поза традиції: антологія модерної української поезії в діяспор',
  'creator': ['Boychuk, Bohdan',
   'Fizer, John',
   'Makaryk, Irena R.',
   'Struk, Danylo H'],
  'isbn': '0920862888',
  'collection': ['cius_books',
   'ualberta_ukrainian_studies',
   'university_of_alberta_libraries',
   'toronto'],
  'shiptracking': 'cius02',
  'cat_key': '1622335',
  'patron_notes': 'The anthology was inspired by a symposium on Ukrainian émigré poetry held at the University of Ottawa in 1985. Contributors include Vasyl Barka, Marta Kalytovska, Emma Andrijevska, Natalia Livytska-Kholoda, Marta Kalytovska, Vera Vovk, Yuri Tarnawsky, Boh',
  'publisher': '[Edmonton] : Vyd-vo Kanadsʹkoho instytutu ukr. studiĭ, Alʹbertsʹkyĭ universytet

## Set up the database

In [48]:
# Create a client for the MongoDB server at localhost, port 27017
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")

In [82]:
# Handles for the 'open_library' database and its 'meta_data_info' collection
database = mongo_client.get_database('open_library')
collection = database.get_collection('meta_data_info')

In [113]:
# Destructive: removes the whole 'meta_data_info' collection
# (presumably so that re-running the insert does not duplicate documents - confirm).
collection.drop()

In [114]:
# Bulk-insert the cleaned metadata records into the collection
collection.insert_many(meta_dict)

<pymongo.results.InsertManyResult at 0x150478fca48>

In [116]:
# An empty filter document matches every document in the collection.
# The filter has to be a dict: {''} is a set literal, which pymongo rejects.
q = collection.find({})

for x in q:
    print(x)

{'_id': ObjectId('5fa1a22c8016ac27739bef8a'), 'language': ['Ukrainian', 'English'], 'scanningcenter': 'alberta', 'sponsor': 'University of Alberta Libraries', 'contributor': 'Canadian Institute of Ukrainian Studies', 'date': '1993', 'subject': ['Ukrainian literature', 'Ukrainian literature'], 'title': 'Поза традиції: антологія модерної української поезії в діяспор', 'creator': ['Boychuk, Bohdan', 'Fizer, John', 'Makaryk, Irena R.', 'Struk, Danylo H'], 'isbn': '0920862888', 'collection': ['cius_books', 'ualberta_ukrainian_studies', 'university_of_alberta_libraries', 'toronto'], 'shiptracking': 'cius02', 'cat_key': '1622335', 'patron_notes': 'The anthology was inspired by a symposium on Ukrainian émigré poetry held at the University of Ottawa in 1985. Contributors include Vasyl Barka, Marta Kalytovska, Emma Andrijevska, Natalia Livytska-Kholoda, Marta Kalytovska, Vera Vovk, Yuri Tarnawsky, Boh', 'publisher': '[Edmonton] : Vyd-vo Kanadsʹkoho instytutu ukr. studiĭ, Alʹbertsʹkyĭ universytet

## Scratch pad

In [8]:
# Scratch collection for trying out inserts and queries
test_collection = database['test']
# Throwaway sample document
d = {'website': 'www.carrefax.com', 'author': 'Daniel Hoadley', 'colour': 'purple'}
# NOTE: every run of this cell inserts another copy of the sample document
test_collection.insert_one(d)

<pymongo.results.InsertOneResult at 0x1507df5b208>

In [23]:
# Look up the scratch documents by website; the projection keeps only 'website' (plus '_id')
g = test_collection.find({'website': 'www.carrefax.com'}, {'website': 1})
for x in g:
    print(x)

{'_id': ObjectId('5fa14d948016ac27739bef86'), 'website': 'www.carrefax.com'}
{'_id': ObjectId('5fa14d9d8016ac27739bef87'), 'website': 'www.carrefax.com'}
