## US Fed Docs Registry (OCLC numbers)

The US Fed Docs Registry: https://github.com/HTGovdocs/feddoc_oclc_nums

In [None]:
import datetime
import json
import os
import shutil


import git

### Clone data from the GitHub repository (frequently updated)

In [None]:
# Always work from a fresh checkout: delete any clone left over from a
# previous run, then clone the upstream repository again.
upstream_url = "https://github.com/HTGovdocs/feddoc_oclc_nums"
checkout_dir = "feddoc_oclc_nums"

if os.path.exists(checkout_dir):
    shutil.rmtree(checkout_dir)

print("Cloning data from Github...")
# `repo` is kept at module level; a later cell reads its head commit.
repo = git.Repo.clone_from(upstream_url, checkout_dir)

### Copy file to data directory with a manifest

In [None]:
# Name of the dataset; it is also the stem of the output data file name.
dataset_name = "feddoc_oclc_nums"
dataset_file = "data/{}.txt".format(dataset_name)

# Make sure the output folder exists. exist_ok=True replaces the separate
# "check, then create" steps and so cannot fail if the folder appears
# between the check and the creation.
os.makedirs("data", exist_ok=True)

# copy file to data folder
shutil.copyfile("feddoc_oclc_nums/feddoc_oclc_nums.txt", dataset_file)

In [None]:
# create manifest file
# The manifest is a JSON document written next to the data file. It records
# the dataset name, a description, a timestamp, the schema, the file format
# and where the data came from.
manifest = {}
manifest["name"] = "feddoc_oclc_nums"
manifest["description"] = "A daily updated list of OCLC numbers determined to be Federal Documents."

# use the latest commit as a proxy for datetime
commit = repo.head.commit
# datetime.utcfromtimestamp() is deprecated since Python 3.12. Convert with an
# explicit UTC timezone instead, then drop tzinfo so the serialized value keeps
# the same naive ISO 8601 form (no "+00:00" suffix) that was written before.
file_datetime_proxy = (
    datetime.datetime.fromtimestamp(commit.committed_date, tz=datetime.timezone.utc)
    .replace(tzinfo=None)
    .isoformat()
)
# isoformat() already returns a str, so no extra str() conversion is needed.
manifest["datetime"] = file_datetime_proxy

manifest["schema"] = {
    "oclc": "object"
}

manifest["format"] = {
    "type": "text",
    "extension": "txt",
    "header": False,
    }

manifest["data-origins"] = [{
    "origin": "https://github.com/HTGovdocs/feddoc_oclc_nums",
    "datetime": file_datetime_proxy
}]

# create manifest to accompany data
manifest_file = "data/{}.manifest.json".format(manifest["name"])
# Explicit encoding so the output does not depend on the platform default.
with open(manifest_file, 'w', encoding="utf-8") as outfile:
    json.dump(manifest, outfile, indent=4, sort_keys=True)

### Finishing up!

In [None]:
# Report completion with the current UTC time and list the files produced.
# datetime.utcnow() is deprecated since Python 3.12; take a timezone-aware
# UTC "now" and drop tzinfo so the printed timestamp keeps its previous
# naive ISO 8601 format.
completed_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
print("Completed notebook ({}).".format(completed_at.isoformat()))
print("Output created:")
print(dataset_file)
print(manifest_file)