<a href="https://colab.research.google.com/github/aknip/Streamlit-Gradio/blob/main/Kindle%20Clippings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kindle Clippings

Converts a Kindle `My Clippings.txt` file into per-book note files and JSON.

Sources:
- https://github.com/dannberg/kindle-clippings-to-obsidian
- https://github.com/stephenfmann/kindle-clippings/tree/master

## First run:

Go to "Setup and Configuration" and check "initial_setup_mode". Then run all cells. Afterwards uncheck "initial_setup_mode".

# Setup and Configuration

In [17]:
# @title Settings
# Notebook-wide switches. The `# @param` / `# @markdown` annotations are kept
# exactly as they are because they define the form fields.

# @markdown Output only notes younger than x days
# Compared against each note's age in days in the "1. Create MD Files" cell.
max_days_old = 99 # @param {type:"number"}

# @markdown Initial Setup Mode for pip install, fetch credentials etc.
# Gates the `pip install` cell further down.
initial_setup_mode = False # @param {type:"boolean"}

# @markdown Debug Mode for extensive logging.
# NOTE(review): no cell visible in this notebook reads debug_mode - confirm it is still needed.
debug_mode = True # @param {type:"boolean"}

# @markdown iOS Mode to develop helper functions, no Gradio.
# @markdown Useful for development on iOS, eg. with Carnets App
# When True, the "1. Create MD Files" cell only prints a message and does nothing else.
ios_mode = False # @param {type:"boolean"}

In [12]:
# Working directories used by the cells below: where "My Clippings.txt" is
# read from and where the generated files are written to.
folders = {'clippings-input': 'in', 'clippings-output': 'out'}

In [13]:
# One-time dependency installation, controlled by the `initial_setup_mode` setting.
if initial_setup_mode == True:
  # dateparser is imported by the "1. Create MD Files" cell.
  !pip install dateparser
else:
  print('No initial setup.')




# Helper Functions

- **create_file_directory**: Creates a new directory if it does not exist yet. The always_delete flag forces deletion of an existing directory before it is recreated.

In [14]:
import shutil
import os

def create_file_directory(directory, always_delete=False):
  """Make sure `directory` exists, optionally starting from an empty one.

  Args:
    directory: Path of the directory to create. Missing parent directories
      are created as well.
    always_delete: If True, an existing directory is removed recursively
      first, so the resulting directory is always empty.

  Examples:
    create_file_directory('texts', False)  # create only if it does not exist yet
    create_file_directory('texts', True)   # delete an existing directory, then recreate it
  """
  if always_delete and os.path.exists(directory):
    # delete the directory recursively
    shutil.rmtree(directory)
  # makedirs (unlike mkdir) also creates missing parents; exist_ok keeps an
  # already existing directory untouched.
  os.makedirs(directory, exist_ok=True)


def find_files(path, extensions=[".txt"], recursive=False):
    """Return the paths of all files below `path` whose name ends with one of `extensions`.

    Args:
        path: Directory to search.
        extensions: Iterable of file name suffixes (e.g. [".txt", ".md"]). An
            empty list matches every file. A single string is treated as one
            suffix. The default list is never modified.
        recursive: If False, only files directly inside `path` are returned;
            if True, all subdirectories are searched as well.

    Returns:
        A list of paths (joined with the directory they were found in). Each
        file appears at most once, even if it matches several suffixes.
    """
    if isinstance(extensions, str):
        extensions = [extensions]
    # str.endswith accepts a tuple, so a file is tested against all suffixes
    # in one call and can never be added more than once.
    suffixes = tuple(extensions)

    my_files = []
    for root, dirs, files in os.walk(path):
        for f in files:
            if not suffixes or f.endswith(suffixes):
                my_files.append(os.path.join(root, f))
        # no recursion / don't look inside any subdirectory
        if not recursive:
            break
    return my_files


def merge_textfiles(path, extensions=[".txt"], recursive=False, new_filename='merged.txt'):
    """Concatenate all matching text files below `path` into `new_filename`.

    The files are located with find_files(path, extensions, recursive). The
    content of every file is followed by three newlines in the merged output.

    Args:
        path: Directory to search for input files.
        extensions: File name suffixes to include (see find_files).
        recursive: Whether subdirectories are searched as well.
        new_filename: Path of the merged file; it is overwritten if it exists.
    """
    target_path = os.path.abspath(new_filename)
    parts = []
    for filename in find_files(path, extensions, recursive):
        # Never feed a previous merge result back into the new one (this
        # happens on a re-run when the output lives in `path` and matches
        # one of the extensions).
        if os.path.abspath(filename) == target_path:
            continue
        with open(filename, 'r') as source:
            parts.append(source.read() + '\n\n\n')

    with open(new_filename, 'w') as target:
        target.write(''.join(parts))

# 1. Create MD Files

In [44]:
if ios_mode == False:
  import re
  import hashlib
  from dateutil.parser import parse
  import os
  from datetime import datetime, timedelta, timezone
  import getpass
  import sys
  import dateparser
  import json

  # Input file as copied from the Kindle, a cleaned copy of it, and the output directory.
  infile = folders['clippings-input'] + '/My Clippings.txt'
  infile_fixed = folders['clippings-input'] + '/My Clippings-fixed.txt'
  outpath = folders['clippings-output'] + '/'

  # The input directory is kept as it is; the output directory is always recreated empty.
  create_file_directory(folders['clippings-input'], False)
  create_file_directory(folders['clippings-output'], True)

  # Write a copy of the clippings file with every byte order mark character
  # (U+FEFF, chr(65279)) removed. The `with` blocks close both files even
  # when reading or writing fails.
  with open(infile, 'r') as clippings_file:
      contents = clippings_file.read()
  contents2 = contents.replace(chr(65279), "")
  with open(infile_fixed, 'w') as fixed_file:
      fixed_file.write(contents2)


  def getvalidfilename(filename):
      """Return `filename` with characters removed that are awkward or invalid in file names.

      The name is first NFKD-normalised. Then parentheses, apostrophes and the
      characters ? ! : & are dropped, together with the path separators / and \\
      (a separator inside a book title would otherwise be interpreted as a
      directory when the file is opened).
      """
      import unicodedata
      clean = unicodedata.normalize('NFKD', filename)
      # Raw string: '\?' and '\!' are invalid escape sequences in a normal string literal.
      cleaned_filename = re.sub(r"[()'?!:&/\\]", '', clean)
      return cleaned_filename


  # Line that terminates every entry in the clippings file.
  note_sep = '=========='

  commentstr = '.. '  # RST (reStructuredText) comment

  # All patterns are raw strings: sequences such as '\(' or '\d' are invalid
  # escapes in normal string literals and trigger warnings on current Python versions.

  # EN Clipboard
  #regex_title = re.compile(r'^(.*)\((.*)\)$')
  #regex_info = re.compile(r'^- (\S+) (.*)[\s|]+Added on\s+(.+)$')
  #regex_loc = re.compile(r'Loc\. ([\d\-]+)')
  #regex_page = re.compile(r'Page ([\d\-]+)')
  #regex_date = re.compile(r'Added on\s+(.+)$')

  # DE Clipboard
  regex_title = re.compile(r'^(.*)\((.*)\)$')                                     # "<title> (<author>)"
  regex_info = re.compile(r'^- (\S+) (.*)[\s|]+Hinzugefügt am\s+(.+)$')           # note type, location part, date
  regex_loc = re.compile(r'bei Position ([\d\-]+)')
  regex_page = re.compile(r'Seite ([\d\-]+)')
  regex_date = re.compile(r'Hinzugefügt am\s+(.+)$')

  # RST comment line that starts with a note hash: ".. <hex digits>"
  regex_hashline = re.compile(r'^\.\.\s*([a-fA-F0-9]+)\s*')


  # Per publication, keyed by the raw title line of the clippings file.
  pub_title = {}
  pub_author = {}
  pub_notes = {}
  pub_hashes = {}

  # Per note, keyed by the note hash.
  notes = {}
  locations = {}
  types = {}
  dates = {}

  # Hashes already present in output files: hash -> file name.
  existing_hashes = {}

  # Collect the hashes of notes that were already written to RST files in the
  # output directory, so that they are not exported twice.
  # NOTE(review): this cell recreates the output directory empty before it gets
  # here (create_file_directory(..., True)), so the scan currently cannot find
  # anything - confirm whether the directory should really be wiped on every run.
  print('Scanning output dir', outpath)
  for directory, subdirlist, filelist in os.walk(outpath):
      for fname in filelist:
          ext = fname[-4:]
          if ext not in ('.rst', '.RST'):
              print('File', fname, 'does not seem to be RST, skipping', ext)
              continue

          print('Found RST file', fname, 'in directory', directory)
          lines = 0
          hashes = 0
          # open file, find comment lines, store hashes; the `with` block
          # closes the file even if reading fails
          with open(os.path.join(directory, fname), 'r') as rst:
              for line in rst:
                  lines += 1
                  findhash_result = regex_hashline.findall(line)
                  if findhash_result:
                      existing_hashes[findhash_result[0]] = fname
                      hashes += 1
          print(hashes, 'hashes found in', lines, 'scanned lines')

  print('Found', len(existing_hashes), 'existing note hashes')
  print('Processing clippings file', infile)

  # One dict per parsed note (hash, title, author, date, note_text), in file order.
  all_notes = []

  # Every entry of the clippings file has the layout
  #     <title line>
  #     <information line: note type, location, date>
  #     <empty line>
  #     <note text, possibly several lines>
  #     ==========
  # The cleaned copy (without byte order marks) is read, not the original file.
  with open(infile_fixed, 'r') as mc:
      line = mc.readline().strip()

      while line:
          key = line
          result_title = regex_title.findall(key)     # Extract title and author
          info_line = mc.readline().strip()           # Read information line
          mc.readline()                               # Skip empty line

          # Collect the note text up to the separator. readline() returns ''
          # at end of file; checking for it prevents an endless loop when the
          # last entry is not terminated by a separator.
          body_lines = []
          raw_line = mc.readline()
          while raw_line and raw_line.strip() != note_sep:
              body_lines.append(raw_line.strip())
              raw_line = mc.readline()
          if not raw_line:
              print('Warning: clippings file ends without a separator after:', key)

          # Title line of the next entry; empty at end of file, which ends the loop.
          line = mc.readline().strip()

          # Extract note type, location and date. Entries whose information
          # line has an unexpected format are reported and skipped.
          result_info = regex_info.findall(info_line)
          if not result_info:
              print('Warning: skipping entry with unrecognised information line:', info_line)
              continue
          note_type, location, date = result_info[0]

          if result_title:
              title, author = result_title[0]
          else:
              title = key
              author = 'Unknown'
          title = title.strip()
          author = author.strip()

          result_loc = regex_loc.findall(location)
          result_page = regex_page.findall(location)
          note_loc = result_loc[0] if result_loc else ''
          note_page = result_page[0] if result_page else ''

          note_text = '\n'.join(body_lines).strip()
          # The first 8 hex digits of the SHA-256 of the text identify the note.
          note_hash = hashlib.sha256(note_text.encode('utf8')).hexdigest()[:8]

          if key not in pub_notes:
              pub_notes[key] = []
              pub_hashes[key] = []

          pub_title[key] = title
          pub_author[key] = author
          pub_notes[key].append(note_text)
          pub_hashes[key].append(note_hash)

          # Human readable location, e.g. "loc.198-206, p.13"
          loc_parts = []
          if note_loc:
              loc_parts.append('loc.' + note_loc)
          if note_page:
              loc_parts.append('p.' + note_page)
          locstr = ', '.join(loc_parts)

          # dateparser.parse returns None for text it cannot interpret; in that
          # case (or if it raises) the raw date text is kept instead.
          try:
              parsed_date = dateparser.parse(date) # , languages=['de']
          except Exception:
              parsed_date = None
          if parsed_date is None:
              datestr = date
              print("Date parsing exception: " + date)
          else:
              datestr = str(parsed_date)

          notes[note_hash] = note_text
          locations[note_hash] = locstr
          types[note_hash] = note_type
          dates[note_hash] = datestr

          data = {}
          data['hash'] = note_hash
          data['title'] = title
          data['author'] = author
          data['date'] = datestr
          data['note_text'] = note_text
          all_notes.append(data)

  # Write one RST file per publication that has new notes.
  for key in pub_title.keys():
      nr_notes = len(pub_notes[key])
      author = pub_author[key]
      title = pub_title[key]
      # The file name uses only the part of the title before the first
      # '|', ' - ' or '. '.
      short_title = title.split('|')[0]
      short_title = short_title.split(' - ')[0]
      short_title = short_title.split('. ')[0]
      # shorten title for filename
      if len(short_title) > 25:
          short_title = short_title[:24]

      fname = author + ' - ' + short_title.strip() + '.rst'
      # "Short notes" mode (publications with few notes share one file) is
      # disabled; the original switch is kept below for reference.
      short = 0
      #if (nr_notes > 2):
      #    fname = author + ' - ' + short_title.strip() + '.rst'
      #    short = 0
      #else:
      #    fname = 'short_notes.rst'
      #    short = 1

      # Count the notes that are not exported yet and younger than max_days_old.
      # NOTE(review): this filter only decides whether the publication is
      # exported at all; once it qualifies, the loop further down writes every
      # not-yet-exported note regardless of its age. Confirm whether a
      # per-note age filter was intended.
      new_hashes = 0
      for note_hash in pub_hashes[key]:
          if note_hash in existing_hashes:
              continue
          note_date = dateparser.parse(dates[note_hash])
          if note_date is None:
              # Age unknown (unparsable date): export the note rather than lose it.
              new_hashes += 1
              continue
          # now() is created with the note's tzinfo so that naive and aware
          # dates can both be subtracted.
          days_old = (datetime.now(note_date.tzinfo) - note_date).days
          #print(str(note_date) + "/" + str(days_old))
          if days_old < max_days_old:
              new_hashes += 1

      if new_hashes > 0:
          print(new_hashes, 'new notes found for', title)
      else:
          continue            # Skip to next title if there are no new hashes

      outfile = os.path.join(outpath, getvalidfilename(fname))

      # The header is only written when the file is created by this run.
      file_exists = os.path.isfile(outfile)

      last_date = datetime.now()

      with open(outfile, 'a') as out:
          if short:
              # Short note, output a small header and append to short note file
              if author != 'Unknown':
                  titlestr = author + ' - ' + title
              else:
                  titlestr = title
              out.write(titlestr + '\n')
              out.write(('-' * len(titlestr)) + '\n\n')
          elif not file_exists:
              # Many notes, output with header and metadata in a separate file
              titlestr = title # 'Highlights from ' +
              out.write(titlestr + '\n')
              out.write(('=' * len(titlestr)) + '\n\n')
              if author != 'Unknown':
                  out.write(author + '\n\n') # 'Authors: ' +
              #out.write('Recommended By:: \nTags:: [[Books]]\n\n# ' + title + '\n\n### Highlights\n')
              # out.write('## ' + title + '\n\n')
              out.write('## Highlights\n')

          for note_hash in pub_hashes[key]:
              note = notes[note_hash]
              note_type = types[note_hash]
              note_date = dates[note_hash]
              note_loc = locations[note_hash]
              if note_hash in existing_hashes:
                  print('Note', note_hash, 'is already in', existing_hashes[note_hash])
              else:
                  print('Adding new note to', outfile + ':', note_hash, note_type, note_loc, note_date)

                  comment = str(commentstr + note_hash + ' ; ' + note_type + ' ; ' + note_loc + ' ; ' + note_date)

                  if short:
                      comment += ' ; ' + author + ' ; ' + title

                  # this adds metadata before each note.
                  # out.write(comment + '\n\n')
                  out.write('- ' + note + '\n')

              # Remember the date of the last note that has a usable date
              # (best effort: unparsable dates are ignored).
              try:
                  parsed_date = dateparser.parse(note_date)
              except Exception:
                  parsed_date = None
              if parsed_date is not None:
                  last_date = parsed_date

      # Update file modification time to time of last note.
      # timestamp() interprets a naive datetime as local time and honours the
      # offset of an aware one.
      note_timestamp = last_date.timestamp()
      os.utime(outfile, (note_timestamp, note_timestamp))

  # The summary files are written once, after all publications were processed.

  # Write all_notes.md
  merge_textfiles(folders['clippings-output'], ['.rst'], False, folders['clippings-output'] + '/all_notes.md')

  # Write all_notes.json
  all_notes_json = json.dumps(all_notes, sort_keys=False, indent=2)
  with open(folders['clippings-output'] + '/all_notes.json', 'w') as f:
      f.write(all_notes_json)

  # Zip the output directory. The archive is built in a temporary directory
  # and moved into place afterwards; building it inside the directory being
  # zipped would make the archive contain a partial copy of itself.
  import tempfile
  with tempfile.TemporaryDirectory() as tmp_dir:
      archive_path = shutil.make_archive(os.path.join(tmp_dir, 'archive'), 'zip', outpath)
      shutil.move(archive_path, outpath + 'archive.zip')

else:
  print('iOS Mode - Nothing to do.')

Scanning output dir out/
Found 0 existing note hashes
Processing clippings file in/My Clippings.txt
4 new notes found for ChatGPT for Better Business Communication: How to Use ChatGPT to Increase Productivity and Communicate More Effectively at Work (ChatGPT prompts, tips, and examples that help you in the workplace)
Adding new note to out/Osman, Hassan - ChatGPT for Better Busin.rst: 207abae3 Deine loc.198-206, p.13 2023-07-10 08:42:25
Adding new note to out/Osman, Hassan - ChatGPT for Better Busin.rst: 6a6bd197 Deine loc.264-267, p.18 2023-07-12 08:42:41
Adding new note to out/Osman, Hassan - ChatGPT for Better Busin.rst: 36b510f9 Deine loc.319-323, p.22 2023-07-14 08:43:08
Adding new note to out/Osman, Hassan - ChatGPT for Better Busin.rst: dd754ee1 Deine loc.380-383, p.25 2023-07-24 08:45:30
3 new notes found for Don't Reply All: 18 Email Tactics That Help You Write Better Emails and Improve Communication with Your Team
Adding new note to out/Osman, Hassan - Dont Reply All 18 Emai.

In [43]:
# Test array / JSON: print every parsed note that belongs to one title.

# Two sample titles; the second assignment is the one that takes effect.
query_title = 'DIE WELT'
query_title = 'ChatGPT for Better Business Communication: How to Use ChatGPT to Increase Productivity and Communicate More Effectively at Work (ChatGPT prompts, tips, and examples that help you in the workplace)'

# Keep the notes whose stored title equals the query exactly.
# search_result = [element for element in all_notes if element['title'] == query_title]
search_result = list(filter(lambda mynote: mynote['title'] == query_title, all_notes))
print(json.dumps(search_result, sort_keys=False, indent=2))

# Compact JSON dump of all notes.
json_data = json.dumps(all_notes)
#print(json_data)



[
  {
    "hash": "207abae3",
    "title": "ChatGPT for Better Business Communication: How to Use ChatGPT to Increase Productivity and Communicate More Effectively at Work (ChatGPT prompts, tips, and examples that help you in the workplace)",
    "author": "Osman, Hassan",
    "date": "2023-07-10 08:42:25",
    "note_text": "How ChatGPT works ChatGPT is a type of language model, which means that it is trained to predict the next word or phrase in a given context. To do this, ChatGPT is fed a large dataset of human-generated text, which it uses to learn the patterns and structures of natural language. Once it has been trained, ChatGPT can then generate text that is coherent and engaging and is often difficult to distinguish from text written by a human. One of the main capabilities of ChatGPT is its ability to generate text on a wide range of topics. This makes it a useful tool for working professionals, as it can assist with brainstorming and idea generation, as well as generating text

# 2. Create JSON File

In [16]:
import re, json, sys, io, argparse
from time import strptime
from datetime import datetime
from dateutil.tz import tzlocal


DUMMY_AUTHOR = "zznoauthor" # Lower case to ensure it goes at the end.

# All default paths are derived from the notebook-wide `folders` mapping, so
# changing the input/output directory there also applies to this section.
config_infile = folders['clippings-input'] + '/My Clippings.txt'
DEFAULT_IN = config_infile                                      # clippings file
DEFAULT_OUT = folders['clippings-output'] + '/clippings.json'   # output file
DEFAULT_SUB = folders['clippings-input'] + '/subs.json'         # substitute authors/titles
DEFAULT_COMBINE = folders['clippings-input'] + '/combine.json'  # existing quotes to combine with output
DEFAULT_TIMEZONE = tzlocal()            # timezone for quote timestamps: note this assumes the system timezone because the kindle doesn't store this information

# Usage line shown by the command line parser.
PROG_USAGE = (
    'python clippings.py '
    '[-i <input.txt>] '
    '[-o <output.json>] '
    '[-s [<substitute_file.json>]] '
    '[-c [<combine_file.json>]] '
    '[-z]'
)

def do_clippings():
    """
        Wrapper: read the clippings file, convert it and write the JSON output.

        Uses the DEFAULT_* settings of this notebook instead of command line
        arguments. The substitution step is skipped when the substitution
        file does not exist. The output file is only opened once the data is
        ready, and every file is closed before the function returns.
    """
    import os

    ## 1. Get user input
    # f_in,f_out,f_substitute,f_combine, f_timezone = parse_arguments()
    f_combine = DEFAULT_COMBINE
    f_timezone = DEFAULT_TIMEZONE

    ## 2. Log
    print("Extracting clippings from "+DEFAULT_IN)

    ## 3. Get raw clippings
    with open(DEFAULT_IN,"r",encoding='utf-8') as f_in:
        raw = f_in.read()

    ## 4. Parse into dictionary
    dict_all = parse_raw(raw)

    ## 5. Organise the dictionary
    dict_all = organise(dict_all,f_timezone)

    ## 6. Make substitutions if necessary (the substitution file is optional)
    if os.path.isfile(DEFAULT_SUB):
        with open(DEFAULT_SUB,"r",encoding='utf-8') as f_substitute:
            dict_all = substitute(dict_all,f_substitute)
    else:
        print("No substitution file at "+DEFAULT_SUB+", skipping substitutions")

    ## 7. Combine if necessary
    if f_combine:
        dict_all = combine(dict_all,f_combine)

    ## 8. Output the dictionary
    ## The `with` block flushes and closes the file; without closing it the
    ## JSON may stay incomplete on disk while the kernel keeps running.
    with open(DEFAULT_OUT,"w",encoding='utf-8') as f_out:
        output(dict_all,f_out)

def parse_arguments():
    """
        Define and evaluate the command-line options of the converter.

        Returns the tuple (input file, output file, substitution file,
        combine file, timezone). The file entries are opened file objects or
        None; the timezone entry is DEFAULT_TIMEZONE when -z was given and
        False otherwise.
    """

    parser = argparse.ArgumentParser( # See https://docs.python.org/3.7/library/argparse.html
                description='Convert Kindle clippings to JSON format.',
                usage=PROG_USAGE)

    ## All file options take an optional file name ('?').
    ## Columns: flag, open mode, value if the flag is given without a file,
    ##          value if the flag is not given at all, help text.
    file_options = (
        ('-i', 'r', DEFAULT_IN, DEFAULT_IN,
         'TXT file of Kindle clippings. See README.md.'),
        ('-o', 'w', DEFAULT_OUT, DEFAULT_OUT,
         'JSON file for output. See README.md for format.'),
        ('-s', 'r', DEFAULT_SUB, None,
         'JSON file specifying author/title substitutions. See README.md for correct format.'),
        ('-c', 'r', DEFAULT_COMBINE, None,
         'JSON file specifying existing quotations to combine. See README.md for correct format.'),
    )
    for flag, mode, bare_value, absent_value, help_text in file_options:
        parser.add_argument(
                flag,
                nargs='?',
                const=bare_value,
                default=absent_value,
                help=help_text,
                type=argparse.FileType(mode,encoding="utf-8") # expect a filename
                )

    ## Are we assigning the user's local timezone to timestamps?
    parser.add_argument('-z', action='store_true',help="Flag: if set, the user's current timezone will be added to all timestamps.")

    args = parser.parse_args()

    ## If the flag was set, hand back the timezone instead of the bare flag.
    timezone_choice = DEFAULT_TIMEZONE if args.z else args.z

    return args.i, args.o, args.s, args.c, timezone_choice


"""
    Primary functions:
        parse_raw, organise, output
"""
def parse_raw(raw):
    """
        Extract all highlight entries from the raw text of a clippings file.

        Returns a dictionary with two lists of regex match tuples:
            "notes_author":   entries whose title line is "<title> (<author>)"
            "notes_noauthor": entries whose title line carries no author
        Every entry ends up in exactly one of the two lists.
    """

    ## 1. Preprocessing e.g. byte order mark
    raw = preprocess(raw)

    ## 2. Get regular expressions
    regex_author_str, regex_noauthor_str = build_regexes()

    ## 3. Perform the regex for entries with an author
    regex_author = re.compile(regex_author_str)
    progress("Regex complete ",0,2)
    notes_author = regex_author.findall(raw)

    ## 4. Perform the regex for entries without an author
    regex_noauthor = re.compile(regex_noauthor_str)
    progress("Regex complete ",1,2)
    notes_noauthor = regex_noauthor.findall(raw)

    ## 5. The no-author pattern accepts any title line, so it also matches every
    ##    entry that has an author (with "<title> (<author>)" as its title).
    ##    Drop those matches; otherwise each authored quote is output twice.
    authored = {
        (match[0] + " (" + match[1] + ")",) + tuple(match[2:])
        for match in notes_author
    }
    notes_noauthor = [match for match in notes_noauthor if tuple(match) not in authored]

    ## 6. Create the dictionary
    return {"notes_author": notes_author, "notes_noauthor": notes_noauthor}


def organise(dict,f_timezone=None):
    """
        Regroup the flat regex matches into a nested author/title/location tree.

        Input (see parse_raw): a dictionary with the two lists
            "notes_author":   tuples of
                (<title>, <author>, Loc.|on Page, <loc or page>, <??>, <??>,
                 <day name>, <month name>, <day number>, <year>,
                 <hour>, <minute>, AM/PM, <quote>)
            "notes_noauthor": the same tuples without the <author> field

        Output:
            {
                "<author>": {
                    "<title>": {
                        "<location key, e.g. l1197>": [
                            {"date": "<timestamp>", "quote": "<text>"}
                        ]
                    }
                }
            }
        Entries without an author are filed under DUMMY_AUTHOR. The location
        keys of each publication are zero-padded to a common length.
        f_timezone is passed on to build_dict_line for the timestamps.
    """

    authored = dict["notes_author"]
    anonymous = dict["notes_noauthor"]
    total = len(authored) + len(anonymous)

    dict_new = {}
    done = 0

    ## 1. Quotes with an author
    for entry in authored:
        dict_new = add_line_to_dict_deep(dict_new, build_dict_line(entry,f_timezone))
        progress("Author complete: ",done,total)
        done += 1

    ## 2. Quotes with no author: insert the dummy author after the title so
    ##    that the tuple has the same layout as an authored one.
    for entry in anonymous:
        with_author = (entry[0],DUMMY_AUTHOR)+entry[1:]
        dict_new = add_line_to_dict_deep(dict_new, build_dict_line(with_author,f_timezone))
        progress("No author complete: ",done,total)
        done += 1

    ## 3. Pad location keys.
    ##     See pad_location_keys for the explanation.
    return pad_location_keys(dict_new)

def substitute(dict_all, f_substitute):
    """
        Substitute errant authors/titles with the correct ones.
        Subs file should be a list of objects with form:
            {
                "old":  {
                    "author":   "Batchie",
                    "title":    "Jose_Saramago_Seeing__"
                },
                "new":  {
                    "author":   "José Saramago",
                    "title":    "Seeing"
                }
            }

        f_substitute is an open file object of the subs file; it is read and
        closed here. Substitutions whose old author/title is not present in
        dict_all are reported and skipped. If the new author/title already
        exists, the quotes are merged location by location.
        Returns dict_all (modified in place).
    """

    print("Substituting features from "+f_substitute.name)

    dict_subs = json.loads(f_substitute.read())
    f_substitute.close()

    for entry in dict_subs:
        author = entry["old"]["author"]
        title = entry["old"]["title"]
        author_new = entry["new"]["author"]
        title_new = entry["new"]["title"]

        ## Nothing to do if the clippings contain no such publication
        if author not in dict_all or title not in dict_all[author]:
            print("No clippings found for "+author+" / "+title+", skipping substitution")
            continue

        ## Nothing to do if old and new are identical
        if (author, title) == (author_new, title_new):
            continue

        ## Detach old entry
        old_book = dict_all[author].pop(title)

        ## Delete old author if empty
        if len(dict_all[author]) < 1:
            del dict_all[author]

        ## Create new entry, or merge into an existing one without
        ## overwriting quotes that are already stored at the same location
        new_book = dict_all.setdefault(author_new, {}).setdefault(title_new, {})
        for location, quotes in old_book.items():
            new_book.setdefault(location, []).extend(quotes)

    return dict_all

def combine(dict_all, f_combine):
    """
        Placeholder for merging the quotes stored in f_combine into dict_all.

        Not implemented yet: f_combine is ignored and dict_all is returned
        unchanged.
    """

    ## TODO: read f_combine and merge its quotes
    return dict_all

def output(dict_all,f_out=None):
    """
        Serialise dict_all as JSON.

        Without f_out the compact JSON text is printed to STDOUT. With f_out
        (an open, writable file object) the JSON is written to it, indented
        by 4, with sorted keys and unescaped unicode characters, followed by
        a log line naming the file.
    """

    ## No file given: output to STDOUT
    if not f_out:
        print(json.dumps(dict_all))
        return

    ## Convert dictionary to string ...
    serialised = json.dumps(
        dict_all,
        indent=4,
        sort_keys=True,
        ensure_ascii=False  # unicode characters
    )

    ## ... and write it to the file
    f_out.write(serialised)
    print("Output JSON to "+f_out.name)


"""
    Helper functions
"""

def preprocess(raw):
    """
        Basic text formatting: remove a byte order mark (U+FEFF) at the very
        start of the text, if there is one. Empty input is returned unchanged.
    """

    ## 1. Remove byte order marks if necessary
    ## startswith (unlike raw[0]) also works for an empty string.
    if raw.startswith('\ufeff'):
        raw = raw[1:]

    return raw


def build_regexes():
    """
        Create regular expressions to extract quote data.

        Returns two pattern strings: one for entries whose title line is
        "<title> (<author>)" and one that accepts any title line. Both expect
        a "- Highlight" information line, followed by the quote and a footer
        of "=" characters.
    """

    ## Patterns containing backslashes are raw strings; "\(" and "\|" are
    ## invalid escape sequences in normal string literals.

    ## 1. Regex parts
    title_regex = r"(.+)"
    title_author_regex = r"(.+) \((.+)\)"

    ## 2. Regex locations
    loc_all_regex = r"(Loc.|on Page) ([0-9]+)( |-([0-9]+)  )"

    ## 3. Regex date and time
    date_regex = r"([a-zA-Z]+), ([a-zA-Z]+) ([0-9]+), ([0-9]+)"  # Date
    time_regex = r"([0-9]+):([0-9]+) (AM|PM)"  # Time

    ## 4. Regex quote
    content_regex = r"(.*)"
    footer_regex = r"=+"

    ## 5. Regex newline (deliberately not raw: a real newline character)
    nl_re = "\n*"

    ## 6. Everything after the title line is the same for both kinds of entry
    entry_tail = (
        nl_re +
        "- Highlight " + loc_all_regex + r"\| Added on " +
        date_regex + ", " + time_regex + nl_re +
        content_regex + nl_re +
        footer_regex
    )

    ## 7. Regex quotes with an author / with no author
    regex_author_str = title_author_regex + entry_tail
    regex_noauthor_str = title_regex + entry_tail

    return regex_author_str,regex_noauthor_str


def build_dict_line(line,f_timezone=None):
    """
        Convert one regex match into a nested single-quote dictionary.

        The fields of `line` are:
            0 <title>,
            1 <author>,
            2 "Loc." or "on Page",
            3 <loc> or <page>,
            4 <??>,
            5 <??>,
            6 <day>,
            7 <month>,
            8 <date>,
            9 <year>,
            10 <hour>,
            11 <minute>,
            12 "AM" or "PM",
            13 <quote>
        New format:
            {
                "Ursula K. Le Guin":  {
                    "Tales from Earthsea":   {
                        "l1321": [
                            {
                                "date": "2017-06-18T17:26",
                                "quote": "The wizard kept the name Roke in his memory, and when he heard it again, and in the same connection, he knew Hound had been on a true track again."
                            }
                        ]
                    }
                }
            }
        If f_timezone (a tzinfo object) is given, it is attached to the
        timestamp and its UTC offset is appended to the date string.
    """

    ## 1. Quote
    ##  Add formatting here if necessary.
    quote = line[13]

    ## 2. Timestamp
    year = int(line[9])                             # year
    month = strptime(line[7][:3],'%b').tm_mon       # month (in words)
    day = int(line[8])                              # day
    ## Convert the 12 hour clock: 12 AM is 00:xx and 12 PM is 12:xx, so the
    ## hour is reduced modulo 12 before the PM offset is added.
    hour = int(line[10]) % 12                       # hour
    if line[12] == "PM":
        hour += 12
    minute = int(line[11])                          # minute

    ## Quote object is quote with timestamp.
    ## Timestamp might have a timezone assigned, or might be naive.
    if f_timezone:
        date = datetime(year,month,day,hour,minute,tzinfo=f_timezone)     # full date, with timezone
    else:
        date = datetime(year,month,day,hour,minute)     # full date, naive

    ## 3. Page/location format
    if line[2] == "on Page":
        loc = "p"
    elif line[2] == "Loc.":
        loc = "l"
    else:
        loc = ""

    loc = loc + str(line[3]) # combine location prefix with location.

    ## 4. Author format
    author = line[1]
    author = author.replace('\\','') # Remove backslashes

    ## 5. Build line.
    ## Timestamp will ignore the "%z" part of the format declaration
    ##  if no timezone has been assigned.
    dict_line = {
        author:{
            line[0]:{ # title
                loc:[
                    {"quote":quote,"date":date.strftime("%Y-%m-%dT%H:%M%z")}
                ]
            }
        }
    }

    return dict_line


def add_line_to_dict_deep(d,line):
    """
        Merge `line` into `d` without overwriting anything that is already there.

        Both dictionaries have the layout author -> title -> location -> list
        of quote entries, e.g.:
            {
                "John Updike": {
                    "Rabbit, Run": {
                        "l4467": [{
                                "date": "...",
                                "quote": "..."
                            }
                        ]
                    }
                },
            ...
            }
        Missing authors, titles and locations are created; if a location
        already exists, the new entries are appended to its list. All
        authors, titles and locations contained in `line` are merged, not
        only the first one of each level.

        Returns d (modified in place).
    """

    for author, titles in line.items():
        author_entry = d.setdefault(author, {})
        for title, locations in titles.items():
            title_entry = author_entry.setdefault(title, {})
            for location, quotes in locations.items():
                ## extend() copies the entries, so later additions to d
                ## never change the lists of `line`.
                title_entry.setdefault(location, []).extend(quotes)

    return d


def pad_location_keys(dict_new):
    """
        Give all location keys of a publication the same length.

        Location keys of different lengths do not sort in reading order:
        "l163" should come before "l1466", but as strings the latter sorts
        first. For every publication the longest key length is determined and
        shorter keys are padded with zeros after their first character.

        Returns dict_new, whose publications are replaced by padded copies.
    """

    for books in dict_new.values():
        ## Only values of existing keys are replaced, which is safe while
        ## iterating over the dictionary.
        for title in books:
            book = books[title]
            books[title] = pad_locs(book, longest_loc_length(book))
    return dict_new


def longest_loc_length(book):
    """
        Return the number of characters of the longest location key in
        `book`, or 0 if `book` has no keys.
    """

    return max((len(loc_string) for loc_string in book.keys()), default=0)


def pad_locs(book,loc_length):
    """
        Return a copy of `book` in which every key shorter than loc_length
        is filled up with zeros directly after its first character.
        Keys that are already long enough are kept as they are.
    """

    book_new = {}
    for key,value in book.items():
        missing = loc_length - len(key) # how much we need to pad
        if missing > 0:
            padded_key = key[0] + "0" * missing + key[1:]
        else:
            padded_key = key
        book_new[padded_key] = value

    return book_new


def progress(message,step,total):
    """
        Print `message` followed by the whole-number percentage of `step`
        relative to `total`. The line ends with a carriage return instead of
        a newline, so the next output starts at the beginning of the line.
    """

    percent = int(step*100/total)
    print(f"{message}{percent}%", end="\r")


# Run the conversion as soon as this cell is executed.
do_clippings()


Extracting clippings from in/My Clippings.txt
Regex complete 0%Regex complete 50%Substituting features from in/subs.json
Output JSON to out/clippings.json
