In [None]:
import pandas as pd
import numpy as np

In [None]:
pip install xlsxwriter

Collecting xlsxwriter
[?25l  Downloading https://files.pythonhosted.org/packages/2b/98/17875723b6814fc4d0fc03f0997ee00de2dbd78cf195e2ec3f2c9c789d40/XlsxWriter-1.3.3-py2.py3-none-any.whl (144kB)
[K     |████████████████████████████████| 153kB 2.7MB/s 
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-1.3.3


In [None]:
# Read the clipping file from disk and hand its contents back as one string
def file_read(file_path, encoding):
  """
  Opens the Kindle text clipping file located at file_path using the given
  encoding and returns everything in it as a single string.

  Every U+FEFF (byte-order mark) character found in the text is removed
  before the string is returned.
  """
  with open(file_path, mode = 'r', encoding = encoding) as clippings_file:
    raw_text = clippings_file.read()

  # Strip the byte-order mark wherever it occurs, not only at the start of the file
  return raw_text.replace("\ufeff", "")

In [None]:
# Function for removing repeated highlights made in error

def error_remover(df_test):
  """
  Removes repeated highlights made in error and returns the filtered dataframe.

  A highlight counts as repeated when its text is contained (substring test) in
  the text of the highlight next to it. Two passes are made:

  * Downward clearance: a row is removed when its highlight is contained in the
    highlight of the row directly below it.
  * Upward clearance: on what is left, a row is removed when its highlight is
    contained in the highlight of the row directly above it.

  Parameters
  ----------
  df_test : pandas.DataFrame
      Dataframe with a "Highlight" column. Every value in that column is assumed
      to be a string; a missing value would make the substring test raise a
      TypeError.

  Returns
  -------
  pandas.DataFrame
      The same dataframe object that was passed in. It is modified in place:
      repeated rows are dropped and the index is reset to 0..n-1.

  The parameter is called df_test because the function was developed against a
  dataframe of that name.
  """
  # Give the frame a 0..n-1 index up front so row positions and index labels agree,
  # whatever index the caller passed in
  df_test.reset_index(drop = True, inplace = True)

  # offset +1 is the downward clearance (compare with the next row),
  # offset -1 is the upward clearance (compare with the previous row)
  for offset in (1, -1):
    highlights = df_test["Highlight"].tolist()
    total = len(highlights)

    # Rows are only collected here and dropped after the scan, so every comparison
    # in one pass sees the rows as they were when the pass started
    repeated_rows = [
      i for i in range(total)
      if 0 <= i + offset < total and highlights[i] in highlights[i + offset]
    ]

    df_test.drop(index = repeated_rows, inplace = True)
    df_test.reset_index(drop = True, inplace = True)

  return df_test

In [None]:
# Creating a function to extract the highlights from Kindle Clippings and export the highlights as an excel (.xlsx) file
def clipping_extractor(file_path, encoding = 'UTF-8', output_path = "/content/drive/My Drive/Google_Colab/Kindle_clipping/df_clips.xlsx"):
  """
  Takes the file path to the Kindle Clippings file, extracts the highlights and
  exports them as a .xlsx file.

  Parameters
  ----------
  file_path : str
      Path to the Kindle clippings text file.
  encoding : str
      Encoding used to read the file. Defaults to 'UTF-8'.
  output_path : str
      Where the .xlsx file is written. Defaults to the df_clips.xlsx file in the
      Kindle_clipping folder of My Drive.

  Returns
  -------
  pandas.DataFrame
      One row per kept highlight, sorted by book and then by time, with the columns
      "Book (Author)", "Time of Highlight", "Highlight Location" and "Highlight".

  Highlights of fewer than 4 space-separated words are discarded, as are the
  repeated highlights detected by error_remover. A summary of the books and their
  highlight counts is printed.
  """
  # Using the file_read function to read the My Clippings file and store it as a string in clipping
  clipping = file_read(file_path, encoding)

  # Splitting the clipping string wrt "==========" to get one string per entry
  clipping_entries = clipping.split("==========")

  # The text after the final separator is normally empty. It is discarded only when
  # it really is blank, so a file without a trailing separator keeps its last entry.
  if not clipping_entries[-1].strip():
    clipping_entries.pop()

  # dtype = object keeps the .str accessor usable even when there are no entries
  df_clips = pd.DataFrame(data = {"Entries": pd.Series(clipping_entries, dtype = object)})

  # Splitting every entry w.r.t "\n" and removing "". Because empty lines are removed,
  # the first entry (which has no leading newline) needs no special treatment.
  entry_lines = df_clips.Entries.str.split("\n").apply(lambda lines: [line for line in lines if line != ""])

  # First line is the book, second line the highlight metadata, third line the highlight text
  df_clips["Book_info"] = entry_lines.str[0]
  highlight_info = entry_lines.str[1]
  df_clips["Highlight"] = entry_lines.str[2]

  # Extracting highlight location, date and time from the fixed-width metadata line.
  # NOTE(review): the offsets 20 and 10 presumably skip the "- Your Highlight on " and
  # " Added on " prefixes, and 11 is presumably the width of the time stamp - confirm
  # against the clippings format of the device in use.
  highlight_info_parts = highlight_info.str.split("|")
  df_clips["Highlight_location"] = highlight_info_parts.str[0].str[20:]
  highlight_day_time = highlight_info_parts.str[-1].str[10:]
  df_clips["Highlight_date"] = highlight_day_time.str[:-11]
  df_clips["Highlight_time"] = highlight_day_time.str[-11:]

  # Removing NaN Highlights (entries with fewer than three non-empty lines) and resetting indexes
  df_clips = df_clips.dropna(subset = ["Highlight"]).reset_index(drop = True)

  # Keeping only the needed columns; .copy() makes the result an independent frame so
  # the column assignments below do not trigger SettingWithCopyWarning
  df_clips = df_clips[['Book_info', 'Highlight_date', 'Highlight_time', 'Highlight_location', 'Highlight']].copy()

  # Creating a column for calculating highlight word count
  df_clips["Highlight_word_count"] = df_clips.Highlight.str.split(" ").apply(len)

  # Removing highlights with words less than 4
  df_clips = df_clips[df_clips.Highlight_word_count >= 4].reset_index(drop = True)

  # Removing Repeated highlights using the error_remover function
  df_clips = error_remover(df_clips)

  # Parsing time of highlight into a separate column. A single leading space is removed
  # from the time; the regex form also copes with an empty time string.
  highlight_time = df_clips.Highlight_time.str.replace(r"^ ", "", regex = True)
  df_clips["Parsed_date_and_time"] = pd.to_datetime(df_clips.Highlight_date + " " + highlight_time)

  # Sorting the entries according to Book and then time
  df_clips = df_clips.sort_values(by = ["Book_info", "Parsed_date_and_time"]).reset_index(drop = True)

  # Keeping and rearranging the final columns
  df_clips = df_clips[['Book_info', 'Parsed_date_and_time', 'Highlight_location', 'Highlight']]

  # Printing list of Books present in the clippings and their highlights count
  print("\nBooks included in the Clippings File:\n")
  for book in df_clips.Book_info.unique():
    highlight_count = len(df_clips[df_clips.Book_info == book])
    print(f"{book}: {highlight_count} Highlights")
  print("\n")
  print(f"Total Highlights: {len(df_clips)}")

  # Renaming the columns appropriately
  df_clips = df_clips.rename(columns = {'Book_info': "Book (Author)", 'Parsed_date_and_time': "Time of Highlight", 'Highlight_location': "Highlight Location", 'Highlight': "Highlight"})

  # Exporting the dataframe as an excel file
  df_clips.to_excel(output_path, index = False, engine='xlsxwriter')

  print(f"\nOperation Successful...\n Path to the output excel file is: '{output_path}' ")
  return df_clips

In [None]:
# Run the extraction on the clippings file stored in My Drive
df_clips = clipping_extractor(
  file_path = "/content/drive/My Drive/Google_Colab/Kindle_clipping/Input/My Clippings.txt",
  encoding = 'UTF-8',
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Books included in the Clippings File:

A Brief History of Time (Stephen Hawking;Grover Gardner): 5 Highlights
And Then There Were None (Agatha Christie): 9 Highlights
Astrophysics for People in a Hurry (Neil DeGrasse Tyson): 41 Highlights
Atomic Habits: Tiny Changes, Remarkable Results (James Clear): 19 Highlights
Brief Answers to the Big Questions (Stephen Hawking): 3 Highlights
Crude Volatility (Robert McNally): 355 Highlights
Deep Work (Cal Newport): 14 Highlights
End of Watch (The Bill Hodges Trilogy Book 3) (Stephen King): 12 Highlights
Finders Keepers (Stephen King): 1 Highlights
Golden Son (The Red Rising Trilogy, Book 2) (Brown, Pierce): 8 Highlights
Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow (Aurélien Géron): 7 Highlights
Harry Potter 01 &The Philosopher's Stone (Illustrated) (J.K. Rowling): 2 Highlights
How To Win Friends And Influence People (Carnegie, Dale): 28 Highlights
Kindle User's Guide (Amazon): 1 Highlights
Little Fires Everywhere (Celeste Ng

In [None]:
# Preview the first rows using the notebook's rich table display; print() renders the
# frame as plain text with the middle columns elided
df_clips.head()

                                         Book (Author)  ...                                          Highlight
0    A Brief History of Time (Stephen Hawking;Grove...  ...  Newton postulated a law of universal gravitati...
1    A Brief History of Time (Stephen Hawking;Grove...  ...  the concept of time has no meaning before the ...
2    A Brief History of Time (Stephen Hawking;Grove...  ...  physical reasons why there had to be a beginni...
3    A Brief History of Time (Stephen Hawking;Grove...  ...  a similar star at half the distance. This law ...
4    A Brief History of Time (Stephen Hawking;Grove...  ...  Because of the equivalence of energy and mass,...
..                                                 ...  ...                                                ...
953                  You Don't Know Me (Imran Mahmood)  ...  hand while other people had theirs hands in th...
954                  You Don't Know Me (Imran Mahmood)  ...  tube. Camden on a Saturday night is pretty muc...
9