In [1]:
%%sh
# Run the `make clean` target from the parent directory; per the target's own
# output it removes *_report.txt, *.log, tmp* files and __pycache__ directories.
cd .. && make clean

Cleaning *_report.txt files, *.log files, tmp* files and __pycache__ directories...
Clean complete.


In [2]:
import notebooks_setup

# Project-local bootstrap. Per its printed output it adds the repository root
# and its src/ directory to the Python path and reports the working directory.
notebooks_setup.setup()

Added /Users/arjunbawa/Desktop/cluster-links to Python path.
Added /Users/arjunbawa/Desktop/cluster-links/src to Python path
Current working directory: /Users/arjunbawa/Desktop/cluster-links/notebooks


In [3]:
import json
from pathlib import Path

# Resolve the data folders relative to the project root exposed by notebooks_setup.
data_input_dir = notebooks_setup.ROOT_DIR / "data" / "input"
data_output_dir = notebooks_setup.ROOT_DIR / "data" / "output"

# The "=" specifier echoes each variable name next to its repr.
print(f"{data_input_dir=}")
print(f"{data_output_dir=}")

data_input_dir=PosixPath('/Users/arjunbawa/Desktop/cluster-links/data/input')
data_output_dir=PosixPath('/Users/arjunbawa/Desktop/cluster-links/data/output')


In [4]:
from data_collection import ChromeBookmarkCollector

# Pick the Chrome HTML export to parse. Sorting makes the choice reproducible
# when several exports are present, and an empty input directory fails with a
# clear message instead of a bare StopIteration from the glob iterator.
html_exports = sorted(data_input_dir.glob("*.html"))
if not html_exports:
    raise FileNotFoundError(f"No *.html bookmark export found in {data_input_dir}")

cbc = ChromeBookmarkCollector(html_exports[0])
chrome_bm_json_path = cbc.save_json(
    data_output_dir.joinpath("bookmarks.json"), return_path=True
)
print(f"Bookmarks saved to {chrome_bm_json_path}")

# Round-trip check: what was written to disk must equal the in-memory bookmarks.
chrome_bm_json = json.loads(chrome_bm_json_path.read_text(encoding="utf-8"))
assert cbc.get_bookmarks() == chrome_bm_json

Bookmarks saved to /Users/arjunbawa/Desktop/cluster-links/data/output/bookmarks.json


In [5]:
def _get_links(file_path: str) -> list[str]:
    """
    Read a file containing links (one per line or in markdown format) and
    return a list of URLs that contain 'http'. Extracts the URL starting
    from the first occurrence of 'http' on each line.
    """
    path = Path(file_path).expanduser()
    print(f"Reading links from {path}")

    with path.open("r", encoding="utf-8") as f:
        lines = f.readlines()

    links = [line[line.find("http") :].strip() for line in lines if "http" in line]
    return links


def get_links_from_file() -> list[str]:
    """
    Load the links file named in the project config, print every extracted
    link, and return them.

    The file path is read from the ``LINKS_FILE`` config entry and falls back
    to ``links.txt`` when that entry is absent.

    Returns:
        The list of links produced by ``_get_links`` for the configured file.
    """
    # Kept function-local so that defining this function does not require the
    # project's `config` module to be importable.
    from config import load_config

    config = load_config()  # Loads config.json
    file_path = config.get("LINKS_FILE", "links.txt")
    links = _get_links(file_path)
    print("Extracted links:")
    for link in links:
        print(link)
    return links

In [6]:
def print_attr_tree(bookmarks_json):
    """
    Print an indented outline of the keys in a nested dict, with each value's type.

    Dicts are expanded key by key. For a non-empty list only the first element
    is inspected, shown under the name ``<key>_item``. Entries are printed
    depth-first in insertion order, indented by one space per nesting level.
    """
    # LIFO stack of (depth, name, value); children are pushed in reverse so
    # they pop in their original order.
    pending = [(0, name, value) for name, value in bookmarks_json.items()]
    pending.reverse()
    while pending:
        depth, name, value = pending.pop()
        print(depth * " ", name, type(value))
        if isinstance(value, list) and value:
            pending.append((depth + 1, f"{name}_item", value[0]))
        elif isinstance(value, dict):
            children = [(depth + 1, child, item) for child, item in value.items()]
            pending.extend(reversed(children))


# Print the key/type outline of the bookmark JSON loaded from disk.
print_attr_tree(chrome_bm_json)

 title <class 'str'>
 links <class 'list'>
  links_item <class 'dict'>
   title <class 'str'>
   href <class 'str'>
   add_date <class 'str'>
   icon <class 'str'>
 children <class 'list'>
  children_item <class 'dict'>
   title <class 'str'>
   add_date <class 'str'>
   last_modified <class 'str'>
   personal_toolbar_folder <class 'str'>
   links <class 'list'>
    links_item <class 'dict'>
     title <class 'str'>
     href <class 'str'>
     add_date <class 'str'>
     icon <class 'str'>
   children <class 'list'>
    children_item <class 'dict'>
     title <class 'str'>
     add_date <class 'str'>
     last_modified <class 'str'>
     links <class 'list'>
     children <class 'list'>
      children_item <class 'dict'>
       title <class 'str'>
       add_date <class 'str'>
       last_modified <class 'str'>
       links <class 'list'>
        links_item <class 'dict'>
         title <class 'str'>
         href <class 'str'>
         add_date <class 'str'>
       children <class 'l

In [7]:
from data_collection.models import load_bookmarks_from_json, load_bookmarks_from_dict

# Load the bookmarks twice: once from the JSON file on disk and once from the
# dict that was already parsed from that file.
bookmarks_from_json = load_bookmarks_from_json(chrome_bm_json_path)
bookmarks_from_dict = load_bookmarks_from_dict(chrome_bm_json)

# Both loaders must agree, as objects and in their serialized JSON form.
assert bookmarks_from_dict == bookmarks_from_json
assert bookmarks_from_dict.model_dump_json() == bookmarks_from_json.model_dump_json()
bookmarks = bookmarks_from_dict

# Summary counts for the loaded tree
print(f"Total bookmarks: {bookmarks.total_bookmarks}")
print(f"Total folders: {bookmarks.total_folders}")

Total bookmarks: 3655
Total folders: 70


In [8]:
# Gather the google.com matches reported by each top-level folder.
google_bookmarks = []
for folder in bookmarks.children:
    google_bookmarks += folder.find_links_by_domain("google.com")

# Preview the first five matches as (domain, title, href) tuples.
[(link.domain, link.title, link.href) for link in google_bookmarks][:5]

[('calendar.google.com',
  'GCal IOS Sync Settings',
  'https://calendar.google.com/calendar/syncselect'),
 ('calendar.google.com',
  'Calendar',
  'https://calendar.google.com/calendar/u/0/r/week'),
 ('sites.google.com',
  'Math 211 / ECE 205 - Homepage of Eduardo Martin-Martinez',
  'https://sites.google.com/site/emmfis/teaching/math-211'),
 ('sites.google.com',
  "mat267-spring-2017 - Dmitry Panchenko's homepage",
  'https://sites.google.com/site/panchenkomath/Home/mat267-spring-2017'),
 ('sites.google.com',
  'Homepage of Eduardo Martin-Martinez',
  'https://sites.google.com/site/emmfis/home')]

In [9]:
# Look up one folder by its path of folder titles.
math_folder = bookmarks.find_folder_by_path(
    ["Bookmarks Bar", "Math/CS/ML/AI/Data"]
)
if math_folder:
    print(f"Math folder contains {len(math_folder.links)} direct bookmarks")

# Flatten the whole tree into a list of per-bookmark dicts and preview five.
flat_list = bookmarks.get_flat_bookmarks()
flat_list[:5]

[{'title': 'Google Bookmarks',
  'url': 'https://accounts.google.com/Login?continue=http://www.google.com/bookmarks&hl=en&service=bookmarks',
  'folder': 'Unnamed',
  'path': ['Unnamed'],
  'add_date': '2014-08-24T07:30:57',
  'domain': 'accounts.google.com'},
 {'title': "ePub Bud - Publish, Convert, Store, and Download free children's ebooks online for the iPad and nook color!",
  'url': 'http://www.epubbud.com/',
  'folder': 'Unnamed',
  'path': ['Unnamed'],
  'add_date': '2014-08-24T07:30:50',
  'domain': 'www.epubbud.com'},
 {'title': 'FREE Mobile Rhyming Dictionary',
  'url': 'http://m.rhymer.com/',
  'folder': 'Unnamed',
  'path': ['Unnamed'],
  'add_date': '2014-08-24T07:30:48',
  'domain': 'm.rhymer.com'},
 {'title': 'Bookmarks',
  'url': 'chrome://bookmarks/',
  'folder': 'Bookmarks Bar',
  'path': ['Unnamed', 'Bookmarks Bar'],
  'add_date': '2015-05-16T00:00:03',
  'domain': ''},
 {'title': 'Canada Revenue Agency - CRA Login',
  'url': 'https://cms-sgj.cra-arc.gc.ca/gol-ged/a