## Setup

Requires a HubSpot private app whose API access token has the following scopes:

- `crm.objects.custom.read`
- `crm.objects.companies.read`
- `crm.objects.contacts.read`
- `crm.objects.users.read`

The token should be stored in an environment variable called `HUBSPOT_ACCESS_TOKEN`.

You can copy the sample environment file to get started; run the following command from the root of this repository:

```bash
cp .env.sample .env
```

Then open `.env` and fill in your access token.

In [None]:
import os
from pathlib import Path

from hubspot import HubSpot

from data.utils import hubspot_to_df, write_json_records


# The token is read from the environment; a missing variable raises KeyError here.
ACCESS_TOKEN = os.environ["HUBSPOT_ACCESS_TOKEN"]

# File in which the paging cursor is persisted between fetches.
LAST_NOTE_PATH = Path("last_note_id")

# Number of records requested per API page.
PAGE_SIZE = 10

# Note properties to request from the API.
PROPERTIES = [
    "hs_created_by",
    "hs_createdate",
    "hs_note_body",
]

# Object types whose associations are requested alongside each note.
ASSOCIATION_TYPES = [
    "companies",
    "contacts",
    "vendors",
]

# Column names expected in the notes DataFrame, one per association type.
ASSOCIATION_COLUMNS = [
    f"associations.{assoc}.results"
    for assoc in ASSOCIATION_TYPES
]

# API client and the endpoint handles used by the cells below.
hubspot = HubSpot(access_token=ACCESS_TOKEN)
hubspot_properties_api = hubspot.crm.properties.core_api
hubspot_objects_api = hubspot.crm.objects.basic_api
hubspot_notes_api = hubspot.crm.objects.notes.basic_api

In [None]:
def get_last_note_id():
    """Return the cursor stored in ``LAST_NOTE_PATH``, or ``None`` if the file is absent.

    Surrounding whitespace is stripped from the stored value.
    """
    try:
        with LAST_NOTE_PATH.open(encoding="utf-8") as cursor_file:
            stored = cursor_file.read()
    except FileNotFoundError:
        # Nothing persisted yet.
        return None
    return stored.strip()


def update_last_note_id(last_note_id: str) -> None:
    """Persist ``last_note_id`` to ``LAST_NOTE_PATH``, replacing any previous content.

    Args:
        last_note_id: Cursor value to store; written as UTF-8 text.
    """
    LAST_NOTE_PATH.write_text(last_note_id, encoding="utf-8")

In [None]:
# Discard any stored cursor so that get_last_note_id() returns None again.
try:
    LAST_NOTE_PATH.unlink()
except FileNotFoundError:
    # No cursor file to remove.
    pass

In [None]:
# Fetch the non-archived property definitions of each object type and hand
# them to write_json_records with the matching output file name.
for object_type, output_name in (
    ("notes", "notes_props.json"),
    ("users", "users_props.json"),
):
    props_response = hubspot_properties_api.get_all(object_type, archived=False)
    props_df = hubspot_to_df(props_response)
    write_json_records(props_df, output_name)

In [None]:
def get_all_pages(api, **kwargs):
    """Collect every page returned by a paginated ``get_page`` endpoint.

    Args:
        api: Object exposing ``get_page(**kwargs)``. Each response must have a
            ``paging`` attribute; when another page exists, the cursor is read
            from ``paging.next.after``.
        **kwargs: Forwarded to ``api.get_page`` on every request. ``limit``
            falls back to ``PAGE_SIZE`` when the caller does not supply it.

    Returns:
        A list with one response object per page, in request order.
    """
    # Respect an explicit page size from the caller; otherwise use the default.
    if "limit" not in kwargs:
        kwargs["limit"] = PAGE_SIZE

    response = api.get_page(**kwargs)
    pages = [response]

    # Stop when ``paging`` is missing OR when it carries no ``next`` cursor;
    # the latter would otherwise raise AttributeError on ``.after``.
    while response.paging and response.paging.next:
        kwargs["after"] = response.paging.next.after
        response = api.get_page(**kwargs)
        pages.append(response)

    return pages

In [None]:
# Properties requested for each user record.
user_props = ["hs_given_name", "hs_family_name"]

# Walk every page of the "users" object type and convert the responses.
users_responses = get_all_pages(
    hubspot_objects_api,
    object_type="users",
    properties=user_props,
)
users = hubspot_to_df(users_responses)

In [None]:
# Inspect the result of the users fetch.
users.info()

In [None]:
# Resume from the stored cursor; None when no cursor file exists.
last_note_id = get_last_note_id()

notes_response = hubspot_notes_api.get_page(
    after=last_note_id,
    limit=PAGE_SIZE,
    properties=PROPERTIES,
    associations=ASSOCIATION_TYPES,
)

# Convert before persisting the cursor, so a conversion failure does not
# advance past a page that was never processed.
notes = hubspot_to_df(notes_response)

# The response may carry no paging information (or no next cursor) when there
# is nothing further to fetch. In that case the stored cursor is left as it
# is instead of raising AttributeError.
next_page = notes_response.paging.next if notes_response.paging else None
if next_page is not None:
    update_last_note_id(next_page.after)

In [None]:
# Ensure every association column exists; absent ones are appended (all NA).
missing_assoc = [col for col in ASSOCIATION_COLUMNS if col not in notes.columns]
notes = notes.reindex(columns=[*notes.columns, *missing_assoc])

# Keep only notes that have both a body and a creator.
notes = notes.dropna(
    subset=["properties.hs_note_body", "properties.hs_created_by"],
)

# Keep only notes with at least one non-NA association column.
notes = notes.dropna(how="all", subset=ASSOCIATION_COLUMNS)

In [None]:
# Inspect the notes that remain after the filtering above.
notes.info()