## Setup

Requires a HubSpot private app whose API access token has the following scopes:

- `crm.objects.custom.read`
- `crm.objects.companies.read`
- `crm.objects.contacts.read`
- `crm.objects.users.read`

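The token should be stored in an environment variable called `HUBSPOT_ACCESS_TOKEN`. The notebook also requires an environment variable called `HUBSPOT_PAGE_SIZE`, an integer giving the page size used when fetching records from HubSpot.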

You can copy the sample environment file to get started; run the following command from the root of this repository:

```bash
cp .env.sample .env
```

Then open `.env` and fill in your access token.

In [None]:
import os
from pathlib import Path

from hubspot import HubSpot
import pandas as pd

from data.utils import hubspot_get_all_pages, hubspot_to_df, write_json_records


# --- configuration read from the environment / fixed paths -----------------
# Both environment variables are mandatory: a missing one raises KeyError here.
ACCESS_TOKEN = os.environ["HUBSPOT_ACCESS_TOKEN"]
# File holding the ID of the last note fetched by a previous run.
LAST_NOTE_PATH = Path("last_note_id")
PAGE_SIZE = int(os.environ["HUBSPOT_PAGE_SIZE"])

# --- what to request for each note -----------------------------------------
PROPERTIES = ["hs_created_by", "hs_createdate", "hs_note_body"]
ASSOCIATION_TYPES = ["companies", "vendors"]
# One DataFrame column name per association type, in the same order.
ASSOCIATION_COLUMNS = [
    f"associations.{association_type}.results"
    for association_type in ASSOCIATION_TYPES
]
ASSOCIATION_COLUMNS_COMPANIES, ASSOCIATION_COLUMNS_VENDORS = ASSOCIATION_COLUMNS

# --- HubSpot client and the API handles used by the cells below ------------
hubspot = HubSpot(access_token=ACCESS_TOKEN)
hubspot_notes_api = hubspot.crm.objects.notes.basic_api
hubspot_objects_api = hubspot.crm.objects.basic_api
hubspot_properties_api = hubspot.crm.properties.core_api

In [None]:
def get_last_note_id():
    """Return the note ID saved in LAST_NOTE_PATH, or None if there is none.

    The file is read as UTF-8 and surrounding whitespace is stripped.
    A missing file and an empty (or whitespace-only) file are both reported
    as None, so callers never receive an empty string as an ID.
    """
    try:
        saved_id = LAST_NOTE_PATH.read_text(encoding="utf-8").strip()
    except FileNotFoundError:
        return None
    # An empty string is not a usable ID; treat it like a missing marker.
    return saved_id or None


def update_last_note_id(last_note_id):
    """Save `last_note_id` to LAST_NOTE_PATH as UTF-8 text.

    Any previous content of the file is replaced. `last_note_id` must be a
    string, since it is handed to `Path.write_text` unchanged.
    """
    LAST_NOTE_PATH.write_text(last_note_id, encoding="utf-8")

In [None]:
# Fetch the non-archived property definitions for the "notes" and "users"
# object types and hand each one to write_json_records with its file name.
for object_type, props_file in (
    ("notes", "notes_props.json"),
    ("users", "users_props.json"),
):
    props_response = hubspot_properties_api.get_all(object_type, archived=False)
    props_df = hubspot_to_df(props_response)
    write_json_records(props_df, props_file)

In [None]:
# Request only the name properties for each user object.
user_props = ["hs_given_name", "hs_family_name"]

# Collect every page of "users" objects, then convert the pages to a DataFrame.
users_responses = hubspot_get_all_pages(
    hubspot_objects_api,
    object_type="users",
    properties=user_props,
    page_size=PAGE_SIZE,
)
users = hubspot_to_df(users_responses)

In [None]:
# Print a summary of the fetched users (pandas `DataFrame.info`).
users.info()

In [None]:
# Delete the saved last-note marker, if it exists, so that get_last_note_id()
# returns None and the next notes fetch is made with after=None.
LAST_NOTE_PATH.unlink(missing_ok=True)

In [None]:
# Continue after the note recorded by a previous run (None if no marker is saved).
last_note_id = get_last_note_id()

notes_responses = hubspot_get_all_pages(
    hubspot_notes_api,
    page_size=PAGE_SIZE,
    after=last_note_id,
    properties=PROPERTIES,
    associations=ASSOCIATION_TYPES,
)

# Only advance the marker when notes were actually returned. Indexing [-1] on
# an empty list of pages, or on an empty final page, would raise IndexError
# and abort the cell before `notes` is assigned.
if notes_responses and notes_responses[-1].results:
    update_last_note_id(notes_responses[-1].results[-1].id)

notes = hubspot_to_df(notes_responses)

In [None]:
# Print a summary of the raw notes as fetched (pandas `DataFrame.info`).
notes.info()

In [None]:
# The vendor association arrives under a portal-prefixed column name
# (presumably because it is a custom association type), so copy it to the
# column name derived from ASSOCIATION_TYPES.
# NOTE(review): an association column may be absent from the DataFrame, e.g.
# when no fetched note has that association - confirm against hubspot_to_df.
# Missing columns are created as all-NA so the steps below do not raise KeyError.
vendor_source_column = "associations.p5519226_vendors.results"
if vendor_source_column in notes.columns:
    notes[ASSOCIATION_COLUMNS_VENDORS] = notes[vendor_source_column]
for assoc_column in ASSOCIATION_COLUMNS:
    if assoc_column not in notes.columns:
        notes[assoc_column] = None

# select only the columns needed for later (keys) and their new names (values)
cols = {
    "created_at": "created_at",
    "id": "id_note",
    "properties.hs_created_by": "id_user",
    "properties.hs_note_body": "body",
    ASSOCIATION_COLUMNS_COMPANIES: ASSOCIATION_COLUMNS_COMPANIES,
    ASSOCIATION_COLUMNS_VENDORS: ASSOCIATION_COLUMNS_VENDORS,
}
notes = notes[list(cols)]
# and rename some for simplicity
notes = notes.rename(columns=cols)

# drop notes without a body
notes = notes.dropna(subset=["body"])

# drop notes without a creator
notes = notes.dropna(subset=["id_user"])

# drop notes without any of the association types (e.g. all are NA)
notes = notes.dropna(subset=ASSOCIATION_COLUMNS, how="all")

# expand list-like association columns into rows
# each column is exploded separately since they have different counts of NAs;
# NA values are kept as a single NA row, so a note with at most one company and
# at most one vendor still occupies exactly one row, while a note with several
# associations of a type gets one row per association
notes = (notes.explode(ASSOCIATION_COLUMNS_COMPANIES, ignore_index=True)
              .explode(ASSOCIATION_COLUMNS_VENDORS, ignore_index=True))


def _association_id(value):
    """Return the "id" entry of an association dict, or NaN when there is none."""
    if isinstance(value, dict):
        return value.get("id", float("nan"))
    return float("nan")


# pull the "id" out of each association value
# e.g. the columns have values like:
#     {"id": 12345, "type": "note_to_company"}
# and we want the value from "id" in its own column in the DataFrame.
# Mapping value by value (instead of .apply(pd.Series)["id"]) also works when a
# column holds no dicts at all or the DataFrame is empty, where the expanded
# frame would have no "id" column to select.
id_company = notes[ASSOCIATION_COLUMNS_COMPANIES].map(_association_id)
id_vendor = notes[ASSOCIATION_COLUMNS_VENDORS].map(_association_id)
notes = notes.assign(id_company=id_company, id_vendor=id_vendor)

# remove now-expanded columns and clean up index
notes = notes.drop(columns=ASSOCIATION_COLUMNS).reset_index(drop=True)

In [None]:
# Print a summary of the cleaned notes (pandas `DataFrame.info`).
notes.info()