# import and setup

In [24]:
import os
from pathlib import Path 
import datetime as dt 
import os, json, time, math, requests
from intuitlib.client import AuthClient
import subprocess, shlex

In [13]:
from pyspark.sql import SparkSession, Row, types as T, functions as F 
import re

In [14]:
# Read the pipeline settings (JSON) stored under the user's home directory
# and display the parsed dictionary as the cell output.
config_path = "projects/ETLPipeline/ETLCodeBase_Spark/config/info.json"
with (Path.home() / config_path).open("r") as config_file:
    config = json.load(config_file)
config

{'systeminfo': {'last_load_date': '2000-01-01', 'last_fx': 1.0},
 'companyinfo': {'us_companies': ['MFUSA', 'MFAZ', 'MSUSA', 'MPUSA'],
  'ca_companies': ['MSL', 'NexGen', 'MFBC', 'MPL', 'MFL']},
 'directories': {'base': 'projects/ETLPipeline/Database',
  'credentials': {'QBO': '.inputs/dev'},
  'bronze': {'QBO': {'PL': 'Bronze/QBO/PL'}},
  'silver': {'QBO': {'PL': 'Silver/QBO/PL'}}}}

In [15]:
# Record today's date (ISO format) as the latest load date in the in-memory config.
# Only the dictionary is updated here; nothing is written back to disk in this cell.
today = dt.date.today()
config["systeminfo"].update({"last_load_date": today.isoformat()})

# Job class - task creator

In [16]:
class Job:
    """Describe one ETL run: the fiscal years to load plus a USD->CAD rate.

    Attributes set by ``__init__``:
        today: the date on which the job object was created.
        scope: ascending list of fiscal years covered by this run.

    Attribute set by ``get_fx``:
        fx: the USD->CAD exchange rate as a float.
    """

    def __init__(self, light_load:bool = True, lastFY:bool = False):
        """Work out which fiscal years this run covers.

        Args:
            light_load: when True only recent fiscal years are loaded;
                when False the scope starts at 2019.
            lastFY: only consulted for a light load; when True the previous
                fiscal year is included in addition to the current one.
        """
        self.today = dt.date.today()
        # From November onward the date is attributed to the next year's fiscal year.
        current_FY = self.today.year + 1 if self.today.month >= 11 else self.today.year
        if not light_load:
            first_year = 2019
        elif lastFY:
            first_year = current_FY - 1
        else:
            first_year = current_FY
        self.scope = list(range(first_year, current_FY + 1))

    def get_fx(self):
        """Fetch the current USD->CAD rate from Alpha Vantage and store it on ``self.fx``.

        The API key is read from the ``ALPHAVANTAGE_KEY`` environment variable.

        Raises:
            KeyError: if the API key is not set, or if the response does not
                contain the exchange-rate entry (presumably an error or
                rate-limit notice from the service -- confirm against the API docs).
            requests.HTTPError: if the service answers with an HTTP error status.
        """
        key = os.getenv("ALPHAVANTAGE_KEY")
        if not key:
            raise KeyError("ALPHAVANTAGE_KEY environment variable is not set")

        # Let requests build the query string so the key is URL-encoded properly.
        url = "https://www.alphavantage.co/query"
        params = {
            "function": "CURRENCY_EXCHANGE_RATE",
            "from_currency": "USD",
            "to_currency": "CAD",
            "apikey": key,
        }
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()

        try:
            quote = payload["Realtime Currency Exchange Rate"]["5. Exchange Rate"]
        except (KeyError, TypeError) as exc:
            # Report which top-level keys came back without echoing the API key.
            detail = list(payload) if isinstance(payload, dict) else type(payload).__name__
            raise KeyError(
                f"Alpha Vantage response has no exchange rate (got: {detail})"
            ) from exc
        self.fx = float(quote)

    def check_file(self, path: Path) -> None:
        """Ensure the directory ``path`` exists, creating missing parents as needed.

        Despite the name, this creates a directory at ``path``; it does not
        check for or create a regular file.
        """
        path.mkdir(parents=True, exist_ok=True)

In [17]:
# Build the job descriptor for a light load that also includes the previous
# fiscal year, then display the fiscal years it covers.
# NOTE: the module-level name `self` is read by the job-building cell below.
self = Job(lastFY=True, light_load=True)
self.scope

[2025, 2026]

In [20]:
# Display the current calendar year (taken from `today`) as a quick reference value.
today.year

2025

## create jobs

In [21]:
# Build one (company, start_date, end_date) tuple per company and calendar quarter.
# MFL tasks are collected separately from every other company's tasks.
jobs_MFL = []
jobs_others = []
# Last day of the month, keyed by the closing month of each calendar quarter.
last_day = {3: 31, 6:30, 9:30, 12:31}
all_companies = config["companyinfo"]["us_companies"] + config["companyinfo"]["ca_companies"]
for company in all_companies:
    jobs = jobs_MFL if company == "MFL" else jobs_others
    fy = self.scope[0]
    # Lead-in task: the final calendar quarter (Oct-Dec) of the year before the
    # first fiscal year in scope, so that fiscal year is covered consistently.
    lead_in = (company, dt.date(fy - 1, 10, 1), dt.date(fy - 1, 12, 31))
    jobs.append(lead_in)
    for year in self.scope:
        for month in (1, 4, 7, 10):
            quarter_start = dt.date(year, month, 1)
            # Only quarters that have already started are queued.
            if quarter_start <= today:
                closing_month = month + 2
                quarter_end = dt.date(year, closing_month, last_day[closing_month])
                jobs.append((company, quarter_start, quarter_end))
len(jobs_MFL), len(jobs_others)

(5, 40)

In [22]:
jobs_MFL

[('MFL', datetime.date(2024, 10, 1), datetime.date(2024, 12, 31)),
 ('MFL', datetime.date(2025, 1, 1), datetime.date(2025, 3, 31)),
 ('MFL', datetime.date(2025, 4, 1), datetime.date(2025, 6, 30)),
 ('MFL', datetime.date(2025, 7, 1), datetime.date(2025, 9, 30)),
 ('MFL', datetime.date(2025, 10, 1), datetime.date(2025, 12, 31))]

In [None]:
# Resolve the database root and the QBO credential folder beneath it,
# then list the credential folder's entries as the cell output.
BASE_DIR = Path.home().joinpath(config["directories"]["base"])
token_path = BASE_DIR.joinpath(config["directories"]["credentials"]["QBO"])
os.listdir(token_path)

['client_secrets.json', 'copies', 'tokens.json']

In [None]:
def _refresh_auth_client(company: str, config:dict) -> AuthClient:
    """Build an AuthClient for ``company`` from the credentials stored on disk.

    The client id/secret are read from ``client_secrets.json`` (the "USA" entry
    for US companies, the "CA" entry for everything else) and the access token,
    refresh token and realm id are read from ``tokens.json``. Both files live
    in the QBO credential directory named in ``config["directories"]``.

    NOTE: despite the name, no token refresh is performed; the stored tokens
    are attached as-is (the refresh-and-persist step is commented out below).

    Args:
        company: company code used as the key into ``tokens.json``.
        config: pipeline configuration dictionary.

    Returns:
        An AuthClient carrying the stored tokens and realm id for ``company``.

    Raises:
        KeyError: if ``company`` has no entry in ``tokens.json`` or an expected
            key is missing from the config or credential files.
    """
    mode = "production"
    redirect_uri = "https://developer.intuit.com/v2/OAuth2Playground/RedirectUrl"
    BASE_DIR = Path.home() / config["directories"]["base"]
    token_path = BASE_DIR / config["directories"]["credentials"]["QBO"]
    with open(token_path/"client_secrets.json", "r") as f:
        secret = json.load(f)

    # US membership comes from the config so newly added US companies get the
    # right credentials; the legacy hard-coded list is kept as a fallback.
    us_companies = {"MFUSA", "MPUSA", "MFAZ", "MSUSA"}
    us_companies.update(config.get("companyinfo", {}).get("us_companies") or [])
    region = "USA" if company in us_companies else "CA"

    # create auth_client object
    auth_client = AuthClient(client_id = secret[region]["client_id"],
                             client_secret = secret[region]["client_secret"],
                             redirect_uri = redirect_uri,
                             environment = mode)

    # assign tokens
    with open(token_path/"tokens.json", "r") as f:
        tokens = json.load(f)
    if company not in tokens:
        raise KeyError(f"no stored QBO tokens for company {company!r}")
    company_tokens = tokens[company]
    auth_client.access_token = company_tokens["access_token"]
    auth_client.refresh_token = company_tokens["refresh_token"]
    auth_client.realm_id = company_tokens["realm_id"]

    # Disabled refresh-and-persist step, kept for reference:
    # # refresh
    # auth_client.refresh()
    # # save refreshed tokens
    # tokens[company]["access_token"] = auth_client.access_token
    # tokens[company]["refresh_token"] = auth_client.refresh_token
    # tokens[company]["realm_id"] = auth_client.realm_id
    # with open(token_path/"tokens.json", "w") as f:
    #     json.dump(tokens, f, indent=4)
    return auth_client

## append credentials

In [57]:
# Turn every (company, start, end) job into a self-contained extraction task
# carrying the company's realm id and access token plus the output file path.
extract_MFL = []
extract_others = []
current_company = ""
# Root folder for raw P&L files; each company gets its own sub-folder.
bronze_pl_dir = BASE_DIR / config["directories"]["bronze"]["QBO"]["PL"]
for (company, start, end) in jobs_MFL + jobs_others:
    if current_company != company:
        # Credentials are rebuilt only when the company changes between consecutive jobs.
        auth_client = _refresh_auth_client(company, config)
        current_company = company
    extract = extract_MFL if company == "MFL" else extract_others
    task = {
        "company": company,
        "realm_id": auth_client.realm_id,
        "token": auth_client.access_token,
        "start": start.isoformat(),
        "end": end.isoformat(),
        "report": "ProfitAndLossDetail",
        # One file per company and quarter, named "<year>_<start month>.json".
        "out_path": bronze_pl_dir / company / f"{start.year}_{start.month}.json",
    }
    extract.append(task)
len(extract_MFL) + len(extract_others)

45

# start Spark

In [None]:
# spark.stop()

In [84]:
# Ask Poetry for its virtualenv directory and point at the interpreter inside it.
poetry_env_dir = subprocess.check_output(
    ["poetry", "env", "info", "--path"], text=True
).strip()
PY = poetry_env_dir + "/bin/python"

# Settings applied to the session builder, in order.
spark_settings = [
    ("spark.local.ip", "127.0.0.1"),             # silences loopback complaints
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.driver.host", "127.0.0.1"),
    ("spark.pyspark.driver.python", PY),         # ensure Poetry python on driver
    ("spark.pyspark.python", PY),                # ...and executors
    # ("spark.python.use.daemon", "true"),       # default; faster
]

# local[*]: use all cores during dev
builder = SparkSession.builder.appName("test").master("local[*]")
for option, value in spark_settings:
    builder = builder.config(option, value)
spark = builder.getOrCreate()

# Extract

## partition

In [68]:
# Alternative kept for reference: pre-bucket the tasks by hand
# (MFL into 3 groups, all other companies into 5 groups).
# MFL_partition = [extract_MFL[i::3] for i in range(3)]
# other_partition = [extract_others[i::5] for i in range(5)]
# partitions = MFL_partition + other_partition

# Current approach: one flat task list, MFL tasks first; show how many tasks there are.
partitions = [*extract_MFL, *extract_others]
len(partitions)

45

In [76]:
def extract_partition(it, timeout=(10, 120)) -> None:
    """Download the QBO report for every task in one partition and save it to disk.

    Each task is a dict with the keys "company", "realm_id", "token", "start",
    "end", "report" and "out_path". The raw response body is written unchanged
    to ``task["out_path"]``; parent directories are created as needed.

    Args:
        it: iterable of task dicts.
        timeout: (connect, read) timeout in seconds passed to ``requests``, so a
            stalled connection cannot hang the task indefinitely.

    Raises:
        RuntimeError: if the API answers with an HTTP error status. Nothing is
            written for that task, so an error body is never stored as if it
            were report data.
    """
    BASE_URL = "https://quickbooks.api.intuit.com"
    minor_version = 75

    # TODO: request_with_retry -- retries are not implemented yet.

    # The context manager closes the pooled connections when the partition is done.
    with requests.Session() as session:
        session.headers.update({"Accept": "application/json"})

        for task in it:
            # Tasks may belong to different companies, so the token is set per task.
            session.headers.update({
                "Authorization": f'Bearer {task["token"]}',
            })
            company = task["company"]
            realm_id = task["realm_id"]
            start = task["start"]
            end = task["end"]
            report_name = task["report"]

            url = f"{BASE_URL}/v3/company/{realm_id}/reports/{report_name}"
            params = {
                "minorversion": minor_version,
                "start_date": start,
                "end_date": end,
                "columns": "all"
            }

            resp = session.get(url, params=params, timeout=timeout)
            if not resp.ok:
                raise RuntimeError(
                    f"QBO {report_name} request failed for {company} "
                    f"({start} to {end}): HTTP {resp.status_code}"
                )
            payload = resp.content

            task["out_path"].parent.mkdir(parents=True, exist_ok=True)

            with open(task["out_path"], "wb") as f:
                f.write(payload)

    

In [86]:
# Spread the task list over three slices per (assumed) core and run the
# extraction on each slice for its side effects (files written to disk).
cores = 8
num_slices = cores * 3
rdd = spark.sparkContext.parallelize(partitions, numSlices=num_slices)
rdd.foreachPartition(extract_partition)

                                                                                