# Calculate cloud costs per day


In [4]:
import json
import subprocess
import csv
import sys
from google.cloud import storage
import io
import tempfile
from dateutil.parser import parse

In [5]:
def totals_from_json(file):
    """Sum costs per (startTime, endTime) pair from a JSON billing export.

    `file` is a readable file-like object whose content `json.load` parses
    into an iterable of records. Every record must provide
    `record['cost']['amount']` (any value `float()` accepts) as well as
    `record['startTime']` and `record['endTime']`.

    Returns a dict mapping `(startTime, endTime)` tuples to the summed cost.
    """
    cost_by_range = {}
    records = json.load(file)

    for record in records:
        # The amount is converted with float() before it is accumulated.
        amount = float(record['cost']['amount'])
        key = (record['startTime'], record['endTime'])
        running = cost_by_range.get(key, 0)
        cost_by_range[key] = running + amount

    return cost_by_range

def totals_from_csv(file):
    """Sum costs per (Start Time, End Time) pair from a CSV billing export.

    `file` is an iterable of CSV lines (e.g. an open text file) whose header
    row names at least the columns 'Start Time', 'End Time' and 'Cost'.

    Returns a dict mapping `(Start Time, End Time)` tuples to the summed
    'Cost' values, each converted with `float()`.
    """
    cost_by_range = {}

    for record in csv.DictReader(file):
        key = (record['Start Time'], record['End Time'])
        running = cost_by_range.get(key, 0)
        cost_by_range[key] = running + float(record['Cost'])

    return cost_by_range

In [20]:
def get_daily_cost(project, billing_bucket_name, kind='json'):
    """Aggregate the billing exports in a storage bucket into sorted totals.

    Lists the blobs of `billing_bucket_name` (accessed through a
    `storage.Client` created for `project`) whose names start with
    'report-' when `kind == 'csv'`, or 'billing-' for any other `kind`.
    Each blob is downloaded, decoded with the default codec of
    `bytes.decode()`, and parsed with `totals_from_csv` or
    `totals_from_json` respectively. Costs that share the same
    (start, end) pair are summed across all blobs.

    Returns a list of dicts with the keys 'start_time', 'end_time' and
    'cost', ordered by 'start_time' (compared as the parsers returned it).
    """
    is_csv = kind == 'csv'

    gcs = storage.Client(project)
    billing_bucket = storage.Bucket(gcs, billing_bucket_name)

    # Anything that is not 'csv' is handled as a JSON export.
    name_prefix = 'report-' if is_csv else 'billing-'
    parse_totals = totals_from_csv if is_csv else totals_from_json

    combined = {}
    for blob in billing_bucket.list_blobs(prefix=name_prefix):
        text = blob.download_as_string().decode()
        per_blob = parse_totals(io.StringIO(text))

        # Fold this blob's totals into the running totals for all blobs.
        for period, amount in per_blob.items():
            combined[period] = combined.get(period, 0) + amount

    # We want to push out sorted jsonl
    rows = []
    for (start_time, end_time), cost in combined.items():
        rows.append({'start_time': start_time, 'end_time': end_time, 'cost': cost})

    return sorted(rows, key=lambda row: row['start_time'])

In [21]:
# Aggregate the billing exports of bucket 'ucb_datahub_18_billing' in project
# 'ucb-datahub-2018'; `kind` is not passed, so get_daily_cost uses its default.
costs = get_daily_cost('ucb-datahub-2018','ucb_datahub_18_billing')



In [22]:
# Write one JSON document per line (JSON Lines), one line per entry in `costs`.
with open('../data/processed/cloud-costs.jsonl', 'w') as f:
    for day in costs:
        print(json.dumps(day), file=f)


In [None]:
!ls