In [1]:
# install the Google Sheets client libraries into the running kernel's environment
# (explicit %pip magic instead of relying on IPython automagic for a bare `pip`)
%pip install gspread oauth2client

Note: you may need to restart the kernel to use updated packages.


In [2]:
import gspread
import json
import numpy as np
import pathlib

from oauth2client.service_account import ServiceAccountCredentials
from pandas import DataFrame, MultiIndex, to_datetime, to_numeric, read_json

In [3]:
def get_sheet(sheet, doc, keyfile="carbonplan-03794eb9a308.json"):
    """
    helper function to open a specific google sheet

    Parameters
    ----------
    sheet : str
        title of the worksheet (tab) to open inside the document
    doc : str
        title of the google spreadsheet document, as passed to ``gc.open``
    keyfile : str, optional
        path to the service-account JSON key file used to authenticate.
        Defaults to the key file name this notebook has always used, so
        existing two-argument calls behave exactly as before.

    Returns
    -------
    the worksheet object returned by ``wks.worksheet(sheet)``
    """
    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]

    credentials = ServiceAccountCredentials.from_json_keyfile_name(keyfile, scope)

    gc = gspread.authorize(credentials)
    wks = gc.open(doc)
    # use a separate local name instead of rebinding the `sheet` argument
    # (a title) to a different kind of object (a worksheet)
    worksheet = wks.worksheet(sheet)
    return worksheet

In [4]:
# open the "Sheet2" tab of the "Forest-Offset-Projects" document
sheet = get_sheet(sheet="Sheet2", doc="Forest-Offset-Projects")

In [5]:
def get_df(worksheet=None):
    """
    load the worksheet into a DataFrame with one row per sheet column

    The first sheet row is used as the header. The ``level0``/``level1``/
    ``level2`` columns become a three-level MultiIndex; blank ``level0`` and
    ``level1`` cells are forward-filled from the row above, blank ``level2``
    cells are kept as ``''``. The table is then transposed so that the
    MultiIndex labels the columns.

    Parameters
    ----------
    worksheet : object, optional
        anything with a ``get_all_values()`` method returning a list of rows
        (each a list of strings). When omitted, the notebook-level ``sheet``
        variable is used, which is the original behaviour.

    Returns
    -------
    df : DataFrame
        transposed table, indexed by the remaining sheet column names, with
        the MultiIndex as columns
    types : Series
        the sheet's ``type`` column, indexed by the same MultiIndex

    Raises
    ------
    ValueError
        if the worksheet returns no cells at all
    """
    if worksheet is None:
        # fall back to the worksheet opened earlier in the notebook
        worksheet = sheet

    data = np.asarray(worksheet.get_all_values())
    if data.size == 0:
        raise ValueError("worksheet is empty: expected a header row followed by data rows")
    df = DataFrame(data[1:], columns=data[0])

    levels = ['level0', 'level1', 'level2']
    filled = levels[:2]
    left = df[levels].copy()
    # blank cells in the two outer levels inherit the value from the row above;
    # the mask is built on the same two columns it is applied to
    left[filled] = left[filled].mask(left[filled] == '').ffill()
    index = MultiIndex.from_frame(left)

    # copy so that re-indexing the type column does not touch the frame
    types = df['type'].copy()
    types.index = index

    df.index = index
    df = df.drop(columns=levels + ['type'])
    df = df.transpose()
    # drop the first transposed row, i.e. the first sheet column after the
    # level/type columns -- presumably a non-project column; verify against the sheet
    df = df.iloc[1:]

    return df, types

In [6]:
def json_loads(v):
    """
    parse one spreadsheet cell as JSON

    Cells containing the marker text ``SEE NOTE`` are treated as missing and
    return ``None``. Anything else is handed to ``json.loads``.

    Raises
    ------
    TypeError
        if ``v`` is not a string-like value
    ValueError
        if ``v`` is not valid JSON (``json.JSONDecodeError`` is a subclass)

    In both cases the offending cell is printed first so it can be located
    in the sheet, then the original exception is re-raised.
    """
    try:
        if 'SEE NOTE' in v:
            return None
        return json.loads(v)
    except (TypeError, ValueError):
        # only report parsing problems; a bare except would also intercept
        # KeyboardInterrupt / SystemExit
        print(v)
        raise

# cell texts (lower-cased, stripped) that mean "false" in a bool column
_FALSE_STRINGS = frozenset({'', 'false', 'f', 'no', 'n', '0'})


def _cell_to_bool(v):
    """interpret a single cell as a boolean; strings are matched case-insensitively"""
    if isinstance(v, str):
        return v.strip().lower() not in _FALSE_STRINGS
    return bool(v)


def _strip_thousands(col):
    """remove ',' thousands separators from string cells, leaving other values untouched"""
    return col.map(lambda v: v.replace(',', '') if isinstance(v, str) else v)


def cast_col(col, type_str):
    """
    cast one column of raw cell values according to its declared type

    Parameters
    ----------
    col : Series
        raw values of a single column
    type_str : str
        type label taken from the sheet's ``type`` column

    Returns
    -------
    Series or list
        - ``'YYYY-MM-DD'`` and ``'[(is_intentional, size)]'``: ``col`` unchanged
        - ``'str'`` / ``'str:previous_project_id'``: ``col`` cast to ``str``
        - ``'bool'``: boolean Series; empty cells and false-like text
          (``false``, ``f``, ``no``, ``n``, ``0``, any case) are ``False``,
          every other non-empty string is ``True``
        - ``'int'`` / ``'float'``: numeric Series with thousands separators
          removed; unparseable cells become NaN
        - ``'[lon:float, lat:float]'`` / ``'[int]'``: list of parsed JSON
          values, ``[]`` for empty cells
        - anything else: list of parsed JSON values, ``""`` for empty cells
    """
    if type_str == 'YYYY-MM-DD':
        return col  # to_datetime(col, errors='coerce')
    elif type_str == 'str' or type_str == 'str:previous_project_id':
        return col.astype(str)
    elif type_str == 'bool':
        # astype(bool) alone would turn the text 'FALSE' into True, because
        # every non-empty string is truthy
        return col.map(_cell_to_bool).astype(bool)
    elif type_str == 'int':
        # strip separators like the float branch does, so '1,234' is not coerced to NaN
        return to_numeric(_strip_thousands(col), errors='coerce', downcast='integer')
    elif type_str == 'float':
        # NOTE(review): downcast='float' may reduce precision to float32 -- confirm this is intended
        return to_numeric(_strip_thousands(col), errors='coerce', downcast='float')
    elif type_str == '[lon:float, lat:float]' or type_str == '[int]':
        return [json_loads(v) if v else [] for v in col]
    elif type_str == '[(is_intentional, size)]':
        return col  # TODO
    else:
        try:
            return [json_loads(v) if v else "" for v in col]
        except Exception:
            # show the whole column that failed to parse, then propagate
            print(col)
            raise
            
# load the raw table and the declared type of each column
df, types = get_df()

# cast every column according to its declared type
# (DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent)
for index, col in df.items():
    type_str = types[index]
    df[index] = cast_col(col, type_str)

In [7]:
# preview the first rows of the cast table
df.head()

level0,documents,documents,documents,documents,documents,project,project,project,project,project,...,rp[6],rp[6],rp[6],rp[6],rp[6],rp[6],rp[6],rp[6],rp[6],rp[6]
level1,initial,initial_attachment_G,initial_attachment_G,initial_attachment_L,initial_attachment_L,project_id,arb_id,name,opo,apd,...,components,components,components,components,components,components,secondary_effects,attestation,attestation,notes
level2,date,date,source,date,source,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,...,ifm_1,ifm_3,ifm_7,ifm_8,ifm_14,ifm_17,Unnamed: 18_level_2,name,is_opo,Unnamed: 21_level_2
CAR1183,2018-01-09,,initial,,registry,CAR1183,CAFR5283,Forest Carbon Partners - Mescalero Apache Trib...,Mescalero Apache Tribe,Forest Carbon Partners,...,,,,,,,,,False,
ACR202,2015-04-23,,,,,ACR202,CAFR5043,Blue Source - Goodman Improved Forest Manageme...,Goodman Forest LLC,Blue Source,...,,,,,,,,,False,
ACR361,2019-12-04,,initial,,registry,ACR361,CAFR5361,Forest Carbon Partners - Port Graham Corporati...,Port Graham Corporation,"Forest Carbon Partners, L.P.",...,,,,,,,,,False,
CAR993,,,initial,,,CAR993,CAFR5011,Yurok Tribe/Forest Carbon Partners CKGG Improv...,Yurok Tribe,Finite Carbon,...,,,,,,,,,False,
CAR1197,2018-05-11,,registry,,registry,CAR1197,CAFR5297,Finite Carbon - Upper Hudson Woodlands ATP IFM,"Upper Hudson Woodlands ATP, LP",Finite Carbon,...,,,,,,,,,False,


In [8]:
# write the cast table to disk, one JSON object per row label
df.to_json(
    'retro.json',
    orient='index',
    date_format='iso',
    date_unit='s',
    indent=2,
)

In [9]:
def strip(x):
    """return ``x`` with leading and trailing whitespace removed"""
    return x.strip()


def str_to_tuple(s):
    """
    turn a stringified tuple such as ``"('a', 'b', '')"`` back into a tuple

    The text is first parsed as a Python literal, so labels that themselves
    contain commas or quote characters are recovered intact. If that does not
    yield a tuple of strings (for example unquoted input like ``"(a, b)"``),
    the legacy parsing is used: drop the first and last character, remove
    single quotes, split on commas and strip each part.
    """
    import ast

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, tuple) and all(isinstance(part, str) for part in parsed):
        return parsed

    # legacy fallback
    return tuple(map(strip, s[1:-1].replace("'", "").split(',')))

In [10]:
# read the exported table back and rebuild the three-level column index
# from the stringified tuples used as JSON keys
df2 = read_json('retro.json', orient='index', convert_dates=True)
df2.columns = MultiIndex.from_tuples([str_to_tuple(label) for label in df2.columns])

In [11]:
# df.compare(df2)

In [12]:
def make_project(name):
    """
    build an empty offset-project record

    Every call returns a fresh dictionary with new, empty containers for the
    ``documents``, ``project``, ``live_carbon``, ``baseline`` and ``rp``
    sections, with ``name`` stored under ``"name"``.
    """
    return dict(
        type="Offset-Project",
        name=name,
        documents={},
        project={},
        live_carbon={},
        baseline={},
        rp=[],
    )

In [13]:
projects = []
for project_id, row in df.iterrows():
    record = make_project(project_id)

    # documents: one nested dict per second-level label
    doc_kinds = row['documents'].index.get_level_values(0).unique()
    record['documents'].update(
        {kind: row[('documents', kind)].to_dict() for kind in doc_kinds}
    )

    # project: flat dict keyed by the second-level label
    record['project'].update(row['project'].droplevel(1).to_dict())

    # live_carbon
    record['live_carbon']['components'] = row[('live_carbon', 'components')].to_dict()
    record['live_carbon']['notes'] = row[('live_carbon', 'notes', '')]

    # baseline: flat entries first, then the nested groups overwrite their keys
    baseline = record['baseline']
    baseline.update(row['baseline'].droplevel(1).to_dict())
    baseline['components'] = row[('baseline', 'components')].to_dict()
    baseline['economics'] = row[('baseline', 'economics')].to_dict()

    # rp[0-?]: keep only the slots that have a submission date
    for rp_number in range(7):
        rp_key = f'rp[{rp_number}]'
        if row[(rp_key, 'date_submitted', '')]:
            period = row[rp_key].droplevel(1).to_dict()
            period['components'] = row[(rp_key, 'components')].to_dict()
            period['attestation'] = row[(rp_key, 'attestation')].to_dict()
            record['rp'].append(period)

    projects.append(record)

  return runner(coro)
  raw_cell, store_history, silent, shell_futures)


In [14]:
# serialise the assembled project records to disk
with open('retro_projects.json', 'w') as fp:
    json.dump(projects, fp, indent=2)