# Obtaining Storage Statistics for All Workspaces Associated with a Single FireCloud Billing Project

In [1]:
import json
import os
import csv
import subprocess
from fisswrapper.workspace_functions import list_workspaces
from fisswrapper.workspace_functions import get_workspace_info
import pandas as pd


In [2]:
# Billing project whose workspaces are inspected.
project = "broad-firecloud-testing"
# Intermediate TSV (workspace name, owners, bucket name), named after the
# project so that changing `project` does not leave a stale filename behind.
outputFile = "{0}.tsv".format(project)
# File of bucket sizes, one per line, read back in when the tables are combined.
tmpFile = "tmp.txt"

In [3]:
# Fetch the workspace listing for the billing project. Each entry is consumed
# later as a tab-separated string whose first two fields are read as the
# project and the workspace name.
workspaceList = list_workspaces(project=project)

In [4]:
# Write one tab-separated row per workspace: name, owners, bucket name.
with open(outputFile, 'w') as out:
    for workspaceListEntry in workspaceList:
        # Entry layout: field 0 is the project, field 1 is the workspace name.
        entryFields = workspaceListEntry.split('\t')
        # Use a loop-local name here: assigning to `project` would overwrite
        # the configuration variable that the listing cell depends on.
        workspaceProject = entryFields[0]
        workspace = entryFields[1]
        workspaceInfo = json.loads(get_workspace_info(workspaceProject, workspace))
        owners = workspaceInfo['owners']
        bucketName = workspaceInfo['workspace']['bucketName']
        # `owners` is written with its default string formatting.
        out.write("{0}\t{1}\t{2}\n".format(workspace, owners, bucketName))

# IMPORTANT!
Before going further, execute the get_bucket_sizes.sh script, which will create a tmp.txt file containing the bucket sizes.  I'm doing it this way because I was having trouble running gsutil from within my Python notebook.  The bash script writes the bucket sizes to tmp.txt, one integer per line.  The nth line in tmp.txt corresponds to the nth line in the broad-firecloud-testing.tsv file.

In the following cell we combine the contents of broad-firecloud-testing.tsv and tmp.txt into a list object, and then create a pandas data frame from that list.

In [16]:
# Read both files fully so their row counts can be compared before pairing.
with open(outputFile) as tsvfile, open(tmpFile) as tmpfile:
    workspaceRows = list(csv.reader(tsvfile, delimiter='\t'))
    sizeRows = list(csv.reader(tmpfile, delimiter='\t'))

# Rows are matched purely by position, and zip() would silently drop the
# surplus rows of the longer file, so a length mismatch must be an error.
if len(workspaceRows) != len(sizeRows):
    raise ValueError(
        "{0} has {1} rows but {2} has {3}; regenerate the bucket sizes "
        "before combining".format(outputFile, len(workspaceRows),
                                  tmpFile, len(sizeRows)))

# Append the size fields of the nth size row to the nth workspace row.
combinedTable = [workspaceRow + sizeRow
                 for workspaceRow, sizeRow in zip(workspaceRows, sizeRows)]
labels = ['workspace_name', 'workspace_owners', 'workspace_bucket', 'bucket_size']
myDataFrame = pd.DataFrame.from_records(combinedTable, columns=labels)
# csv.reader yields strings; convert bucket_size from string to numeric so it
# sorts by value.
myDataFrame['bucket_size'] = pd.to_numeric(myDataFrame['bucket_size'])

In [18]:
# Largest buckets first. Reassign rather than mutate with inplace=True, which
# brings no benefit and prevents method chaining.
myDataFrame = myDataFrame.sort_values(by='bucket_size', ascending=False)

In [19]:
# Show only the first ten rows instead of dumping the whole table.
myDataFrame.head(10)

Unnamed: 0,workspace_name,workspace_owners,workspace_bucket,bucket_size
228,REBC_Oct16,"['stewart@broadinstitute.org', 'mhanna@broadin...",fc-4520e968-446a-4f9f-85e2-8fcc901269dc,2.949855e+14
59,Broad_MutationCalling_QC_Workflow_V1_BestPract...,"['mhanna@broadinstitute.org', 'esalinas@broadi...",fc-01a0364e-80b5-4f9e-b787-61c66a79f720,8.452215e+12
194,PanCan_Full,"['igleshch@broadinstitute.org', 'danielr@broad...",fc-ca949d54-dc6f-41c9-b698-117650bf779a,3.637411e+12
4,ACC_PanCan,"['danielr@broadinstitute.org', 'dlivitz@broadi...",fc-67d97596-903e-4405-83b3-ec258265962e,2.660387e+12
229,REBC_Oct16_test_splitting,['esalinas@broadinstitute.org'],fc-36f0435a-17fe-4ec4-859d-7043230509ca,2.469926e+12
97,Dev_GINC,['dlivitz@broadinstitute.org'],fc-1c01ba44-d1d4-470f-b4af-11fc8eb0532f,1.447478e+12
268,TCGA_UCS_ControlledAccess_V1-0_DATA_PCAWG,"['birger@broadinstitute.org', 'mhanna@broadins...",fc-b9a868f5-8c40-4252-b119-cf23ac1d8a2c,1.290774e+12
94,deTiN_public,['amaro@broadinstitute.org'],fc-97c909e0-8389-4238-beeb-32693b59b846,1.079730e+12
168,MEETING_10_25,"['mhanna@broadinstitute.org', 'stewart@broadin...",fc-75b01897-978e-4a63-9695-f82cf28ccb2d,1.009873e+12
293,THCA_filtering_ws,"['birger@broadinstitute.org', 'stewart@broadin...",fc-bea5e26a-be4e-4c95-afac-8c0ccb4cb083,8.729051e+11


Now write the sorted DataFrame to a TSV file.

In [13]:
# Save the table as tab-separated text. With to_csv's defaults the DataFrame
# index is written as the first column.
finalFilename = 'broad-firecloud-testing-complete.tsv'
myDataFrame.to_csv(path_or_buf=finalFilename, sep='\t')