In [36]:
# Libraries
from google.oauth2 import service_account
from googleapiclient.discovery import build
import demjson
import numpy as np
import pandas as pd
from datetime import datetime
import os
from pathlib import Path
from glob import glob

# Local files
import utils

In [30]:
# --- Credentials and request definition -------------------------------------
# Service-account key file handed to the Google credentials loader.
SERVICE_ACCOUNT_FILE = 'byui-python-analysis-30a31cf00f2c.json'
# File containing the report request definitions that get decoded and run.
REPORT_REQUEST_FILE = 'report-request.json'

# --- Reporting window -------------------------------------------------------
# START_DATE, END_DATE and PERIOD are passed together to utils.each_period.
# NOTE(review): PERIOD looks like a pandas-style offset alias ("4D" = 4 days);
# confirm against utils.each_period.
START_DATE, END_DATE = '12/1/2019', '12/8/2019'
PERIOD = '4D'

# --- Run switches -----------------------------------------------------------
# True: request the maximum page size and keep paging through every result.
# False: stop after the first page that comes back.
REQUEST_ALL_PAGES = True
# False: stop after the first period instead of walking the whole window.
REQUEST_ALL_PERIODS = True

# --- Output locations -------------------------------------------------------
# Per-page CSVs are cached under CACHE_FOLDER/<request hash>/.
CACHE_FOLDER = './data/cache'
# Concatenated reports are written here.
REPORT_FOLDER = './data'

### Step 1: Initialize API with Service Account File

In [31]:
# Load the service-account key, requesting only the read-only Analytics scope.
readonly_scopes = ['https://www.googleapis.com/auth/analytics.readonly']
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=readonly_scopes,
)

# Build two API clients that share the same credentials:
#   service   -> 'analytics' v3
#   analytics -> 'analyticsreporting' v4 (the client handed to utils.each_page)
# NOTE(review): `service` is not referenced by any cell visible in this
# notebook; confirm whether it is still needed.
service = build('analytics', 'v3', credentials=credentials)
analytics = build('analyticsreporting', 'v4', credentials=credentials)

### Step 2: Read in Report Request file

In [32]:
# Load the report request definitions.
# The encoding is pinned to UTF-8 so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): demjson is presumably used so the file may contain relaxed
# (non-strict) JSON; confirm before swapping in the stdlib json module.
with open(REPORT_REQUEST_FILE, encoding='utf-8') as f:
    requests = demjson.decode(f.read())

for request in requests:
    if REQUEST_ALL_PAGES:
        # Set the page size to the maximum so that fewer requests are needed.
        request['pageSize'] = 100000

    # Totals and value ranges are not needed, so leave them out of the response.
    request['hideTotals'] = True
    request['hideValueRanges'] = True

    # Create this request's cache folder. The hash is computed after the
    # options above are applied, so it reflects the request as it will be sent.
    folder_name = os.path.join(CACHE_FOLDER, utils.hash_dict(request))
    Path(folder_name).mkdir(parents=True, exist_ok=True)

### Step 3: Run Report

In [35]:
# Walk the reporting window one period at a time; within each period, iterate
# over the (request index, report page) pairs produced by utils.each_page,
# log progress, and cache every page as its own CSV.
for period_requests in utils.each_period(requests, START_DATE, END_DATE, PERIOD):
    for i, report in utils.each_page(analytics, period_requests, CACHE_FOLDER):

        # The first request's date range is used to label the whole batch.
        date_range = period_requests[0]['dateRanges'][0]
        data = report.get('data')

        num_rows = len(data.get('rows', []))
        total_rows = int(data.get('rowCount', 0))
        # Without a nextPageToken this is treated as the final page, so the
        # last row index falls back to the total row count.
        last_row = int(report.get('nextPageToken', total_rows))
        page_size = period_requests[i].get('pageSize', 1000)

        page_number = np.ceil(last_row / page_size)
        page_count = np.ceil(total_rows / page_size)

        print('{} to {} report #{} page {:.0f} of {:.0f} (rows {}-{} of {})'.format(
            date_range['startDate'], date_range['endDate'], i+1,
            page_number,
            page_count,
            last_row-num_rows, last_row, total_rows
        ))

        # Sampling counts are only reported when present in the response.
        if 'samplesReadCounts' in data:
            sampling_pairs = zip(data.get('samplesReadCounts'), data.get('samplingSpaceSizes'))
            for read_count, space_size in sampling_pairs:
                print('sample rate: {:0.1%}  ({} / {})'.format(int(read_count) / int(space_size), read_count, space_size))

        # Convert the page to a DataFrame and cache it under the folder keyed
        # by the hash of the corresponding entry in `requests`.
        page_frame = utils.report_to_frame(report)
        page_name = '{}_{:.0f}.csv'.format(date_range['startDate'], page_number)
        cache_path = os.path.join(CACHE_FOLDER, utils.hash_dict(requests[i]), page_name)
        page_frame.to_csv(cache_path)

        if not REQUEST_ALL_PAGES:
            break
    if not REQUEST_ALL_PERIODS:
        break

2019-12-01 to 2019-12-04 report #1 page 1 of 6 (rows 0-100000 of 545566)
2019-12-01 to 2019-12-04 report #1 page 2 of 6 (rows 100000-200000 of 545566)
2019-12-01 to 2019-12-04 report #1 page 3 of 6 (rows 200000-300000 of 545566)
2019-12-01 to 2019-12-04 report #1 page 4 of 6 (rows 300000-400000 of 545566)
2019-12-01 to 2019-12-04 report #1 page 5 of 6 (rows 400000-500000 of 545566)
2019-12-01 to 2019-12-04 report #1 page 6 of 6 (rows 500000-545566 of 545566)
2019-12-05 to 2019-12-08 report #1 page 1 of 5 (rows 0-100000 of 425642)
2019-12-05 to 2019-12-08 report #1 page 2 of 5 (rows 100000-200000 of 425642)
2019-12-05 to 2019-12-08 report #1 page 3 of 5 (rows 200000-300000 of 425642)
2019-12-05 to 2019-12-08 report #1 page 4 of 5 (rows 300000-400000 of 425642)
2019-12-05 to 2019-12-08 report #1 page 5 of 5 (rows 400000-425642 of 425642)


### Step 4: Concatenate Reports

In [38]:
# Create the report directory if it doesn't exist.
Path(REPORT_FOLDER).mkdir(parents=True, exist_ok=True)

# Timestamp used as the report filename prefix, e.g. "Jan17_2221".
# %b is the documented abbreviated-month directive; %h is only a
# platform-specific alias for it and is not guaranteed to work everywhere.
current_time = datetime.now().strftime("%b%d_%H%M")

for i, request in enumerate(requests):
    # glob returns files in arbitrary order; sort (lexicographically) so the
    # row order of the final report is reproducible between runs.
    files = sorted(glob(os.path.join(CACHE_FOLDER, utils.hash_dict(request), "*.csv")))
    print('concatenating {} reports...'.format(len(files)))

    if not files:
        # pd.concat raises ValueError on an empty sequence, so there is
        # nothing to write for this request.
        print('no cached reports found for request #{}, skipping'.format(i))
        continue

    # The cached pages were written with their index as the first column.
    # Read that column back as the index so it is not duplicated as an extra
    # "Unnamed: 0" data column in the final report.
    df = pd.concat(pd.read_csv(path, index_col=0) for path in files)

    filename = os.path.join(REPORT_FOLDER, current_time+'_'+str(i)+'.csv')
    print('writing report to "{}"'.format(filename))
    df.to_csv(filename)

concatenating 11 reports
writing report to "./data/Jan17_2221_0.csv"
