In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from PIL import Image, ImageDraw, ImageFont
from pyvis.network import Network

In [None]:
# OAuth scope requested for the service account: read-only access to Analytics data.
SCOPES = ["https://www.googleapis.com/auth/analytics.readonly"]

# File name of the service-account JSON key downloaded from Google Cloud Platform.
KEY_FILE = "bbe-web-cro-key.json"

# Your Google Universal Analytics view ID.
# To find it, open the drop-down at the top-left of the Google Analytics home page
# (where it says 'All accounts >'); the number is listed in the right-hand column beneath the view name.
VIEW_ID = "XXXXXX"

In [None]:
# Date range for the Google Analytics API request.
start_dt = "14daysAgo"
end_dt = "yesterday"

# The dimensions and metrics to request are kept as plain lists so they can be handed to the
# request and DataFrame helpers instead of being spelled out inside every request body.
dimensions = [
    "ga:PagePath",
    "ga:PreviousPagePath",
]
metrics = [
    "ga:Pageviews",
    "ga:exits",
    "ga:entrances",
    "ga:avgTimeOnPage",
]

In [None]:
def initialize_analyticsreporting():
    """Create the Analytics Reporting API (v4) client.

    Service-account credentials are loaded from the JSON key file named by
    KEY_FILE for the scopes listed in SCOPES.

    Returns:
        The client object produced by `build('analyticsreporting', 'v4', ...)`.
    """
    return build(
        'analyticsreporting',
        'v4',
        credentials=ServiceAccountCredentials.from_json_keyfile_name(KEY_FILE, SCOPES),
    )

In [None]:
def get_report(analytics, dimensions, metrics, start_dt='7daysAgo', end_dt='yesterday', nextPageToken=None,
               view_id=None):
    """Request one page of report rows from the Analytics Reporting API.

    Args:
        analytics: API client exposing `reports().batchGet(body=...).execute()`.
        dimensions: List of dimension names; each becomes a `{'name': ...}` entry.
        metrics: List of metric expressions; each becomes an `{'expression': ...}` entry.
        start_dt: First day of the date range. Defaults to '7daysAgo'.
        end_dt: Last day of the date range. Defaults to 'yesterday'.
        nextPageToken: Token of the page to fetch; None requests the first page.
        view_id: Analytics view to query; None falls back to the notebook-level VIEW_ID.

    Returns:
        Whatever `execute()` returns for the batchGet request.
    """
    # The previous defaults (start 'yesterday', end '7daysAgo') described a range that
    # starts after it ends; they are now in chronological order.
    report_request = {
        'viewId': VIEW_ID if view_id is None else view_id,
        'dateRanges': [{'startDate': start_dt, 'endDate': end_dt}],
        # Build the request dictionaries from the plain lists of names.
        'dimensions': [{'name': name} for name in dimensions],
        'metrics': [{'expression': exp} for exp in metrics],
        # Results are paginated; this is the number of rows requested per page.
        'pageSize': '10000',
        # If you have a great deal of web traffic UA will only return a sample.
        # You can control the size of that sample by specifying 'SMALL', 'MEDIUM', or 'LARGE'.
        'samplingLevel': 'LARGE',
    }

    # Only send a page token when there is one, rather than an explicit null.
    if nextPageToken is not None:
        report_request['pageToken'] = nextPageToken

    return analytics.reports().batchGet(body={'reportRequests': [report_request]}).execute()

In [None]:
def response_to_df(response, dimensions, metrics, nextPageToken=None):
    """Flatten a (possibly paginated) report response into a pandas DataFrame.

    The rows of `response` are read first. While the first report of the current
    response carries a `nextPageToken`, the following page is requested with
    `get_report` (using the notebook-level `analytics`, `start_dt` and `end_dt`)
    and its rows are added as well.

    Args:
        response: First page of results as returned by `get_report`. When None,
            the starting page is fetched here instead.
        dimensions: Dimension names, in the order they were requested.
        metrics: Metric expressions, in the order they were requested.
        nextPageToken: Only used when `response` is None: the page token to start from.

    Returns:
        DataFrame with one row per report row and one column per dimension and
        metric, named after the part following the last ':' (e.g. 'ga:exits' ->
        'exits'). Values are kept exactly as they appear in the response.
    """
    if response is None:
        response = get_report(analytics, dimensions, metrics, start_dt, end_dt, nextPageToken)

    columns = list(dimensions) + list(metrics)
    records = []

    while True:
        reports = response.get('reports') or []

        for report in reports:
            for row in report.get('data', {}).get('rows', []):
                record = dict(zip(dimensions, row.get('dimensions', [])))

                # One entry per requested date range; as before, the values of the
                # last range are the ones kept.
                date_range_values = row.get('metrics', [])
                metric_values = date_range_values[-1].get('values', []) if date_range_values else []
                record.update(zip(metrics, metric_values))

                records.append(record)

        # Every page, including the last one, has been read before deciding to stop.
        nextPageToken = reports[0].get('nextPageToken') if reports else None
        if not nextPageToken:
            break

        print(nextPageToken)
        response = get_report(analytics, dimensions, metrics, start_dt, end_dt, nextPageToken)

    # Build the frame once instead of appending page by page.
    _data = pd.DataFrame(records, columns=columns)
    _data.columns = [col.split(':')[-1] for col in _data.columns]

    return _data

In [None]:
# Create the Analytics Reporting API client used for the report requests.
analytics = initialize_analyticsreporting()

In [None]:
# Request the first page of results (no page token) for the configured dimensions, metrics and date range.
response = get_report(analytics, dimensions, metrics, start_dt, end_dt, None)

In [None]:
# Collect the report rows into a pandas DataFrame.
analytics_df = response_to_df(response, dimensions, metrics)

In [None]:
# Unless you've filtered them out in your Universal Analytics view, page paths usually still carry
# their query parameters. Keep only the part before the first '?' so that every variant of a page
# is grouped under the same path.
analytics_df['PagePath'] = analytics_df['PagePath'].map(lambda path: path.partition('?')[0])
analytics_df['PreviousPagePath'] = analytics_df['PreviousPagePath'].map(lambda path: path.partition('?')[0])

In [None]:
# Give the metric columns numeric dtypes so they can be summed, averaged and divided later on.
analytics_df = analytics_df.astype({
    'Pageviews': 'int64',
    'exits': 'int64',
    'entrances': 'int64',
    'avgTimeOnPage': 'float',
})

In [None]:
# How each metric is combined when several rows collapse onto the same
# (page, previous page) pair: counts are summed, the time on page is averaged.
agg_dict = dict(
    Pageviews='sum',
    exits='sum',
    entrances='sum',
    avgTimeOnPage='mean',
)

# Group by page and previous page, aggregate the metrics, and put the most-trafficked
# transitions first with a fresh 0..n-1 index.
analytics_df = (
    analytics_df
    .groupby(['PagePath', 'PreviousPagePath'], as_index=False)
    .agg(agg_dict)
    .sort_values('Pageviews', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Exit rate of each (page, previous page) row, based on the aggregated page data.
analytics_df['exitrate'] = analytics_df['exits'] / analytics_df['Pageviews']


def _content_group(path):
    """Return the content grouping for a page path; the first matching rule wins."""
    if 'application' in path:
        return 'application'
    if '/program' in path:
        return 'programs'
    if '/blog' in path:
        return 'blog'
    return 'default'


# Pages fall within certain categories; these columns designate the category of the previous
# page (s_grouping) and of the page itself (d_grouping).
# Analytics allows you to do this in the platform as Content Groupings.
# One classifier applied per column replaces the row-by-row loop and its duplicated rule chain.
analytics_df['s_grouping'] = analytics_df['PreviousPagePath'].map(_content_group)
analytics_df['d_grouping'] = analytics_df['PagePath'].map(_content_group)

In [None]:
# Formatting options for the network graph, written as a JavaScript-style `var options = {...}`
# string and handed to ex_net.set_options() when the graph is built. It covers:
#   - nodes/edges: font stroke width, arrow heads pointing at the destination, inherited edge
#     colours and straight (non-smoothed) edges;
#   - physics: the forceAtlas2Based solver and its spring/gravity/damping settings.
var_options = """ var options = {
  "nodes": {
    "font": {
      "strokeWidth": 1
    }
  },
  "edges": {
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    },
    "color": {
      "inherit": true
    },
    "font": {
      "strokeWidth": 1
    },
    "smooth": false
  },
  "physics": {
    "forceAtlas2Based": {
      "springLength": 150,
      "gravitationalConstant": -400,
      "springConstant": 0.20,
      "damping": 0.99,
      "avoidOverlap": 0.84
    },
    "minVelocity": 0.75,
    "solver": "forceAtlas2Based",
    "timestep": 0.93
  }
}

"""

In [None]:
# Every node group has its own resting colour (used for both border and background), while the
# highlight and hover states share one accent colour across all groups. color_dict maps each
# group name to the nested colour dictionary describing those states.
_STATE_ACCENT = '#192E4E'
_GROUP_BASE_COLORS = {
    'default': '#98C8ED',
    'application': '#26B4B2',
    'blog': '#F7C793',
    'programs': '#AB6027',
}

color_dict = {
    group_name: {
        'border': base_color,
        'background': base_color,
        'highlight': {'border': _STATE_ACCENT, 'background': _STATE_ACCENT},
        'hover': {'border': _STATE_ACCENT, 'background': _STATE_ACCENT},
    }
    for group_name, base_color in _GROUP_BASE_COLORS.items()
}

In [None]:
# Initialize the network graph, set formatting and enable use within a Jupyter Notebook.
ex_net = Network(height='750px', width='100%', bgcolor='white', font_color='black', notebook=True)

# Apply the formatting defined in var_options. set_options() and show_buttons(filter_=True)
# (commented out near the end of this cell) do not work well together: use show_buttons to tune
# the formatting interactively, then copy the generated options into var_options to recreate the
# graph with your chosen formatting easily in the future.
ex_net.set_options(var_options)

# Chart the first 10% of rows (analytics_df is sorted by Pageviews, most-trafficked first)
# plus every transition whose previous page is an application page.
views_min_int = int(len(analytics_df) * 0.10)
ex_data_df = analytics_df[(analytics_df['s_grouping'] == 'application') | (analytics_df.index < views_min_int)]

# Edge widths are Pageviews relative to the third-largest Pageviews value, capped at 1, so the
# busiest transitions do not flatten every other edge. The column is selected by name: the former
# positional lookup `iloc[2, 4]` picked up the 'entrances' column and raised on frames with fewer
# than three rows. The floor of 1 avoids dividing by zero.
top_views = analytics_df['Pageviews'].nlargest(3)
max_width_int = max(int(top_views.min()), 1) if len(top_views) else 1

# Exit rate per page (total exits / total pageviews over all of its origins). Pages that only ever
# appear as a previous page have no entry here and therefore get no exit rate of their own.
page_totals = analytics_df.groupby('PagePath')[['exits', 'Pageviews']].sum()
page_totals = page_totals[page_totals['Pageviews'] > 0]
exit_rate_by_page = (page_totals['exits'] / page_totals['Pageviews']).to_dict()

edge_columns = ['PreviousPagePath', 'PagePath', 'Pageviews', 's_grouping', 'd_grouping']
for src, dst, views, sg, dg in ex_data_df[edge_columns].itertuples(index=False, name=None):
    # Each node is sized by 1 - its OWN exit rate; previously the source node was given the
    # destination row's rate. Adding an already-known node id again is expected to be a no-op.
    for page, group in ((dst, dg), (src, sg)):
        node_options = {'title': page, 'color': color_dict.get(group), 'group': group}
        if page in exit_rate_by_page:
            node_options['value'] = 1 - float(exit_rate_by_page[page])
        ex_net.add_node(page, page, **node_options)

    w = min(float(views) / max_width_int, 1.0)
    ex_net.add_edge(src, dst, value=w, width=w)

# add data to node hover label
for node in ex_net.nodes:
    exit_rate = exit_rate_by_page.get(node['id'])
    if exit_rate is not None:
        node['title'] += '<br>Exit rate: ' + str(round(float(exit_rate) * 100, 2)) + '%'
    node['title'] += '<br> Top Origins:'

    # analytics_df is ordered by Pageviews, so the first five matches are the top origins.
    top_origins = analytics_df[analytics_df['PagePath'] == node['id']].head(5)
    for origin, origin_views in zip(top_origins['PreviousPagePath'], top_origins['Pageviews']):
        node['title'] += '<br>' + origin + ': ' + '{:,}'.format(origin_views)

#ex_net.show_buttons(filter_=True)
ex_net.show('example.html')