# Python script for harvesting Google Analytics for Data Management website

This script harvests Google Analytics data for [our data management website](http://www.bu.edu/datamanagement) so that it can be reviewed.

## Important Resources

This script uses [Stijn Debrouwere's](https://github.com/debrouwere) [Google Analytics Python library](https://github.com/debrouwere/google-analytics/). Be sure to check out [his documentation](https://github.com/debrouwere/google-analytics/wiki).


In [2]:
# import libraries
import googleanalytics as ga
import pandas as pd
import matplotlib.pyplot as plt
import time


In [52]:
# Authenticate against Google Analytics and pick the profile to query.
# NOTE(review): interactive=True / save=True presumably prompt for credentials
# on first use and store them under the given identity -- confirm against the
# google-analytics library documentation.
accounts = ga.authenticate(
    identity='Google Analtics API',
    interactive=True,
    save=True,
)
# query the profile of the first web property of the first account returned
profile = accounts[0].webproperties[0].profile

In [11]:
# our pages: every page path that gets its own pageview report.
# One path per line; split() turns the text block into a list of strings
# (none of the paths contains whitespace).
pages = """
    /datamanagement/
    /datamanagement/background/
    /datamanagement/background/whatisdata/
    /datamanagement/background/importance/
    /datamanagement/background/data-life-cycle/
    /datamanagement/background/cite/
    /datamanagement/background/relevant-background-reading/
    /datamanagement/outline/
    /datamanagement/outline/funding-agencies/
    /datamanagement/outline/elements/
    /datamanagement/outline/elements/organize/
    /datamanagement/outline/elements/tracking-changes/
    /datamanagement/outline/elements/storage/
    /datamanagement/outline/elements/metadata/
    /datamanagement/outline/elements/access/
    /datamanagement/outline/elements/data-repositories/
    /datamanagement/outline/building-a-data-curation-profile/
    /datamanagement/outline/web-resources-and-tutorials/
    /datamanagement/resources/
    /datamanagement/resources/ist-services/
    /datamanagement/resources/library-services/
    /datamanagement/resources/archiving-services/
    /datamanagement/resources/templates/
    /datamanagement/resources/issp/
    /datamanagement/resources/discipline-specific-data-management-articles/
    /datamanagement/calendar/
    /datamanagement/ask-a-question/
    /datamanagement/research-data-management/
""".split()

In [12]:
# calendar years to harvest, kept as strings so they can be glued onto
# "-MM-DD" suffixes when building query dates
years = list(map(str, range(2013, 2016)))

In [13]:
# column labels used when creating the empty dataframes that the
# query results are collected into
columns = [
    "date",
    "pageviews",
]

In [239]:
#
# Let's get our Google Analytics 
#

# Harvest daily pageviews for every page in `pages`, one CSV per page.
#
# BU changed its Google Analytics reporting after 2015-05-31: up to that date
# a page is queried as "<page>index.html", from 2015-06-01 on as the bare
# "<page>" path.  2015 therefore needs two queries with different filters.
for p in pages:
    url = p + "index.html"  # page path as reported up to 2015-05-31
    # NOTE(review): `url` is handed to a regex filter (pagepath__re), so the
    # "." in "index.html" presumably matches any character -- confirm that
    # this cannot pick up unintended paths.

    # Gather every query result and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on each call; starting from an empty frame keeps the header row even
    # when no query returns data.
    frames = [pd.DataFrame(columns=columns)]

    for y in years:
        start = y + "-01-01"
        # 2015 stops at the reporting change; other years run to year end
        end = y + ("-05-31" if y == "2015" else "-12-31")

        # run google analytics query for the old-style "index.html" path
        frames.append(
            profile.core.query
            .filter(pagepath__re=url)
            .metrics("ga:pageviews")
            .daily(start, end)
            .as_dataframe()
        )
        time.sleep(1)  # take a 1 second break between API calls

        if y == "2015":
            # second query for 2015-06-01 forward, matching the bare path
            frames.append(
                profile.core.query
                .filter(pagepath__eq=p)
                .metrics("ga:pageviews")
                .daily("2015-06-01", "2015-11-30")
                .as_dataframe()
            )
            time.sleep(1)  # this query previously ran without a pause

        # print which pages are done to help track progress
        # (single-argument call form works on Python 2 and Python 3)
        print(p + " " + y + " done!")

    # save data as csv in data-out folder, e.g. "datamanagementoutline.csv"
    df_ = pd.concat(frames)
    df_.to_csv("data-out/" + p.replace("/", "") + ".csv")

/datamanagement/ 2013 done!
/datamanagement/ 2014 done!
/datamanagement/ 2015 done!
/datamanagement/background/ 2013 done!
/datamanagement/background/ 2014 done!
/datamanagement/background/ 2015 done!
/datamanagement/background/whatisdata/ 2013 done!
/datamanagement/background/whatisdata/ 2014 done!
/datamanagement/background/whatisdata/ 2015 done!
/datamanagement/background/importance/ 2013 done!
/datamanagement/background/importance/ 2014 done!
/datamanagement/background/importance/ 2015 done!
/datamanagement/background/data-life-cycle/ 2013 done!
/datamanagement/background/data-life-cycle/ 2014 done!
/datamanagement/background/data-life-cycle/ 2015 done!
/datamanagement/background/cite/ 2013 done!
/datamanagement/background/cite/ 2014 done!
/datamanagement/background/cite/ 2015 done!
/datamanagement/background/relevant-background-reading/ 2013 done!
/datamanagement/background/relevant-background-reading/ 2014 done!
/datamanagement/background/relevant-background-reading/ 2015 done!
/

In [17]:
#
# Grab site wide analytics
#

# Site-wide daily pageviews: every page path containing "/datamanagement/".
url = "/datamanagement/"  # substring matched against the page path

# Gather every query result and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# each call; starting from an empty frame keeps the header row even when no
# query returns data.
site_frames = [pd.DataFrame(columns=columns)]

for y in years:
    start = y + "-01-01"
    # 2015 is still fetched in two pieces (through 05-31, then 06-01 forward)
    # even though both use the same filter, so the rows come back in the same
    # chunks as in earlier harvests.
    end = y + ("-05-31" if y == "2015" else "-12-31")

    # run google analytics query and keep the resulting dataframe
    site_frames.append(
        profile.core.query
        .filter(pagepath__contains=url)
        .metrics("ga:pageviews")
        .daily(start, end)
        .as_dataframe()
    )
    time.sleep(1)  # take a 1 second break between API calls

    if y == "2015":
        # second query to get data from 2015-06-01 forward
        site_frames.append(
            profile.core.query
            .filter(pagepath__contains=url)
            .metrics("ga:pageviews")
            .daily("2015-06-01", "2015-11-30")
            .as_dataframe()
        )
        time.sleep(1)  # this query previously ran without a pause

    # print which years are done to help track progress
    # (single-argument call form works on Python 2 and Python 3)
    print(y + " done!")

# save data as csv in data-out folder
site_df = pd.concat(site_frames)
site_df.to_csv("data-out/site.csv")


2013 done!
2014 done!
2015 done!


# Testing 

Below are some tests I have been running to gather other information.

In [56]:
#
# Testing 
#

# Experimental query: several daily metrics for the "/datamanagement/" page
# (exact path match) from 2015-06-01 through 2015-11-30.
t_q = (
    profile.core.query
    .filter(pagepath__eq="/datamanagement/")
    .metrics(
        "ga:bounceRate",
        "ga:entrances",
        "ga:exitRate",
        "ga:avgTimeOnPage",
        "ga:pageviews",
    )
    .daily("2015-06-01", "2015-11-30")
    .as_dataframe()
)


In [54]:
# preview the first five rows of the test query result
t_q.head(5)

Unnamed: 0,avg_time_on_page,bounce_rate,date,entrances,exit_rate
0,0.0,100,2015-06-01,2,100.0
1,0.0,0,2015-06-02,0,0.0
2,0.0,100,2015-06-03,1,100.0
3,24.0,0,2015-06-04,1,0.0
4,152.333333,50,2015-06-05,4,33.333333


In [45]:
# derived column: average time on page divided by 60
# (presumably seconds -> minutes, going by the column name -- TODO confirm)
t_q["avg_time_on_page_min"] = t_q["avg_time_on_page"] / 60

In [57]:
# Save the test metrics, then preview them.  The preview has to be the cell's
# last expression: a bare `t_q.head()` placed before another statement is
# evaluated and discarded without being displayed.
t_q.to_csv("t_q.csv")
t_q.head()