## This notebook contains code to pull up a list of files from Google Drive
####            a)  Pull up the list of documents in the Google Drive account mapped to the service account identified by the private key referenced below
####            b)  Pull up the contents of a Google Sheet and render them into a DataFrame
####            c)  Display the data and store it with the %store magic function so it can be accessed across notebooks


In [84]:
# Import libraries 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from apiclient.discovery import build #Util for API calls
import gspread # For connection to google sheets
from oauth2client.service_account import ServiceAccountCredentials
from df2gspread import df2gspread as d2g # d2g will be used once ready to upload data back to sheets

In [96]:
# Configure the connection
scopes = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive.readonly']
# Use a forward slash as the separator: it is accepted on Windows as well as
# POSIX systems, and avoids a backslash followed by 's', which is not a valid
# string escape sequence.
key_file_location = 'gsheetsprivkey/serviceaccount.json'

# Load the service account credentials from the JSON key file for the scopes above
credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_location, scopes)

# Authorise gspread with the private key credentials held in 'credentials'
gc = gspread.authorize(credentials)


# Function to be able to get service for google drive
def get_service(api_name, api_version, scopes=scopes, key_file_location=key_file_location):
    """Get a service that communicates to a Google API.

    Args:
        api_name: The name of the api to connect to.
        api_version: The api version to connect to.
        scopes: A list auth scopes to authorize for the application.
        key_file_location: The path to a valid service account JSON key file.

    Returns:
        A service that is connected to the specified API.
    """

    # Build the credentials from the arguments actually passed in, so that a
    # caller overriding `scopes` or `key_file_location` gets a service
    # authorised with those values rather than with the notebook-level
    # `credentials` object.
    service_credentials = ServiceAccountCredentials.from_json_keyfile_name(
        key_file_location, scopes=scopes)

    # Build the service object.
    service = build(api_name, api_version, credentials=service_credentials)

    return service

# Retrieve list of files in Google Drive
# Interested only in spreadsheets, hence using query as arg option within list function.
# files().list() returns results one page at a time; a response carries a
# 'nextPageToken' while more pages remain, so keep requesting until it is absent.
# Reading only the first response would silently miss files on later pages.
drive_files_api = get_service("drive", "v3").files()
filesInDrive = []
page_token = None
while True:
    itemsInDrive = drive_files_api.list(
        q="mimeType='application/vnd.google-apps.spreadsheet'",
        pageToken=page_token,  # None on the first request
    ).execute()

    # The 'files' key has the details of the files on this page.
    filesInDrive.extend(itemsInDrive.get('files', []))

    page_token = itemsInDrive.get('nextPageToken')
    if not page_token:
        break



In [92]:
# Re-key the listing by file name: each entry of filesInDrive becomes
# filesInDriveDict[<name>] = {all remaining fields of that entry}.
# Looking a file up by its name is easier to read than scanning the list.
filesInDriveDict = {}

for file in filesInDrive:
    details = dict(file)               # copy, so the listing entry itself is left untouched
    drive_name = details.pop('name')   # the name becomes the key, not part of the value
    filesInDriveDict[drive_name] = details
    
# Optional: persist the dict with %store if it needs to be shared across notebooks.
# In my case the filesInDriveDict listing of the drive had to be available in other notebooks.
%store filesInDriveDict

Stored 'filesInDriveDict' (dict)


In [91]:
# Helper function that retrieves the id from Google Drive to pass to Google Sheets API
# season naming format is EPL_YYYY_YY , E.g. EPL_2004_05

def get_fileID_from_name(seasonName, filesInDriveDict):
    """Return the Google Drive file id stored for the given file name.

    Args:
        seasonName: Name of the file to look up (a key of filesInDriveDict).
        filesInDriveDict: Dict keyed by file name whose values are dicts
            containing an 'id' entry.

    Returns:
        The value stored under 'id' for that file name.

    Raises:
        ValueError: If seasonName is not a key of filesInDriveDict. The
            message includes the offending name to make the failure easy
            to diagnose.
    """
    if seasonName not in filesInDriveDict:
        # ValueError is still an Exception, so existing `except Exception`
        # handlers keep working; the original message is kept as a prefix.
        raise ValueError(
            "File name passed to get_fileID_from_name is invalid: {!r}".format(seasonName)
        )
    return filesInDriveDict[seasonName]['id']


In [90]:
# Resolve the file name to the id Google Drive stores for that file
spreadsheet_key = get_fileID_from_name("EPL_2015_16", filesInDriveDict)

# Open the workbook by that id and take its first worksheet (index 0)
workbook = gc.open_by_key(spreadsheet_key)
sheet = workbook.get_worksheet(0)

# get_all_values returns a list of lists: the first list holds the column
# headers, the remaining lists are the data rows
values = sheet.get_all_values()
header_row = values[0]
data_rows = values[1:]

# Turn the rows into a DataFrame using the first row as the column names
pd_data = pd.DataFrame(data_rows, columns=header_row)
pd_data



Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,E0,08/08/15,Bournemouth,Aston Villa,0,1,A,0,0,D,...,1.79,26,-0.5,1.98,1.93,1.99,1.92,1.82,3.88,4.7
1,E0,08/08/15,Chelsea,Swansea,2,2,D,2,1,H,...,1.99,27,-1.5,2.24,2.16,1.8,1.73,1.37,5.04,10.88
2,E0,08/08/15,Everton,Watford,2,2,D,0,1,A,...,1.96,26,-1,2.28,2.18,1.76,1.71,1.75,3.76,5.44
3,E0,08/08/15,Leicester,Sunderland,4,2,H,3,0,H,...,1.67,26,-0.5,2,1.95,1.96,1.9,1.79,3.74,5.1
4,E0,08/08/15,Man United,Tottenham,1,0,H,1,0,H,...,2.01,26,-1,2.2,2.09,1.82,1.78,1.64,4.07,6.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,E0,15/05/16,Stoke,West Ham,2,1,H,0,1,A,...,2.27,30,0.25,2.11,2.03,1.87,1.83,3.05,4,2.26
376,E0,15/05/16,Swansea,Man City,1,1,D,1,1,D,...,2.45,31,1,2.14,2.05,1.85,1.81,7.05,5,1.47
377,E0,15/05/16,Watford,Sunderland,2,2,D,0,1,A,...,2.1,29,-0.5,2.06,2.01,1.91,1.85,1.64,4.52,5.27
378,E0,15/05/16,West Brom,Liverpool,1,1,D,1,1,D,...,2.09,30,-0.25,2.1,2.05,1.86,1.81,2.61,3.75,2.7


In [93]:
#Optional, using magic function if you want to pull up data via one call to the API and persist across notebooks

%store pd_data

Stored 'pd_data' (DataFrame)
