### Function that iterates over all files and produces a concatenated DataFrame
#### The data produced is stored in a DataFrame named 'finalDataFrame', which persists across notebooks

In [90]:
from oauth2client.service_account import ServiceAccountCredentials
import gspread
import pandas as pd
import time # workaround to get past the issue of Google Sheets API timing out

In [91]:
#retrieve filesInDriveDict
%store -r filesInDriveDict

# Configure the connection 
scopes = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive.readonly']
key_file_location = 'gsheetsprivkey\serviceaccount.json'

# Give the path to the Service Account Credential json file 
credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_location,scopes)

# Authorise your Jupyter Notebook to connect to Google Drive API using private key credentials in 'credentials'
gc = gspread.authorize(credentials)

In [92]:
# Collect the 'id' field of every entry in filesInDriveDict (the dict keys are not needed)
fileKeyIDs = [entry['id'] for entry in filesInDriveDict.values()]

# Split the file count into chunk sizes of at most 10, remainder first,
# e.g. 27 -> [7, 10, 10], 20 -> [10, 10], 5 -> [5], 0 -> [0]
totalFileCount = len(fileKeyIDs)

# Number of full chunks of 10 that follow the leading chunk
trailingChunkCount = max(0, (totalFileCount - 1) // 10)

# Leading chunk holds whatever is left after removing the trailing chunks of 10
fileIDChunkList = [totalFileCount - 10 * trailingChunkCount] + [10] * trailingChunkCount

In [93]:
# Download the first worksheet of every file listed in fileKeyIDs and stack
# them into a single DataFrame, 'finalDataFrame'.

# Seconds to pause between chunks (workaround for the Google Sheets API timing out)
PAUSE_SECONDS = 100

# One DataFrame per workbook. They are concatenated once after the loop instead of
# calling pd.concat on every iteration, which re-copies all rows gathered so far.
sheetFrames = []

# variables used to slice fileKeyIDs one chunk at a time
start = end = 0

# Expected to take over 100 seconds per chunk due to time.sleep().
# Expect 30 files (3 chunks of 10) to take roughly 300 seconds (i.e. 5 mins)
for item in fileIDChunkList:
    end = start + item
    for keyID in fileKeyIDs[start:end]:
        # keyID is the internal key stored in Google Drive for that file
        tempWorkbook = gc.open_by_key(keyID)
        # By default load the 1st sheet of the workbook, i.e. index = 0
        tempSheet = tempWorkbook.get_worksheet(0)
        # get_all_values returns a list of lists with the first list as column headers
        tempValues = tempSheet.get_all_values()
        # A sheet that yields no rows at all has no header row to use; skip it
        # rather than failing with IndexError on tempValues[0]
        if not tempValues:
            continue
        # 1st row is the header, the remaining rows are the actual values
        temp_pd_data = pd.DataFrame(tempValues[1:], columns=tempValues[0])
        sheetFrames.append(temp_pd_data)
    start = end
    # if not on the last chunk (on the last one 'end' equals len(fileKeyIDs)), pause before continuing
    if end < len(fileKeyIDs):
        time.sleep(PAUSE_SECONDS)

# pd.concat raises on an empty list, so fall back to an empty DataFrame when nothing was loaded.
# Row indices are left as produced by each sheet (not reset), matching the previous behaviour.
finalDataFrame = pd.concat(sheetFrames) if sheetFrames else pd.DataFrame()

In [94]:
# Persist finalDataFrame in IPython's %store storage so that other notebooks
# can restore it with `%store -r finalDataFrame`
%store finalDataFrame

Stored 'finalDataFrame' (DataFrame)
