### Iterate over all files and produce a single concatenated DataFrame
#### The result is stored in a DataFrame named 'finalDataFrame', which is persisted so that other notebooks can load it

In [167]:
from oauth2client.service_account import ServiceAccountCredentials
import gspread
import pandas as pd
import time # workaround to get past the issue of Google Sheets API timing out

In [168]:
#retrieve filesInDriveDict
%store -r filesInDriveDict

# Configure the connection 
scopes = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive.readonly']
key_file_location = 'gsheetsprivkey\serviceaccount.json'

# Give the path to the Service Account Credential json file 
credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_location,scopes)

# Authorise your Jupyter Notebook to connect to Google Drive API using private key credentials in 'credentials'
gc = gspread.authorize(credentials)

In [169]:
# Collect the 'id' of every entry in filesInDriveDict, in the dict's own order
fileKeyIDs = [entry['id'] for entry in filesInDriveDict.values()]

# Split the file count into batch sizes of at most 10, with the remainder first,
# e.g. 27 -> [7, 10, 10], 20 -> [10, 10], 0 -> [0]
totalFiles = len(fileKeyIDs)
# number of trailing batches of exactly 10 (zero when everything fits in one batch)
fullBatches = max(0, (totalFiles - 1) // 10)
fileIDChunkList = [totalFiles - 10 * fullBatches] + [10] * fullBatches

In [170]:
#Helper functions

# Note: this function is applied in context of each season
# Add points for each team before the game starts
def addPointsBeforeGame(pd_data):
    """Add each side's points tally going into every game of one season.

    Walks the rows of ``pd_data`` in their existing order (the function does
    not sort) and writes two new columns in place:

    * ``PBGH`` - points the home team had before the game started
    * ``PBGA`` - points the away team had before the game started

    A draw adds 1 point to both teams and a win adds 3 points to the winner.
    The result in ``FTR`` may be coded as ``'H'``/``'D'``/``'A'`` or as
    ``'Home'``/``'Draw'``/``'Away'``; any other value leaves the tallies
    unchanged.

    Parameters
    ----------
    pd_data : pandas.DataFrame
        Must contain ``HomeTeam``, ``AwayTeam`` and ``FTR`` columns.
        It is modified in place.

    Returns
    -------
    None
    """
    # Accept both the short and the long spelling of the full-time result
    resultCodes = {'H': 'H', 'Home': 'H', 'D': 'D', 'Draw': 'D', 'A': 'A', 'Away': 'A'}

    # Every team starts on zero points. Away sides are included as well so that a
    # team which never appears in HomeTeam does not raise a KeyError below.
    teamPoints = dict.fromkeys(set(pd_data['HomeTeam']) | set(pd_data['AwayTeam']), 0)

    # Points before each game, collected row by row and assigned as whole columns
    pointsBeforeHome = []
    pointsBeforeAway = []

    for home, away, result in zip(pd_data['HomeTeam'], pd_data['AwayTeam'], pd_data['FTR']):
        # record the standings before this game's outcome is counted
        pointsBeforeHome.append(teamPoints[home])
        pointsBeforeAway.append(teamPoints[away])

        outcome = resultCodes.get(result)
        if outcome == 'D':
            teamPoints[home] += 1
            teamPoints[away] += 1
        elif outcome == 'H':
            teamPoints[home] += 3
        elif outcome == 'A':
            teamPoints[away] += 3

    # PBGH = PointsBeforeGameHomeTeam, PBGA = PointsBeforeGameAwayTeam
    pd_data['PBGH'] = pointsBeforeHome
    pd_data['PBGA'] = pointsBeforeAway
    return

# Note: this function is applied in context of all seasons
def populateCumulativeGoalsAndStreak(finalData):
    """Add pre-game form and cumulative goal columns across all rows.

    Walks the rows of ``finalData`` in their existing order (the function does
    not sort) and writes six new columns in place, each holding the value as it
    stood before the row's own game was counted:

    * ``HomeStreak`` / ``AwayStreak`` - four-character string of the team's
      latest results, newest first (``W``, ``D``, ``L``; ``-`` = no game yet)
    * ``HTGS`` / ``HTGC`` - goals scored / conceded so far by the home team
    * ``ATGS`` / ``ATGC`` - goals scored / conceded so far by the away team

    Tallies run over the whole frame; nothing is reset between seasons.
    The result in ``FTR`` may be coded as ``'H'``/``'D'``/``'A'`` or as
    ``'Home'``/``'Draw'``/``'Away'``; any other value leaves both streaks
    untouched (goals are still accumulated).

    Parameters
    ----------
    finalData : pandas.DataFrame
        Must contain ``HomeTeam``, ``AwayTeam``, ``FTHG``, ``FTAG`` and ``FTR``.
        ``FTHG`` / ``FTAG`` must be convertible with ``int()``.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a goals value cannot be converted with ``int()``.
    """
    # Accept both the short and the long spelling of the full-time result
    resultCodes = {'H': 'H', 'Home': 'H', 'D': 'D', 'Draw': 'D', 'A': 'A', 'Away': 'A'}
    # normalised result -> (letter for the home team, letter for the away team)
    streakLetters = {'H': ('W', 'L'), 'D': ('D', 'D'), 'A': ('L', 'W')}

    # Running state for ALL teams. Away sides are included as well so that a team
    # which never appears in HomeTeam does not raise a KeyError below.
    allTeams = set(finalData['HomeTeam']) | set(finalData['AwayTeam'])
    teamStreak = {team: {'GS': 0, 'GC': 0, 'Streak': '----'} for team in allTeams}

    # Values for the new columns, collected row by row and assigned once at the end
    newColumns = {name: [] for name in ('HomeStreak', 'AwayStreak', 'HTGS', 'HTGC', 'ATGS', 'ATGC')}

    gameRows = zip(finalData['HomeTeam'], finalData['AwayTeam'],
                   finalData['FTHG'], finalData['FTAG'], finalData['FTR'])
    for home, away, homeGoals, awayGoals, result in gameRows:
        homeState = teamStreak[home]
        awayState = teamStreak[away]

        # Record the state before this game's outcome is counted
        newColumns['HomeStreak'].append(homeState['Streak'])
        newColumns['AwayStreak'].append(awayState['Streak'])
        newColumns['HTGS'].append(homeState['GS'])
        newColumns['HTGC'].append(homeState['GC'])
        newColumns['ATGS'].append(awayState['GS'])
        newColumns['ATGC'].append(awayState['GC'])

        # Goals scored by one side are goals conceded by the other
        homeGoals = int(homeGoals)
        awayGoals = int(awayGoals)
        homeState['GS'] += homeGoals
        awayState['GC'] += homeGoals
        awayState['GS'] += awayGoals
        homeState['GC'] += awayGoals

        outcome = resultCodes.get(result)
        if outcome is not None:
            homeLetter, awayLetter = streakLetters[outcome]
            # Prepend the newest result and drop the oldest so the streak keeps its
            # length. This is done only for a recognised result; trimming
            # unconditionally would erode the streak whenever FTR is not recognised.
            homeState['Streak'] = (homeLetter + homeState['Streak'])[0:-1]
            awayState['Streak'] = (awayLetter + awayState['Streak'])[0:-1]

    for name, values in newColumns.items():
        finalData[name] = values

    return



In [171]:
# Start from an empty frame so a failed run never leaves a stale finalDataFrame behind
finalDataFrame = pd.DataFrame()

# One DataFrame per spreadsheet; they are concatenated once after the loop instead of
# re-copying the accumulated rows on every iteration
seasonFrames = []

# variables used to split fileKeyIDs. Using this method to prevent Google API rejections
start = end = 0

# Expected to take over 100 seconds per 10 files due to time.sleep(). Expect 30 files to take 300 seconds (i.e. 5 mins)
for item in fileIDChunkList:
    end = start + item
    for keyID in fileKeyIDs[start:end]:
        # keyID is the internal key stored in Google Drive for that file
        tempWorkbook = gc.open_by_key(keyID)
        # By default load 1st sheet of the workbook, i.e. index = 0
        tempSheet = tempWorkbook.get_worksheet(0)
        # get_all_values returns list of lists with first list as column headers
        tempValues = tempSheet.get_all_values()
        # 1st row is the header, remaining rows are the actual values
        temp_pd_data = pd.DataFrame(tempValues[1:], columns = tempValues[0])
        # converting string format dates to datetime objects
        # NOTE(review): the date format is inferred by pandas; if the sheets use
        # day-first dates this may need dayfirst=True - confirm against the data
        temp_pd_data['Date'] = pd.to_datetime(temp_pd_data['Date'])
        # adds the PBGH / PBGA columns in place, per file
        addPointsBeforeGame(temp_pd_data)
        seasonFrames.append(temp_pd_data)
    start += item
    # if not on the last item (since 'end' is equal to length of fileKeyIDs on last step), proceed to pause for 100 secs
    if end < len(fileKeyIDs):
        time.sleep(100)

# pd.concat raises on an empty list, so keep the empty frame when nothing was downloaded
if seasonFrames:
    finalDataFrame = pd.concat(seasonFrames)

In [172]:
# Order the games by date (ascending), then replace the index with a
# unique running number 0..n-1 held in an index named 'newIndex'
finalDataFrame = (
    finalDataFrame
    .sort_values(by='Date')
    .assign(newIndex=lambda frame: range(0, len(frame.index)))
    .set_index(['newIndex'])
)



In [173]:
# Add the pre-game streak and cumulative goal columns to finalDataFrame in place
# (the function returns nothing, so there is no result to assign)
populateCumulativeGoalsAndStreak(finalDataFrame)

In [174]:
# Persist finalDataFrame with IPython's %store magic so other notebooks can restore it via `%store -r finalDataFrame`
%store finalDataFrame


Stored 'finalDataFrame' (DataFrame)
