### Function that takes iterates over all files and produces a concatenated dataframe
#### Data produced will be stored in DataFrame named as 'finalDataFrame' that will persist across Notebooks

In [118]:
from oauth2client.service_account import ServiceAccountCredentials
import gspread
import pandas as pd
import time # workaround to get past the issue of Google Sheets API timing out

In [119]:
#retrieve filesInDriveDict
%store -r filesInDriveDict

# Configure the connection 
scopes = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive.readonly']
key_file_location = 'gsheetsprivkey\serviceaccount.json'

# Give the path to the Service Account Credential json file 
credentials = ServiceAccountCredentials.from_json_keyfile_name(key_file_location,scopes)

# Authorise your Jupyter Notebook to connect to Google Drive API using private key credentials in 'credentials'
gc = gspread.authorize(credentials)

In [120]:
#List comprehension to pull up all the ids from the filesInDriveDict
fileKeyIDs = [val['id'] for key,val in filesInDriveDict.items()]

#initialize a list with one value , viz number of files in fileKeyIDs
fileIDChunkList = [len(fileKeyIDs)]

# breaking 27 into [7,10,10]
while fileIDChunkList[0] > 10:
    fileIDChunkList.append(10)
    fileIDChunkList[0] = fileIDChunkList[0]-10

In [121]:
#Helper functions

# Note: this function is applied in context of each season
# Add points for each team before the game starts
def addPointsBeforeGame(pd_data):
    #unique teams
    uniqueTeams = pd_data['HomeTeam'].unique()

    #build dict with all teams in that season; initialize to zero for each
    teamPoints = {uniqueTeams[i]:0 for i in range(0,len(uniqueTeams))};
    
    # Adding new feature: Points before each game starts
    # PBGH = PointsBeforeGameHomeTeam in that season
    # PBGA = PointsBeforeGameAwaayTeam in that season
    pd_data['PBGH'] = 0
    pd_data['PBGA'] = 0

    for index,row in pd_data.iterrows():
        #set points before start of game into DataFrame
        pd_data.loc[index,'PBGH'] = teamPoints[row['HomeTeam']]
        pd_data.loc[index,'PBGA'] = teamPoints[row['AwayTeam']]
        #check who won and who lost
        if (row['FTR']=='D'):
            teamPoints[row['HomeTeam']] += 1
            teamPoints[row['AwayTeam']] += 1
        elif (row['FTR']=='H'):
            teamPoints[row['HomeTeam']] += 3
        elif (row['FTR']=='A'):
            teamPoints[row['AwayTeam']] += 3
    return 

# Note: this function is applied in context of all seasons
def getWinLossDrawStreak(finalData):
    
    #across all seasons thus far
    uniqueTeams = finalData['HomeTeam'].unique()
    
    #build dict with ALL teams; initialize to ----- for each
    teamStreak = {uniqueTeams[i]:'-----' for i in range(0,len(uniqueTeams))}; 
    
    # Adding new feature: Win streak before game
    # HomeStreak = Streak of home team before the game
    # AwayStreak = Streak of away team before the game
    # initialize to null initial value
    finalData['HomeStreak'] = ''
    finalData['AwayStreak'] = ''
    
    for index,row in finalData.iterrows():
        
        #Populate streak data into columns before game outcome is counted
        finalData.loc[index,'HomeStreak'] = teamStreak[row['HomeTeam']];
        finalData.loc[index,'AwayStreak'] = teamStreak[row['AwayTeam']];
        
        #check who won and who lost
        if (row['FTR']=='D'):
            teamStreak[row['HomeTeam']] = 'D' + teamStreak[row['HomeTeam']]
            teamStreak[row['AwayTeam']] = 'D' + teamStreak[row['AwayTeam']]
        elif (row['FTR']=='H'):
            teamStreak[row['HomeTeam']] = 'W' + teamStreak[row['HomeTeam']]
            teamStreak[row['AwayTeam']] = 'L' + teamStreak[row['AwayTeam']]
        elif (row['FTR']=='A'):
            teamStreak[row['HomeTeam']] = 'L' + teamStreak[row['HomeTeam']]
            teamStreak[row['AwayTeam']] = 'W' + teamStreak[row['AwayTeam']]
        
        #snipping off last character since above if block added a result to start
        teamStreak[row['HomeTeam']] = teamStreak[row['HomeTeam']][0:-1]
        teamStreak[row['AwayTeam']] = teamStreak[row['AwayTeam']][0:-1]

    return 



In [134]:
#initialize new DataFrame
finalDataFrame = pd.DataFrame()

#variables used to split fileKeyIDs. Using this method to prevent Google API rejections
start = end = 0

# Expected to take over 100 seconds per 10 files due to time.sleep(). Expect 30 files to take 300 seconds (i.e. 5 mins)
for item in fileIDChunkList:
    end = start + item
    for keyID in fileKeyIDs[start:end]:
        #spreadsheet_key is the internal key stored in Google drive for that file
        tempWorkbook = gc.open_by_key(keyID)
        #By default load 1st sheet of the workbook , i.e. index = 0
        tempSheet = tempWorkbook.get_worksheet(0)
        #get_all_values returns list of lists with first list as column headers
        tempValues = tempSheet.get_all_values()
        # Pulling the data and transform it to the data frame .1st row is header , remaining are actual values
        temp_pd_data = pd.DataFrame(tempValues[1:], columns = tempValues[0])
        #converting string format dates to datetime object
        temp_pd_data['Date'] = pd.to_datetime(temp_pd_data['Date'])
        addPointsBeforeGame(temp_pd_data)
        #iteratively concatenating
        finalDataFrame = pd.concat([finalDataFrame,temp_pd_data])
    start += item;
    #if not on the last item (since 'end' is equal to length of fileKeyIDs on last step), proceed to pause for 100 secs
    if end < len(fileKeyIDs):
        time.sleep(100)

In [136]:
#sort rows by date ascending
finalDataFrame.sort_values(by='Date',inplace=True)

#numbered index with unique numbers
finalDataFrame['newIndex'] = range(0,len(finalDataFrame.index))
finalDataFrame.set_index(['newIndex'],inplace=True)



In [137]:
getWinLossDrawStreak(finalDataFrame)

In [138]:
%store finalDataFrame


Stored 'finalDataFrame' (DataFrame)


In [139]:
finalDataFrame

Unnamed: 0_level_0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AHW,HO,AO,HBP,ABP,SYH,SYD,SYA,HomeStreak,AwayStreak
newIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,E0,1993-01-09,,Sheffield Weds,Norwich,3,3,D,,,...,,,,,,,,,-----,-----
1,E0,1993-01-09,,Blackburn,Arsenal,1,1,D,,,...,,,,,,,,,-----,-----
2,E0,1993-01-09,,Coventry,Liverpool,1,0,H,,,...,,,,,,,,,-----,-----
3,E0,1993-01-09,,Man United,West Ham,3,0,H,,,...,,,,,,,,,-----,-----
4,E0,1993-01-09,,QPR,Sheffield United,2,1,H,,,...,,,,,,,,,-----,-----
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10327,E0,2020-11-01,15:00,Wolves,Newcastle,1,1,D,1,1,...,,,,,,,,,DWDLW,WDLLD
10328,E0,2020-11-01,17:30,Tottenham,Liverpool,0,1,A,0,1,...,,,,,,,,,DLWWW,WLWWW
10329,E0,2020-11-01,15:00,Chelsea,Burnley,3,0,H,2,0,...,,,,,,,,,WDWLD,DDWWD
10330,E0,2020-12-01,14:00,Bournemouth,Watford,0,3,A,0,1,...,,,,,,,,,LLDLW,DLWLL
