### Version Summary

 Version# 3
 
 Date: 2021-04-23
     1. Added getMatchSummary fn
     2. Split the 'extras' & 'total' attributes into atomic fields
     3. Added getFilesList fn
     
 Exceptions:
     1. teamWon field will not have correct data in case of super over. Will have to populate the data manually
 
------------------------------------------------------------
 Version# v2
 
 Date: 2021-04-22
    1. Added did not bat section in getBattingScoreCard fn
    2. casting int fields to int
    3. Added Fall of Wickets

In [33]:
# Directory holding the downloaded match-centre HTML pages (relative path).
SOURCE_DIR = '..//HTML//'
# Directory receiving one output text file per processed match (relative path).
TARGET_DIR = '..//OutputFIles//'
# Sample match page used by the single-file cells of this notebook.
FNAME = '13_Delhi Capitals v Mumbai Indians _ Match 13, IPL 2021 Match Centre _ IPLT20.com.html'
# Ledger of already-processed source files; it is looked up inside the source directory.
PROCESSED_FILES = 'processed_files.txt'

# Full team name (as used in the source file names) -> short code used in output file names.
TEAMS = {
    'Chennai Super Kings': 'CSK',
    'Delhi Capitals': 'DC',
    'Kolkata Knight Riders': 'KKR',
    'Mumbai Indians': 'MI',
    'Punjab Kings': 'PK',
    'Rajasthan Royals': 'RR',
    'Royal Challengers Bangalore': 'RCB',
    'Sunrisers Hyderabad': 'SRH',
}

In [2]:
import os
import requests
from lxml import html

In [32]:
def getFilesList(sourceFilePath):
    """
        Get the sorted list of HTML files which are not yet processed
        reference: processed_files.txt (looked up inside sourceFilePath)

        param: sourceFilePath directory holding the HTML files and the ledger
        returns: names (not full paths) of the '.html' files in sourceFilePath
                 that are not listed in the ledger, in sorted order
    """
    ledgerPath = os.path.join(sourceFilePath, PROCESSED_FILES)
    try:
        # One file name per line; a set gives O(1) membership tests below.
        with open(ledgerPath) as ledger:
            processedFiles = {line.strip() for line in ledger}
    except FileNotFoundError:
        # No ledger yet (e.g. very first run): nothing has been processed.
        processedFiles = set()

    return [
        name
        for name in sorted(os.listdir(sourceFilePath))
        if name.endswith('.html') and name not in processedFiles
    ]

In [36]:
def processFiles(sourceFilePath, targetFilePath):
    """
        Parse every not-yet-processed match file and write its summary out

        For each file returned by getFilesList, the match summary is appended
        as one JSON line to
            IPL_2021_Match_<match number>_<team code>_<team code>.txt
        inside targetFilePath, and the source file name is then appended to
        the processed-files ledger inside sourceFilePath.

        param: sourceFilePath directory holding the HTML files and the ledger
        param: targetFilePath directory receiving the output files
        raises: KeyError when a team name in a file name is not a key of TEAMS
    """
    # Local import so this function does not depend on the notebook's import cell.
    import json

    sourceFiles = getFilesList(sourceFilePath)

    with open(os.path.join(sourceFilePath, PROCESSED_FILES), 'a') as processedFiles:
        for file in sourceFiles:
            # File names look like '<num>_<team A> v <team B> _ <rest>'.
            nameParts = file.split('_')
            # Split on the ' v ' separator rather than on every letter 'v',
            # so a team name that itself contains a 'v' is not cut in half.
            firstTeam, _, secondTeam = nameParts[1].strip().partition(' v ')

            outFileName = 'IPL_2021_Match_' + nameParts[0]
            outFileName += '_' + TEAMS[firstTeam.strip()]
            outFileName += '_' + TEAMS[secondTeam.strip()]
            outFileName += '.txt'

            # Parse before opening the output so a failed parse leaves no empty file behind.
            d = getMatchSummary(sourceFilePath, file)

            with open(os.path.join(targetFilePath, outFileName), 'a') as outputFile:
                # file.write() accepts only str, so the summary dict is serialised first.
                outputFile.write(json.dumps(d))
                outputFile.write('\n')

            # Recorded only after the output was written successfully.
            processedFiles.write(file + '\n')


In [45]:
# Batch run: process the not-yet-processed files of SOURCE_DIR into TARGET_DIR.
processFiles(SOURCE_DIR, TARGET_DIR)

In [27]:
def getMatchSummary(filePath, fileName):
    """
        Builds the summary of one match from its saved HTML page

        param: filePath directory containing the file
        param: fileName name of the HTML file; the text before the first '_'
               must be the match number
        returns: dict with the keys matchNum, team1, team2, tossWonBy,
                 tossDecision, manOfTheMatch, venue, umpires (list), referee,
                 teamWon, firstInnings and secondInnings (the innings values
                 come from getInningsSummary)

        teamWon is the batting side with the higher total. When both totals
        are equal it is left as '' so it can be filled in manually, because
        the winner cannot be derived from the totals alone.
    """
    with open(os.path.join(filePath, fileName)) as f:
        tree = html.fromstring(f.read())

    matchInfo = tree.find_class('matchInfo')
    scoreCard = tree.find_class('teamScorecard')
    infoRows = matchInfo[0][0]

    def infoValue(index):
        # Text after the first ':' of the row; maxsplit=1 keeps values that
        # themselves contain a colon intact.
        return infoRows[index].text_content().split(':', 1)[1].strip()

    # The toss row holds '<team>, <... decision>': team before the comma,
    # decision is the last word after it.
    toss = infoValue(0)

    firstInnings = getInningsSummary(scoreCard[0])
    secondInnings = getInningsSummary(scoreCard[1])
    firstRuns = firstInnings['battingScoreCard']['total']['runs']
    secondRuns = secondInnings['battingScoreCard']['total']['runs']

    if firstRuns > secondRuns:
        teamWon = firstInnings['battingTeamName']
    elif secondRuns > firstRuns:
        teamWon = secondInnings['battingTeamName']
    else:
        teamWon = ''

    d = {}
    d['matchNum'] = int(fileName.split('_')[0])
    d['team1'] = firstInnings['battingTeamName']
    d['team2'] = secondInnings['battingTeamName']
    d['tossWonBy'] = toss.split(',')[0]
    d['tossDecision'] = toss.split(',')[1].split(' ')[-1]
    d['manOfTheMatch'] = infoValue(1)
    d['venue'] = infoValue(2)
    d['umpires'] = infoValue(3).split(', ')
    d['referee'] = infoValue(4)
    d['teamWon'] = teamWon
    d['firstInnings'] = firstInnings
    d['secondInnings'] = secondInnings

    return d

In [46]:
# Parse the sample file FNAME; the trailing `result` expression makes the notebook display the dict.
result = getMatchSummary(SOURCE_DIR, FNAME); result

{'matchNum': 13,
 'team1': 'Mumbai Indians',
 'team2': 'Delhi Capitals',
 'tossWonBy': 'Mumbai Indians',
 'tossDecision': 'bat',
 'manOfTheMatch': 'Amit Mishra',
 'venue': 'M. A. Chidambaram Stadium, Chennai',
 'umpires': ['Chris Gaffaney',
  'C. Shamshuddin',
  'Ulhas Gandhe',
  'Tapan Sharma'],
 'referee': 'V. Narayanankutty',
 'teamWon': 'Delhi Capitals',
 'firstInnings': {'battingTeamName': 'Mumbai Indians',
  'runRate': 6.85,
  'battingScoreCard': {'batsman': [{'battingPos': 1,
     'playerId': 107,
     'playerName': 'Rohit Sharma',
     'dismissal': 'c Steve Smith b Amit Mishra',
     'runs': 44,
     'balls': 30,
     'strikeRate': 146.66,
     'fours': 3,
     'sixes': 3},
    {'battingPos': 2,
     'playerId': 834,
     'playerName': 'Quinton de Kock',
     'dismissal': 'c Rishabh Pant b Marcus Stoinis',
     'runs': 2,
     'balls': 4,
     'strikeRate': 50.0,
     'fours': 0,
     'sixes': 0},
    {'battingPos': 3,
     'playerId': 108,
     'playerName': 'Suryakumar Yadav'

In [40]:
def getInningsSummary(scoreCard):
    """
        gets the innings summary of 1 innings
        param: scoreCard should be the data of 1 innings
        returns: dict with battingTeamName, runRate (float), battingScoreCard,
                 bowlingScoreCard and fallOfWickets
    """
    d = {}
    team = scoreCard.find_class('teamHeader')

    # Drop the trailing word 'Innings' from the header text.
    # str.strip('Innings') is not used because it removes any of the
    # characters I/n/i/g/s from BOTH ends, which would also eat the start of
    # a team name beginning with one of those letters.
    heading = team[0][0].text.strip()
    if heading.endswith('Innings'):
        heading = heading[:-len('Innings')]
    d['battingTeamName'] = heading.strip()

    # Run-rate text is '(<label>: <value>)'; surrounding whitespace is removed
    # first so the parentheses are really at the ends when stripped.
    d['runRate'] = float(team[0][1].text.strip().strip('()').split(':')[1].strip())
    d['battingScoreCard'] = getBattingScoreCard(scoreCard)
    d['bowlingScoreCard'] = getBowlingScoreCard(scoreCard)
    d['fallOfWickets'] = getFallOfWickets(scoreCard)
    return d


In [None]:
# Ad-hoc check of one innings; index 1 is the element getMatchSummary uses for the second innings.
# NOTE(review): relies on a notebook-global `scoreCard` that no visible cell defines - confirm where it is created.
summary = getInningsSummary(scoreCard[1]); summary

In [18]:
def _parseExtras(tableRow):
    """
        Parses the 'extra' row: cell 1 holds '(<type> <n>, <type> <n>, ...)',
        cell 2 the extras total. Returns {<type>: n, ..., 'runs': total}.
    """
    extras = {}
    for part in tableRow[1].text.strip().strip('()').split(','):
        tokens = part.split()
        if not tokens:
            # Empty breakdown such as '()' - nothing to record.
            continue
        extras[tokens[0]] = int(tokens[1])
    extras['runs'] = int(tableRow[2].text)
    return extras


def _parseTotal(tableRow):
    """
        Parses the 'total' row: cell 1 holds '(<n> <label>; <n> <label>)',
        cell 2 the total runs. Returns {<label>: float(n), ..., 'runs': total}.
    """
    total = {}
    for part in tableRow[1].text.strip().strip('()').split(';'):
        tokens = part.split()
        if not tokens:
            continue
        total[tokens[1]] = float(tokens[0])
    total['runs'] = int(tableRow[2].text)
    return total


def getBattingScoreCard(scoreCard):
    """
        gets the batting score card of 1 innings
        param: scoreCard should be the data of 1 innings
        returns: dict with
            'batsman' list of batting records in batting order, followed by
                      the players who did not bat (dismissal 'DID NOT BAT',
                      all numbers zero)
            'extras'  breakdown of the extras plus 'runs' (when the row exists)
            'total'   details of the total plus 'runs' (when the row exists)
    """
    d = {'batsman': []}

    tableBody = scoreCard.find_class('batsmen')[0][1]
    for tableRow in tableBody:
        # .get() so that a row without a class attribute is treated as a
        # batsman row instead of raising KeyError.
        rowClass = tableRow.attrib.get('class')
        if rowClass == 'extra':
            d['extras'] = _parseExtras(tableRow)
        elif rowClass == 'total':
            d['total'] = _parseTotal(tableRow)
        else:
            d['batsman'].append({
                # Positions are 1-based and follow the order of the rows.
                'battingPos': len(d['batsman']) + 1,
                'playerId': int(tableRow.attrib['data-player-id']),
                'playerName': tableRow[1].text,
                'dismissal': tableRow[2].text,
                'runs': int(tableRow[3].text),
                'balls': int(tableRow[4].text),
                'strikeRate': float(tableRow[5].text),
                'fours': int(tableRow[6].text),
                'sixes': int(tableRow[7].text),
            })

    # Getting data of players who didnt bat
    didNotBat = scoreCard.find_class('remainingBatsmen')
    if didNotBat:
        for batsman in didNotBat[0][1]:
            d['batsman'].append({
                'battingPos': len(d['batsman']) + 1,
                'playerId': int(batsman[0].attrib['data-player-id']),
                'playerName': batsman[0].text.strip(),
                'dismissal': 'DID NOT BAT',
                'runs': 0,
                'balls': 0,
                # float, to match the type used for players who did bat
                'strikeRate': 0.0,
                'fours': 0,
                'sixes': 0,
            })

    return d

In [None]:
# Ad-hoc check: batting card of the scorecard element at index 0, displayed as the cell output.
# NOTE(review): relies on a notebook-global `scoreCard` that no visible cell defines - confirm where it is created.
firstInnings = getBattingScoreCard(scoreCard[0]); firstInnings

In [None]:
# Ad-hoc check: batting card of the scorecard element at index 1, displayed as the cell output.
# NOTE(review): relies on a notebook-global `scoreCard` that no visible cell defines - confirm where it is created.
secondInnings = getBattingScoreCard(scoreCard[1]); secondInnings

In [44]:
def getBowlingScoreCard(scoreCard):
    """
        gets the bowling figures of 1 innings
        param: scoreCard should be the data of 1 innings
        returns: list with one dict per row of the 'bowlers' table holding
                 playerId, playerName, overs, runs, wickets, economy and dots
    """
    # Second child of the first 'bowlers' element holds the data rows.
    bowlerRows = scoreCard.find_class('bowlers')[0][1]

    return [
        {
            'playerId': int(row.attrib['data-player-id']),
            'playerName': row[1].text,
            'overs': float(row[2].text),
            'runs': int(row[3].text),
            'wickets': int(row[4].text),
            'economy': float(row[5].text),
            'dots': int(row[6].text),
        }
        for row in bowlerRows
    ]


In [None]:
# Ad-hoc check: bowling figures of the scorecard element at index 0, displayed as the cell output.
# NOTE(review): relies on a notebook-global `scoreCard` that no visible cell defines - confirm where it is created.
firstInningsBowling = getBowlingScoreCard(scoreCard[0]); firstInningsBowling

In [30]:
def getFallOfWickets(scoreCard):
    """
        gets the fall of wickets of 1 innings
        param: scoreCard should be the data of 1 innings
        returns: list of the wicket entries as text, e.g.
                 '1-9 (de Kock, 2.1 ov)'; empty list when the score card has
                 no 'fallOfWicket' element
    """
    fallOfWickets = scoreCard.find_class('fallOfWicket')
    if not fallOfWickets:
        return []

    # Entries are separated by commas in the page. The final strip() removes
    # the whitespace that sat between the entry and its separator, which
    # would otherwise be left at the end of the text.
    return [fow.text.strip().strip(',').strip() for fow in fallOfWickets[0][1]]

In [31]:
# Ad-hoc check: fall of wickets of the scorecard element at index 0, displayed as the cell output.
# NOTE(review): relies on a notebook-global `scoreCard` that no visible cell defines - confirm where it is created.
fow = getFallOfWickets(scoreCard[0]); fow

['1-9 (de Kock, 2.1 ov) ',
 '2-67 (Yadav, 6.6 ov) ',
 '3-76 (Sharma, 8.4 ov) ',
 '4-77 (Pandya, 8.6 ov) ',
 '5-81 (Pandya, 10.4 ov) ',
 '6-84 (Pollard, 11.5 ov) ',
 '7-123 (Kishan, 17.3 ov) ',
 '8-129 (Yadav, 18.5 ov) ',
 '9-135 (Chahar, 19.4 ov)']