# Scraping Career Data of a Player

The following program scrapes data from the stats pages of priemer sports website espncricinfo, and returns a json data containing extensive data player's career match-by-match.

In [1]:
import requests

```requests library is required to fetch the data from webserver via http```

In [2]:
import bs4

``` bs4/beautifulsoup is required to parse data from html```

In [3]:
base_url = 'http://stats.espncricinfo.com/ci/engine/player/'

In [4]:
params = [
    ('class', '3'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match')
]

In [5]:
player_id = '253802'

```player_id is the id of the player in espncricinfo```

In [6]:
url = base_url + player_id + '.html'

In [7]:
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')

In [8]:
table = soup.find_all("table")[3]

In [9]:
rows = bs4.BeautifulSoup(str(table))

In [10]:
match_data = rows.find_all('tr')[1:]

#### Finds basic batting details for each match played

In [11]:
data = []
i = 0
for match in match_data:
    i = i+1
    isDismissed = False
    didNotBat = False

    cols = bs4.BeautifulSoup(str(match))
    
    score_cell = cols.find_all('td')[0]
    soup = bs4.BeautifulSoup(str(score_cell))
    score = soup.get_text()
    if score == "DNB" or score == "TDNB":
        didNotBat = True
        runs = 0
    elif "*" in score:
        runs = int(score[:-1])
    else:
        isDismissed = True
        runs =  int(score)
        

    bf_cell = cols.find_all('td')[2]
    soup = bs4.BeautifulSoup(str(bf_cell))
    balls = soup.get_text()
    if balls == "-":
        ball = 0
        strikeRate = 0
    else:
        ball = balls
        strikeRate = round(int(runs)/int(balls) * 100,2)
        
    fours_cell  = cols.find_all('td')[4]
    soup = bs4.BeautifulSoup(str(fours_cell))
    fours = soup.get_text()
    if fours == "-":
        four = 0
    else:
        four = fours
    
    sixes_cell  = cols.find_all('td')[5]
    soup = bs4.BeautifulSoup(str(sixes_cell))
    sixes = soup.get_text()
    if sixes == "-":
        six = 0
    else:
        six = sixes
        
    
    opposition_cell  = cols.find_all('td')[7]
    soup = bs4.BeautifulSoup(str(opposition_cell))
    opposition = soup.get_text()[2:]
    
    ground_cell  = cols.find_all('td')[8]
    soup = bs4.BeautifulSoup(str(ground_cell))
    ground = soup.get_text()
    
    date_cell  = cols.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(date_cell))
    date = soup.get_text()
    
    id_cell  = cols.find_all('td')[10]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1] 
    
    entry = {
        'balls': int(ball),
        'batPosition': None,
        'captained': False,
        'date': date,
        'didNotBat': didNotBat,
        'dismissalType': None,
        'dismissedAgainst': None,
        'dismissalBowlerHand': None,
        'matchID': int(matchID),
        'minutes': 0,
        'fours': int(four),
        'firstBat': False,
        'ground': ground,
        'innings': 1,
        'match': int(i),
        'notOut': not isDismissed,
        'opposition': opposition,
        'runs': int(runs),
        'sixes': int(six),
        'strikeRate': int(strikeRate),
        'won': False
    }
    data.append(entry)

In [12]:
import pandas as pd

``` pandas library is required to handle large amount of data easily ```

In [13]:
dataset = pd.DataFrame(data)
print(dataset)

    balls batPosition  captained         date  didNotBat dismissalBowlerHand  \
0      21        None      False  12 Jun 2010      False                None   
1       0        None      False  13 Jun 2010       True                None   
2      19        None      False   9 Jan 2011      False                None   
3      12        None      False   4 Jun 2011      False                None   
4       5        None      False  31 Aug 2011      False                None   
5      16        None      False  29 Oct 2011      False                None   
6      21        None      False   1 Feb 2012      False                None   
7      24        None      False   3 Feb 2012      False                None   
8       0        None      False  30 Mar 2012       True                None   
9      48        None      False   7 Aug 2012      False                None   
10     41        None      False  11 Sep 2012      False                None   
11     39        None      False  19 Sep

#### Finds matches won among played matches

In [14]:
params = [
    ('class', '3'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match'),
    ('result','1')
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

won = []
for match in match_data:
    id_cell  = match.find_all('td')[10]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    won.append(int(matchID))
print(won)

[182, 183, 196, 200, 218, 255, 265, 272, 282, 286, 292, 298, 331, 378, 382, 389, 393, 399, 485, 486, 489, 509, 512, 515, 517, 521, 541, 547, 553, 593, 594, 618, 623, 630, 632, 652, 678, 680, 684, 690, 714]


#### Finds matches as captain among played matches

In [15]:
params = [
    ('captain','1'),
    ('class', '3'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

captained = []
for match in match_data:
    id_cell  = match.find_all('td')[10]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    captained.append(int(matchID))
print(captained)

[592, 593, 594, 617, 618, 623, 624, 630, 631, 632, 652, 654, 678, 680, 684, 688, 690, 712, 713, 714, 748, 749]


#### Finds Batting Position, Batting Innings, Minutes Batted, Dismissal Type in played matches

In [16]:
params = [
    ('class', '3'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'innings'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

batPosition = []
batFirst = []
dismissalType = []
minutes = []
for match in match_data:

    id_cell  = match.find_all('td')[13]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]

    position_cell  = match.find_all('td')[6]
    soup = bs4.BeautifulSoup(str(position_cell))
    batPosition.append({
        "id": int(matchID),
        "pos": soup.get_text()
    })
    
    minutes_cell  = match.find_all('td')[1]
    soup = bs4.BeautifulSoup(str(minutes_cell))
    if soup.get_text() != '-':
        minutes.append({
            "id": int(matchID),
            "min": soup.get_text()
        })
    
    dismissal_cell  = match.find_all('td')[7]
    soup = bs4.BeautifulSoup(str(dismissal_cell))
    if soup.get_text() != '-' or soup.get_text() != 'not out':
        dismissalType.append({
            "id": int(matchID),
            "type": soup.get_text()
        })
    
    innings_cell  = match.find_all('td')[8]
    soup = bs4.BeautifulSoup(str(innings_cell))
    if soup.get_text() == '1':  
        batFirst.append(matchID)

#### Finds Dismissal against Pace/Spin Bowlers in played matches

In [17]:
dismissedAgainst = []

params = [
    ('class', '3'),
    ('bowling_pacespin','1'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'dismissal_list'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

for match in match_data:

    id_cell  = match.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    dismissedAgainst.append({
        "type": "pace",
        "id": int(matchID)
    })
    
params = [
    ('class', '3'),
    ('bowling_pacespin','2'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'dismissal_list'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

for match in match_data:

    id_cell  = match.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    dismissedAgainst.append({
        "type": "spin",
        "id": int(matchID)
    })
    
print(dismissedAgainst)


[{'type': 'pace', 'id': 200}, {'type': 'pace', 'id': 204}, {'type': 'pace', 'id': 214}, {'type': 'pace', 'id': 218}, {'type': 'pace', 'id': 255}, {'type': 'pace', 'id': 261}, {'type': 'pace', 'id': 265}, {'type': 'pace', 'id': 278}, {'type': 'pace', 'id': 286}, {'type': 'pace', 'id': 292}, {'type': 'pace', 'id': 294}, {'type': 'pace', 'id': 296}, {'type': 'pace', 'id': 331}, {'type': 'pace', 'id': 382}, {'type': 'pace', 'id': 405}, {'type': 'pace', 'id': 456}, {'type': 'pace', 'id': 509}, {'type': 'pace', 'id': 512}, {'type': 'pace', 'id': 562}, {'type': 'pace', 'id': 593}, {'type': 'pace', 'id': 617}, {'type': 'pace', 'id': 618}, {'type': 'pace', 'id': 624}, {'type': 'pace', 'id': 654}, {'type': 'pace', 'id': 678}, {'type': 'pace', 'id': 680}, {'type': 'pace', 'id': 688}, {'type': 'pace', 'id': 690}, {'type': 'spin', 'id': 196}, {'type': 'spin', 'id': 217}, {'type': 'spin', 'id': 272}, {'type': 'spin', 'id': 393}, {'type': 'spin', 'id': 489}, {'type': 'spin', 'id': 535}, {'type': 'spi

#### Finds Dismissal against Right/Left-Handed Bowlers in played matches

In [18]:
dismissalBowlerHand = []

params = [
    ('class', '3'),
    ('bowling_hand','1'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'dismissal_list'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

for match in match_data:

    id_cell  = match.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    dismissalBowlerHand.append({
        "hand": "right",
        "id": int(matchID)
    })
    
params = [
    ('class', '3'),
    ('bowling_hand','2'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'dismissal_list'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

for match in match_data:

    id_cell  = match.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    dismissalBowlerHand.append({
        "hand": "left",
        "id": int(matchID)
    })
    
print(dismissalBowlerHand)

[{'hand': 'right', 'id': 196}, {'hand': 'right', 'id': 200}, {'hand': 'right', 'id': 204}, {'hand': 'right', 'id': 214}, {'hand': 'right', 'id': 218}, {'hand': 'right', 'id': 255}, {'hand': 'right', 'id': 265}, {'hand': 'right', 'id': 272}, {'hand': 'right', 'id': 278}, {'hand': 'right', 'id': 286}, {'hand': 'right', 'id': 292}, {'hand': 'right', 'id': 294}, {'hand': 'right', 'id': 331}, {'hand': 'right', 'id': 382}, {'hand': 'right', 'id': 393}, {'hand': 'right', 'id': 405}, {'hand': 'right', 'id': 456}, {'hand': 'right', 'id': 489}, {'hand': 'right', 'id': 509}, {'hand': 'right', 'id': 512}, {'hand': 'right', 'id': 535}, {'hand': 'right', 'id': 547}, {'hand': 'right', 'id': 562}, {'hand': 'right', 'id': 592}, {'hand': 'right', 'id': 593}, {'hand': 'right', 'id': 617}, {'hand': 'right', 'id': 632}, {'hand': 'right', 'id': 654}, {'hand': 'right', 'id': 678}, {'hand': 'right', 'id': 680}, {'hand': 'right', 'id': 690}, {'hand': 'right', 'id': 712}, {'hand': 'right', 'id': 748}, {'hand': 

#### Adds the new data found into the existing match-wise dataset

In [19]:
for match in won:
    dataset.loc[dataset['matchID'] == match, 'won'] = True
for match in captained:
    dataset.loc[dataset['matchID'] == match, 'captained'] = True
for match in dismissalType:
    dataset.loc[dataset['matchID'] == match['id'], 'dismissalType'] = match['type']
for match in dismissedAgainst:
    dataset.loc[dataset['matchID'] == match['id'], 'dismissedAgainst'] = match['type']
for match in dismissalBowlerHand:
    dataset.loc[dataset['matchID'] == match['id'], 'dismissalBowlerHand'] = match['hand']
for match in minutes:
    dataset.loc[dataset['matchID'] == match['id'], 'minutes'] = match['min']
for match in batPosition:
    dataset.loc[dataset['matchID'] == match['id'], 'batPosition'] = match['pos']
for match in batFirst:
    dataset.loc[dataset['matchID'] == match, 'firstBat'] = True

  result = method(y)


In [20]:
print(dataset)

    balls batPosition  captained         date  didNotBat dismissalBowlerHand  \
0      21           5      False  12 Jun 2010      False                None   
1       0           -      False  13 Jun 2010       True                None   
2      19           1      False   9 Jan 2011      False               right   
3      12           3      False   4 Jun 2011      False               right   
4       5           4      False  31 Aug 2011      False               right   
5      16           3      False  29 Oct 2011      False               right   
6      21           3      False   1 Feb 2012      False                left   
7      24           3      False   3 Feb 2012      False               right   
8       0           -      False  30 Mar 2012       True                None   
9      48           3      False   7 Aug 2012      False               right   
10     41           2      False  11 Sep 2012      False                left   
11     39           3      False  19 Sep

#### Converts the dataset in Pandas to JSON

In [21]:
json = dataset.to_json(orient='records')
print(json)

[{"balls":21,"batPosition":"5","captained":false,"date":"12 Jun 2010","didNotBat":false,"dismissalBowlerHand":null,"dismissalType":"not out","dismissedAgainst":null,"firstBat":false,"fours":3,"ground":"Harare","innings":1,"match":1,"matchID":182,"minutes":"30","notOut":true,"opposition":"Zimbabwe","runs":26,"sixes":1,"strikeRate":123,"won":true},{"balls":0,"batPosition":"-","captained":false,"date":"13 Jun 2010","didNotBat":true,"dismissalBowlerHand":null,"dismissalType":"-","dismissedAgainst":null,"firstBat":false,"fours":0,"ground":"Harare","innings":1,"match":2,"matchID":183,"minutes":0,"notOut":true,"opposition":"Zimbabwe","runs":0,"sixes":0,"strikeRate":0,"won":true},{"balls":19,"batPosition":"1","captained":false,"date":"9 Jan 2011","didNotBat":false,"dismissalBowlerHand":"right","dismissalType":"bowled","dismissedAgainst":"spin","firstBat":false,"fours":5,"ground":"Durban","innings":1,"match":3,"matchID":196,"minutes":"28","notOut":false,"opposition":"South Africa","runs":28,"si