# Scraping Career Data of a Player

The following program scrapes data from the stats pages of priemer sports website espncricinfo, and returns a json data containing extensive data player's career match-by-match.

In [1]:
import requests

```requests library is required to fetch the data from webserver via http```

In [2]:
import bs4

``` bs4/beautifulsoup is required to parse data from html```

In [3]:
base_url = 'http://stats.espncricinfo.com/ci/engine/player/'

In [4]:
params = [
    ('class', '3'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match')
]

In [5]:
player_id = '422108'

```player_id is the id of the player in espncricinfo```

In [6]:
url = base_url + player_id + '.html'

In [7]:
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')

In [8]:
table = soup.find_all("table")[3]

In [9]:
rows = bs4.BeautifulSoup(str(table))

In [10]:
match_data = rows.find_all('tr')[1:]

#### Finds basic batting details for each match played

In [11]:
data = []
i = 0
for match in match_data:
    i = i+1
    isDismissed = False
    didNotBat = False

    cols = bs4.BeautifulSoup(str(match))
    
    score_cell = cols.find_all('td')[0]
    soup = bs4.BeautifulSoup(str(score_cell))
    score = soup.get_text()
    if score == "DNB" or score == "TDNB":
        didNotBat = True
        runs = 0
    elif "*" in score:
        runs = int(score[:-1])
    else:
        isDismissed = True
        runs =  int(score)
        

    bf_cell = cols.find_all('td')[2]
    soup = bs4.BeautifulSoup(str(bf_cell))
    balls = soup.get_text()
    if balls == "-":
        ball = 0
        strikeRate = 0
    else:
        ball = balls
        if(int(balls)>0):
            strikeRate = round(int(runs)/int(balls) * 100,2)
        
    fours_cell  = cols.find_all('td')[4]
    soup = bs4.BeautifulSoup(str(fours_cell))
    fours = soup.get_text()
    if fours == "-":
        four = 0
    else:
        four = fours
    
    sixes_cell  = cols.find_all('td')[5]
    soup = bs4.BeautifulSoup(str(sixes_cell))
    sixes = soup.get_text()
    if sixes == "-":
        six = 0
    else:
        six = sixes
        
    
    opposition_cell  = cols.find_all('td')[7]
    soup = bs4.BeautifulSoup(str(opposition_cell))
    opposition = soup.get_text()[2:]
    
    ground_cell  = cols.find_all('td')[8]
    soup = bs4.BeautifulSoup(str(ground_cell))
    ground = soup.get_text()
    
    date_cell  = cols.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(date_cell))
    date = soup.get_text()
    
    id_cell  = cols.find_all('td')[10]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1] 
    
    entry = {
        'balls': int(ball),
        'batPosition': None,
        'captained': False,
        'date': date,
        'didNotBat': didNotBat,
        'dismissalType': None,
        'dismissedAgainst': None,
        'dismissalBowlerHand': None,
        'matchID': int(matchID),
        'minutes': 0,
        'fours': int(four),
        'firstBat': False,
        'ground': ground,
        'match': int(i),
        'notOut': not isDismissed,
        'opposition': opposition,
        'runs': int(runs),
        'sixes': int(six),
        'strikeRate': int(strikeRate),
        'won': False
    }
    data.append(entry)

In [12]:
import pandas as pd

``` pandas library is required to handle large amount of data easily ```

In [13]:
dataset = pd.DataFrame(data)
print(dataset)

    balls batPosition  captained         date  didNotBat dismissalBowlerHand  \
0       1        None      False  18 Jun 2016      False                None   
1      40        None      False  20 Jun 2016      False                None   
2      20        None      False  22 Jun 2016      False                None   
3      51        None      False  27 Aug 2016      False                None   
4       0        None      False  28 Aug 2016       True                None   
5       9        None      False  26 Jan 2017      False                None   
6      47        None      False  29 Jan 2017      False                None   
7      18        None      False   1 Feb 2017      False                None   
8      18        None      False   6 Sep 2017      False                None   
9      48        None      False  20 Dec 2017      False                None   
10     49        None      False  22 Dec 2017      False                None   
11      9        None      False  24 Dec

#### Finds matches won among played matches

In [14]:
params = [
    ('class', '3'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match'),
    ('result','1')
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

won = []
for match in match_data:
    id_cell  = match.find_all('td')[10]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    won.append(int(matchID))
print(won)

[559, 560, 593, 594, 618, 633, 634, 635, 659, 660, 662, 680, 684, 690, 707, 709, 710, 714]


#### Finds matches as captain among played matches

In [15]:
params = [
    ('captain','1'),
    ('class', '3'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

captained = []
for match in match_data:
    id_cell  = match.find_all('td')[10]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    captained.append(int(matchID))
print(captained)

IndexError: list index out of range

#### Finds Batting Position, Batting Innings, Minutes Batted, Dismissal Type in played matches

In [16]:
params = [
    ('class', '3'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'innings'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

batPosition = []
batFirst = []
dismissalType = []
minutes = []
for match in match_data:

    id_cell  = match.find_all('td')[13]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]

    position_cell  = match.find_all('td')[6]
    soup = bs4.BeautifulSoup(str(position_cell))
    batPosition.append({
        "id": int(matchID),
        "pos": soup.get_text()
    })
    
    minutes_cell  = match.find_all('td')[1]
    soup = bs4.BeautifulSoup(str(minutes_cell))
    if soup.get_text() != '-':
        minutes.append({
            "id": int(matchID),
            "min": soup.get_text()
        })
    
    dismissal_cell  = match.find_all('td')[7]
    soup = bs4.BeautifulSoup(str(dismissal_cell))
    if soup.get_text() != '-' or soup.get_text() != 'not out':
        dismissalType.append({
            "id": int(matchID),
            "type": soup.get_text()
        })
    
    innings_cell  = match.find_all('td')[8]
    soup = bs4.BeautifulSoup(str(innings_cell))
    if soup.get_text() == '1':  
        batFirst.append(int(matchID))

#### Finds Dismissal against Pace/Spin Bowlers in played matches

In [17]:
dismissedAgainst = []

params = [
    ('class', '3'),
    ('bowling_pacespin','1'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'dismissal_list'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

for match in match_data:

    id_cell  = match.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    dismissedAgainst.append({
        "type": "pace",
        "id": int(matchID)
    })
    
params = [
    ('class', '3'),
    ('bowling_pacespin','2'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'dismissal_list'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

for match in match_data:

    id_cell  = match.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    dismissedAgainst.append({
        "type": "spin",
        "id": int(matchID)
    })
    
print(dismissedAgainst)


[{'type': 'pace', 'id': 558}, {'type': 'pace', 'id': 560}, {'type': 'pace', 'id': 592}, {'type': 'pace', 'id': 593}, {'type': 'pace', 'id': 594}, {'type': 'pace', 'id': 633}, {'type': 'pace', 'id': 634}, {'type': 'pace', 'id': 635}, {'type': 'pace', 'id': 662}, {'type': 'pace', 'id': 680}, {'type': 'pace', 'id': 688}, {'type': 'pace', 'id': 690}, {'type': 'pace', 'id': 707}, {'type': 'pace', 'id': 710}, {'type': 'pace', 'id': 748}, {'type': 'pace', 'id': 749}, {'type': 'spin', 'id': 618}, {'type': 'spin', 'id': 659}, {'type': 'spin', 'id': 712}, {'type': 'spin', 'id': 714}]


#### Finds Dismissal against Right/Left-Handed Bowlers in played matches

In [18]:
dismissalBowlerHand = []

params = [
    ('class', '3'),
    ('bowling_hand','1'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'dismissal_list'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

for match in match_data:

    id_cell  = match.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    dismissalBowlerHand.append({
        "hand": "right",
        "id": int(matchID)
    })
    
params = [
    ('class', '3'),
    ('bowling_hand','2'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'dismissal_list'),
]
response = requests.get(url=url, params=params)
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

for match in match_data:

    id_cell  = match.find_all('td')[9]
    soup = bs4.BeautifulSoup(str(id_cell))
    matchID = soup.get_text().split("#",1)[1]
    dismissalBowlerHand.append({
        "hand": "left",
        "id": int(matchID)
    })
    
print(dismissalBowlerHand)

IndexError: list index out of range

#### Adds the new data found into the existing match-wise dataset

In [19]:
for match in won:
    dataset.loc[dataset['matchID'] == match, 'won'] = True
for match in captained:
    dataset.loc[dataset['matchID'] == match, 'captained'] = True
for match in dismissalType:
    dataset.loc[dataset['matchID'] == match['id'], 'dismissalType'] = match['type']
for match in dismissedAgainst:
    dataset.loc[dataset['matchID'] == match['id'], 'dismissedAgainst'] = match['type']
for match in dismissalBowlerHand:
    dataset.loc[dataset['matchID'] == match['id'], 'dismissalBowlerHand'] = match['hand']
for match in minutes:
    dataset.loc[dataset['matchID'] == match['id'], 'minutes'] = match['min']
for match in batPosition:
    dataset.loc[dataset['matchID'] == match['id'], 'batPosition'] = match['pos']
for match in batFirst:
    dataset.loc[dataset['matchID'] == match, 'firstBat'] = True

In [20]:
print(dataset)

    balls batPosition  captained         date  didNotBat dismissalBowlerHand  \
0       1           1      False  18 Jun 2016      False               right   
1      40           1      False  20 Jun 2016      False                None   
2      20           1      False  22 Jun 2016      False               right   
3      51           4      False  27 Aug 2016      False                None   
4       0           -      False  28 Aug 2016       True                None   
5       9           2      False  26 Jan 2017      False               right   
6      47           2      False  29 Jan 2017      False               right   
7      18           2      False   1 Feb 2017      False               right   
8      18           2      False   6 Sep 2017      False               right   
9      48           2      False  20 Dec 2017      False               right   
10     49           2      False  22 Dec 2017      False               right   
11      9           2      False  24 Dec

#### Converts the dataset in Pandas to JSON

In [21]:
json = dataset.to_json(orient='records')
print(json)

[{"balls":1,"batPosition":"1","captained":false,"date":"18 Jun 2016","didNotBat":false,"dismissalBowlerHand":"right","dismissalType":"bowled","dismissedAgainst":"pace","firstBat":false,"fours":0,"ground":"Harare","match":1,"matchID":558,"minutes":0,"notOut":false,"opposition":"Zimbabwe","runs":0,"sixes":0,"strikeRate":0,"won":false},{"balls":40,"batPosition":"1","captained":false,"date":"20 Jun 2016","didNotBat":false,"dismissalBowlerHand":null,"dismissalType":"not out","dismissedAgainst":null,"firstBat":false,"fours":2,"ground":"Harare","match":2,"matchID":559,"minutes":"56","notOut":true,"opposition":"Zimbabwe","runs":47,"sixes":2,"strikeRate":117,"won":true},{"balls":20,"batPosition":"1","captained":false,"date":"22 Jun 2016","didNotBat":false,"dismissalBowlerHand":"right","dismissalType":"bowled","dismissedAgainst":"pace","firstBat":true,"fours":3,"ground":"Harare","match":3,"matchID":560,"minutes":"21","notOut":false,"opposition":"Zimbabwe","runs":22,"sixes":1,"strikeRate":110,"wo