# Scraping Career Data of a Player

The following program scrapes data from the stats pages of the premier sports website ESPNcricinfo and returns JSON containing extensive match-by-match data on a player's career.

In [1]:
import requests

```requests library is required to fetch the data from webserver via http```

In [2]:
import bs4

``` bs4/beautifulsoup is required to parse data from html```

In [3]:
# Common URL prefix of the player stats pages; the player id and '.html' are appended to it.
base_url = 'http://stats.espncricinfo.com/ci/engine/player/'

In [4]:
# Query-string parameters for the stats request (passed to requests.get as `params`).
# NOTE(review): class=2 presumably selects a single match format — confirm against the site.
params = [
    ('class', '2'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match')
]

In [5]:
# Id of the player to scrape, kept as a string because it is concatenated into the URL.
player_id = '253802'

```player_id is the id of the player in espncricinfo```

In [6]:
# Stats page of the selected player: <base_url><player_id>.html
url = base_url + player_id + '.html'

In [7]:
# Fetch the match-by-match batting page and parse it.
response = requests.get(url=url, params=params)
# Fail here with a clear HTTP error instead of a confusing IndexError later,
# when the expected table is looked up in an error page.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'lxml')

In [8]:
# The results grid is selected by position: the fourth <table> element of the page.
# NOTE(review): this index is tied to the current page layout — confirm it if the site changes.
table = soup.find_all("table")[3]

In [9]:
# Re-parse the table on its own. The parser is named explicitly (the same one
# used for the full page) so bs4 does not have to guess one, which triggers a
# warning and may select a different parser on another machine.
rows = bs4.BeautifulSoup(str(table), 'lxml')

In [10]:
# Every <tr> of the table except the first one (presumably the header row — confirm).
match_data = rows.find_all('tr')[1:]

#### Finds basic batting details for each match played

In [11]:
def _as_count(text):
    """Return a numeric table cell as an int, treating the '-' placeholder as 0."""
    return 0 if text == "-" else int(text)


# One dict per match row. Fields that this page cannot provide (batPosition,
# captained, dismissal details, minutes, firstBat, won) are given placeholder
# values here.
data = []
for i, match in enumerate(match_data, start=1):
    # Read the text of every cell once, straight from the row, instead of
    # re-parsing each <td> with a guessed parser.
    cells = [td.get_text() for td in match.find_all('td')]

    # Column 0: score. "DNB"/"TDNB" mark a match without an innings and a
    # trailing "*" marks a not-out score; anything else is a dismissal.
    score = cells[0]
    didNotBat = score == "DNB" or score == "TDNB"
    isDismissed = False
    if didNotBat:
        runs = 0
    elif "*" in score:
        runs = int(score[:-1])
    else:
        isDismissed = True
        runs = int(score)

    # Column 2: balls faced. The strike rate is always computed for the
    # current row; it falls back to 0 when no ball was faced. (Previously a
    # "0" cell left the value of the preceding match in place, or raised
    # NameError on the first row.)
    balls = _as_count(cells[2])
    strikeRate = round(runs / balls * 100, 2) if balls > 0 else 0

    # Columns 4 and 5: boundaries.
    fours = _as_count(cells[4])
    sixes = _as_count(cells[5])

    # Column 7: opposition, with the first two characters dropped
    # (presumably a "v " prefix — confirm).
    opposition = cells[7][2:]
    ground = cells[8]
    date = cells[9]
    # Column 10: match label; the number after '#' is used as the match id.
    matchID = cells[10].split("#", 1)[1]

    entry = {
        'balls': balls,
        'batPosition': None,
        'captained': False,
        'date': date,
        'didNotBat': didNotBat,
        'dismissalType': None,
        'dismissedAgainst': None,
        'dismissalBowlerHand': None,
        'matchID': int(matchID),
        'minutes': 0,
        'fours': fours,
        'firstBat': False,
        'ground': ground,
        'match': i,
        'notOut': not isDismissed,
        'opposition': opposition,
        'runs': runs,
        'sixes': sixes,
        # NOTE(review): int() truncates the two-decimal rate (54.55 -> 54).
        # Kept as-is so the output schema does not change; confirm the intent.
        'strikeRate': int(strikeRate),
        'won': False
    }
    data.append(entry)

In [12]:
import pandas as pd

``` the pandas library is required to handle large amounts of data easily ```

In [13]:
# Build a DataFrame with one row per entry of `data`, then print it in full.
dataset = pd.DataFrame(data)
print(dataset)

     balls batPosition  captained         date  didNotBat dismissalBowlerHand  \
0       22        None      False  18 Aug 2008      False                None   
1       67        None      False  20 Aug 2008      False                None   
2       38        None      False  24 Aug 2008      False                None   
3       66        None      False  27 Aug 2008      False                None   
4       46        None      False  29 Aug 2008      False                None   
5        2        None      False  14 Sep 2009      False                None   
6       24        None      False  26 Sep 2009      False                None   
7        0        None      False  28 Sep 2009       True                None   
8      104        None      False  30 Sep 2009      False                None   
9       41        None      False  25 Oct 2009      False                None   
10      16        None      False   2 Nov 2009      False                None   
11      19        None      

#### Finds matches won among played matches

In [14]:
# Same per-match query with an extra result=1 filter; the ids of the returned
# matches are collected in `won`.
params = [
    ('class', '2'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match'),
    ('result','1')
]
response = requests.get(url=url, params=params)
# Surface HTTP failures directly rather than as an IndexError on the table lookup.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

won = []
for match in match_data:
    # Column 10 holds the match label; the number after '#' is the match id.
    # The cell text is read directly — re-parsing the cell without naming a
    # parser only added a warning.
    id_text = match.find_all('td')[10].get_text()
    won.append(int(id_text.split("#", 1)[1]))
print(won)

[2745, 2750, 2755, 2889, 2904, 2932, 2935, 2939, 2941, 2942, 2961, 2962, 2983, 2993, 2996, 3001, 3032, 3039, 3060, 3070, 3072, 3074, 3076, 3077, 3080, 3082, 3100, 3121, 3124, 3141, 3143, 3147, 3148, 3159, 3160, 3161, 3199, 3201, 3205, 3207, 3210, 3217, 3219, 3223, 3224, 3233, 3237, 3251, 3259, 3263, 3291, 3293, 3294, 3295, 3316, 3320, 3322, 3327, 3363, 3368, 3372, 3376, 3377, 3383, 3387, 3388, 3395, 3397, 3399, 3402, 3403, 3420, 3424, 3428, 3436, 3439, 3474, 3483, 3517, 3520, 3523, 3533, 3535, 3539, 3540, 3543, 3544, 3547, 3602, 3610, 3618, 3625, 3631, 3636, 3641, 3661, 3692, 3698, 3727, 3796, 3798, 3800, 3819, 3821, 3878, 3886, 3891, 3896, 3898, 3902, 3905, 3906, 3907, 3908, 3909, 3910, 3912, 3914, 3919, 3931, 3932, 3969, 3970, 3971, 3976, 3978, 4014, 4056, 4063, 4064, 4078, 4079, 4082, 4085, 4088, 4102, 4106]


#### Finds matches as captain among played matches

In [15]:
# Per-match query with the captain=1 advanced filter; the ids of the returned
# matches are collected in `captained`.
params = [
    ('captain','1'),
    ('class', '2'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'match'),
]
response = requests.get(url=url, params=params)
# Surface HTTP failures directly rather than as an IndexError on the table lookup.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

captained = []
for match in match_data:
    # Column 10 holds the match label; the number after '#' is the match id.
    # The cell text is read directly — re-parsing the cell without naming a
    # parser only added a warning.
    id_text = match.find_all('td')[10].get_text()
    captained.append(int(id_text.split("#", 1)[1]))
print(captained)

[3382, 3383, 3387, 3395, 3397, 3399, 3402, 3403, 3474, 3476, 3479, 3483, 3539, 3540, 3543, 3544, 3547, 3819, 3821, 3824, 3878, 3882, 3886, 3891, 3894, 3895, 3896, 3898, 3900, 3902, 3905, 3906, 3907, 3908, 3909, 3910, 3912, 3914, 3917, 3919, 3928, 3931, 3932, 3969, 3970, 3971, 3973, 3976, 3978, 4014, 4016, 4018, 4056, 4059, 4062, 4063, 4064, 4077, 4078, 4079, 4082, 4085, 4088, 4102, 4106, 4109, 4111, 4113]


#### Finds Batting Position, Batting Innings, Minutes Batted, Dismissal Type in played matches

In [16]:
# Innings-level view of the same player page. From every row this cell
# collects the batting position, the minutes batted, the dismissal type and
# whether the innings number was 1.
params = [
    ('class', '2'),
    ('filter','advanced'),
    ('template', 'results'),
    ('type', 'batting'),
    ('view', 'innings'),
]
response = requests.get(url=url, params=params)
# Surface HTTP failures directly rather than as an IndexError on the table lookup.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text,'lxml')
table = soup.find_all("table")[3]
match_data = table.find_all('tr')[1:]

batPosition = []
batFirst = []
dismissalType = []
minutes = []
for match in match_data:
    # Read each cell's text once, directly from the row.
    cells = [td.get_text() for td in match.find_all('td')]

    # Column 13: match label; the number after '#' is the match id.
    matchID = int(cells[13].split("#", 1)[1])

    # Column 6: batting position (recorded for every row, placeholder included).
    batPosition.append({
        "id": matchID,
        "pos": cells[6]
    })

    # Column 1: minutes batted, skipped when the cell is the '-' placeholder.
    if cells[1] != '-':
        minutes.append({
            "id": matchID,
            "min": cells[1]
        })

    # Column 7: dismissal. Only real dismissals are recorded. The previous
    # test (`!= '-' or != 'not out'`) was always true, so the placeholder and
    # "not out" rows were stored as dismissal types as well.
    if cells[7] not in ('-', 'not out'):
        dismissalType.append({
            "id": matchID,
            "type": cells[7]
        })

    # Column 8: innings number; '1' is taken to mean the player's side batted first.
    if cells[8] == '1':
        batFirst.append(matchID)

#### Finds Dismissal against Pace/Spin Bowlers in played matches

In [17]:
# Dismissal lists filtered by bowling style. The two queries differ only in
# the bowling_pacespin code, so they share one loop.
dismissedAgainst = []

for pacespin_code, bowling_type in (('1', 'pace'), ('2', 'spin')):
    params = [
        # Both queries use class=2 like every other request in this notebook.
        # The spin query previously sent class=3, which asked for a different
        # class of matches than the rest of the dataset.
        ('class', '2'),
        ('bowling_pacespin', pacespin_code),
        ('filter','advanced'),
        ('template', 'results'),
        ('type', 'batting'),
        ('view', 'dismissal_list'),
    ]
    response = requests.get(url=url, params=params)
    # Surface HTTP failures directly rather than as an IndexError on the table lookup.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text,'lxml')
    table = soup.find_all("table")[3]
    match_data = table.find_all('tr')[1:]

    for match in match_data:
        # Column 9 holds the match label; the number after '#' is the match id.
        matchID = match.find_all('td')[9].get_text().split("#", 1)[1]
        dismissedAgainst.append({
            "type": bowling_type,
            "id": int(matchID)
        })

print(dismissedAgainst)


[{'type': 'pace', 'id': 2742}, {'type': 'pace', 'id': 2745}, {'type': 'pace', 'id': 2755}, {'type': 'pace', 'id': 2756}, {'type': 'pace', 'id': 2919}, {'type': 'pace', 'id': 2932}, {'type': 'pace', 'id': 2938}, {'type': 'pace', 'id': 2943}, {'type': 'pace', 'id': 2961}, {'type': 'pace', 'id': 2963}, {'type': 'pace', 'id': 2988}, {'type': 'pace', 'id': 2999}, {'type': 'pace', 'id': 3001}, {'type': 'pace', 'id': 3032}, {'type': 'pace', 'id': 3039}, {'type': 'pace', 'id': 3040}, {'type': 'pace', 'id': 3060}, {'type': 'pace', 'id': 3070}, {'type': 'pace', 'id': 3072}, {'type': 'pace', 'id': 3076}, {'type': 'pace', 'id': 3079}, {'type': 'pace', 'id': 3082}, {'type': 'pace', 'id': 3087}, {'type': 'pace', 'id': 3110}, {'type': 'pace', 'id': 3124}, {'type': 'pace', 'id': 3141}, {'type': 'pace', 'id': 3147}, {'type': 'pace', 'id': 3159}, {'type': 'pace', 'id': 3161}, {'type': 'pace', 'id': 3189}, {'type': 'pace', 'id': 3210}, {'type': 'pace', 'id': 3217}, {'type': 'pace', 'id': 3219}, {'type': 

#### Finds Dismissal against Right/Left-Handed Bowlers in played matches

In [18]:
# Dismissal lists filtered by the bowler's hand. The two queries differ only
# in the bowling_hand code, so they share one loop.
dismissalBowlerHand = []

for hand_code, hand in (('1', 'right'), ('2', 'left')):
    params = [
        ('class', '2'),
        ('bowling_hand', hand_code),
        ('filter','advanced'),
        ('template', 'results'),
        ('type', 'batting'),
        ('view', 'dismissal_list'),
    ]
    response = requests.get(url=url, params=params)
    # Surface HTTP failures directly rather than as an IndexError on the table lookup.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text,'lxml')
    table = soup.find_all("table")[3]
    match_data = table.find_all('tr')[1:]

    for match in match_data:
        # Column 9 holds the match label; the number after '#' is the match id.
        # The cell text is read directly — re-parsing the cell without naming
        # a parser only added a warning.
        matchID = match.find_all('td')[9].get_text().split("#", 1)[1]
        dismissalBowlerHand.append({
            "hand": hand,
            "id": int(matchID)
        })

print(dismissalBowlerHand)

[{'hand': 'right', 'id': 2742}, {'hand': 'right', 'id': 2756}, {'hand': 'right', 'id': 2898}, {'hand': 'right', 'id': 2933}, {'hand': 'right', 'id': 2935}, {'hand': 'right', 'id': 2961}, {'hand': 'right', 'id': 2963}, {'hand': 'right', 'id': 2983}, {'hand': 'right', 'id': 2986}, {'hand': 'right', 'id': 2996}, {'hand': 'right', 'id': 2999}, {'hand': 'right', 'id': 3001}, {'hand': 'right', 'id': 3032}, {'hand': 'right', 'id': 3039}, {'hand': 'right', 'id': 3040}, {'hand': 'right', 'id': 3060}, {'hand': 'right', 'id': 3079}, {'hand': 'right', 'id': 3082}, {'hand': 'right', 'id': 3087}, {'hand': 'right', 'id': 3110}, {'hand': 'right', 'id': 3124}, {'hand': 'right', 'id': 3141}, {'hand': 'right', 'id': 3143}, {'hand': 'right', 'id': 3148}, {'hand': 'right', 'id': 3159}, {'hand': 'right', 'id': 3160}, {'hand': 'right', 'id': 3161}, {'hand': 'right', 'id': 3162}, {'hand': 'right', 'id': 3187}, {'hand': 'right', 'id': 3189}, {'hand': 'right', 'id': 3191}, {'hand': 'right', 'id': 3195}, {'hand'

#### Adds the new data found into the existing match-wise dataset

In [19]:
# Boolean columns: switch the flag on for every row whose matchID appears in
# the corresponding id list.
flag_sources = (
    ('won', won),
    ('captained', captained),
    ('firstBat', batFirst),
)
for column, match_ids in flag_sources:
    dataset.loc[dataset['matchID'].isin(match_ids), column] = True

# Value columns: every record carries a match id plus one value, stored under
# a record-specific key; copy that value into the matching row(s).
value_sources = (
    ('dismissalType', dismissalType, 'type'),
    ('dismissedAgainst', dismissedAgainst, 'type'),
    ('dismissalBowlerHand', dismissalBowlerHand, 'hand'),
    ('minutes', minutes, 'min'),
    ('batPosition', batPosition, 'pos'),
)
for column, records, value_key in value_sources:
    for record in records:
        dataset.loc[dataset['matchID'] == record['id'], column] = record[value_key]

In [20]:
# Print the dataset in full again, now that the extra columns have been filled in.
print(dataset)

     balls batPosition  captained         date  didNotBat dismissalBowlerHand  \
0       22           2      False  18 Aug 2008      False               right   
1       67           2      False  20 Aug 2008      False                left   
2       38           1      False  24 Aug 2008      False                None   
3       66           1      False  27 Aug 2008      False                left   
4       46           1      False  29 Aug 2008      False               right   
5        2           7      False  14 Sep 2009      False                None   
6       24           4      False  26 Sep 2009      False               right   
7        0           -      False  28 Sep 2009       True                None   
8      104           4      False  30 Sep 2009      False                None   
9       41           4      False  25 Oct 2009      False                left   
10      16           3      False   2 Nov 2009      False                left   
11      19           7      

#### Converts the dataset in Pandas to JSON

In [21]:
# Serialise the DataFrame to a JSON string holding one object per row, then print it.
# NOTE(review): the variable name `json` would shadow the standard-library
# module of the same name if that module is imported in this notebook.
json = dataset.to_json(orient='records')
print(json)

[{"balls":22,"batPosition":"2","captained":false,"date":"18 Aug 2008","didNotBat":false,"dismissalBowlerHand":"right","dismissalType":"lbw","dismissedAgainst":"pace","firstBat":true,"fours":1,"ground":"Dambulla","match":1,"matchID":2742,"minutes":"33","notOut":false,"opposition":"Sri Lanka","runs":12,"sixes":0,"strikeRate":54,"won":false},{"balls":67,"batPosition":"2","captained":false,"date":"20 Aug 2008","didNotBat":false,"dismissalBowlerHand":"left","dismissalType":"caught","dismissedAgainst":"pace","firstBat":false,"fours":6,"ground":"Dambulla","match":2,"matchID":2745,"minutes":"82","notOut":false,"opposition":"Sri Lanka","runs":37,"sixes":0,"strikeRate":55,"won":true},{"balls":38,"batPosition":"1","captained":false,"date":"24 Aug 2008","didNotBat":false,"dismissalBowlerHand":null,"dismissalType":"run out","dismissedAgainst":null,"firstBat":true,"fours":4,"ground":"Colombo (RPS)","match":3,"matchID":2750,"minutes":"40","notOut":false,"opposition":"Sri Lanka","runs":25,"sixes":0,"s