In [1]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import json

In [9]:
def extract_batting_data(series_id, match_id):
    URL = 'https://www.espncricinfo.com/series/'+ str(series_id) + '/scorecard/' + str(match_id)
    page = requests.get(URL)
    bs = BeautifulSoup(page.content, 'lxml')

    table_body=bs.find_all('tbody')
    batsmen_df = pd.DataFrame(columns=["Name","Desc","Runs", "Balls", "4s", "6s", "SR", "Team"])
    # for i, table in enumerate(table_body[0:4:2]):
    for i, table in enumerate(table_body[::2]):
        rows = table.find_all('tr')
        for row in rows[::2]:
            cols=row.find_all('td')
            cols=[x.text.strip() for x in cols]
            if cols[0] == 'Extras':
                continue
            if len(cols) == 1:
                continue
            if len(cols) >= 8:
                batsmen_df = batsmen_df.append(pd.Series(
                [re.sub(r"\W+", ' ', cols[0].split("(c)")[0]).strip(), cols[1], 
                int(cols[2]), int(cols[3]), int(cols[5]), int(cols[6]), float(cols[7]), i % 2], 
                index=batsmen_df.columns ), ignore_index=True)
    return batsmen_df

In [10]:
def extract_bowling_data(series_id, match_id):
    URL = 'https://www.espncricinfo.com/series/'+ str(series_id) + '/scorecard/' + str(match_id)
    page = requests.get(URL)
    bs = BeautifulSoup(page.content, 'lxml')

    table_body=bs.find_all('tbody')
    bowler_df = pd.DataFrame(columns=['Name', 'Overs', 'Maidens', 'Runs', 'Wickets',
                                      'Econ', 'Wd', 'Nb','Team'])
    for i, table in enumerate(table_body[1::2]):
        rows = table.find_all('tr')
        for row in rows:
            cols=row.find_all('td')
            cols=[x.text.strip() for x in cols]
            if len(cols) == 11:
                bowler_df = bowler_df.append(pd.Series([cols[0], float(cols[1]), int(cols[2]), int(cols[3]), int(cols[4]), float(cols[5]), 
                                                         float(cols[9]), int(cols[10]), (i + 1) % 2], 
                                                       index=bowler_df.columns ), ignore_index=True)
    return bowler_df

In [2]:
def find_xis(series_id, match_id):
    URL = 'https://www.espncricinfo.com/series/'+ str(series_id) + '/scorecard/' + str(match_id)
    page = requests.get(URL)
    bs = BeautifulSoup(page.content, 'lxml')

    table_body=bs.find_all('tbody')
    xis = []
    table_foot = bs.find_all('tfoot')

    for i, table in enumerate(table_foot):
        rows = table.find_all('tr')
        if len(rows) == 3:
            dnb_names = rows[1].find_all('span')
            dnb_names = [x.text.replace(u'\xa0', '').strip(' ,') for x in dnb_names]
            dnb_names = [re.sub(r"\W+", ' ', i.split("(c)")[0]).strip() for i in dnb_names]
            dnb_names = [i for i in dnb_names if i]
            xis.extend(dnb_names)
 
    for i, table in enumerate(table_body):
        rows = table.find_all('tr')
        for row in rows:
            cols=row.find_all('td')
            cols=[x.text.strip() for x in cols]
            if len(cols) >= 8:
                xis.append(re.sub(r"\W+", ' ', cols[0].split("(c)")[0]).strip())
            if len(cols) == 2:
                if cols[1] == 'Batsman' or cols[1] == 'Allrounder' or cols[1] == 'Bowler' or cols[1] == 'Wicketkeeper':
                    name = re.sub(r"\W+", ' ', cols[0].split("(c)")[0]).strip()
                    xis.append(names_dict[name])
    return set(xis)

In [3]:
URL = "https://www.espncricinfo.com/ci/content/squad/index.html?object=1210595"
page = requests.get(URL)
bs = BeautifulSoup(page.content, 'lxml')

In [117]:
players = {}
columns = []
ultag = bs.find('ul', {'class': 'squads_list'})
for litag in ultag.find_all('li'):
    a = litag.find('a', href=True)
    team = " ".join(a.text.split()[:-1])
    href = a['href']
    squad_URL = "https://www.espncricinfo.com/" + href
    squad_page = requests.get(squad_URL)
    squad_bs = BeautifulSoup(squad_page.content,'lxml')
    players_div  = squad_bs.find('div', {'role': 'main'})
    for player in players_div.find_all("div", {'class': "large-13 medium-13 small-13 columns"}):
#         print(player)
        a = player.find('a', href=True)
        name = a.text.strip()
        players[name] = []
        href = a['href']
        player_URL = "https://www.espncricinfo.com/" + href
#         print(name, player_URL)
        player_attributes = player.find_all('span')
#         print(len(player_attributes), name)

        # Overseas Player
        if len(player_attributes) == 5:
            players[name].append("Overseas")
        else:
            players[name].append("Domestic")
        columns.append("Overseas/Domestic")
        
        # Role
        role_list = [span for span in player_attributes if span.find('b') and "Playing role:" in span.text]
        if (len(role_list) == 0):
            players[name].append["Unknown"]
        else:
            role = role_list[0].text.split(':')[-1]
            if "allrounder" in role or "Allrounder" in role:
                players[name].append["All-Rounder"]
            elif 'batsman' in role or 'Batsman' in role:
                players[name].append["Batsman"]
            elif 'Bowler' in role:
                players[name].append["Bowler"]
            columns.append("Role")
                #         for span in player_attributes:
#             if any.find('b') and "Playing role:" in span.text:
#                 role = span.text.split(':')[-1]
#             else:
#                 print(name)

    
    
#     break

Dwayne Bravo allrounder
Piyush Chawla allrounder
Sam Curran allrounder
MS Dhoni batsman
Faf du Plessis batsman
Harbhajan Singh bowler
Josh Hazlewood bowler
Imran Tahir bowler
Ravindra Jadeja allrounder
Kedar Jadhav allrounder
Monu Kumar allrounder
Lungi Ngidi bowler
Suresh Raina batsman
Ambati Rayudu batsman
Mitchell Santner allrounder
Karn Sharma bowler
Shardul Thakur bowler
Murali Vijay batsman
Shane Watson allrounder
Ravichandran Ashwin allrounder
Avesh Khan bowler
Alex Carey batsman
Shikhar Dhawan batsman
Shimron Hetmyer batsman
Shreyas Iyer batsman
Sandeep Lamichhane bowler
Amit Mishra bowler
Anrich Nortje bowler
Rishabh Pant batsman
Axar Patel allrounder
Harshal Patel bowler
Keemo Paul allrounder
Kagiso Rabada bowler
Ajinkya Rahane batsman
Ishant Sharma bowler
Mohit Sharma bowler
Prithvi Shaw batsman
Marcus Stoinis batsman
Lalit Yadav allrounder
Jason Roy batsman
Chris Woakes allrounder
Mayank Agarwal batsman
Arshdeep Singh bowler
Sheldon Cottrell bowler
Chris Gayle allrounder
De

In [71]:
players

{'Dwayne Bravo': ['Overseas'],
 'Piyush Chawla': ['Domestic'],
 'Sam Curran': ['Overseas'],
 'MS Dhoni': ['Domestic'],
 'Faf du Plessis': ['Overseas'],
 'Harbhajan Singh': ['Domestic'],
 'Josh Hazlewood': ['Overseas'],
 'Imran Tahir': ['Overseas'],
 'Ravindra Jadeja': ['Domestic'],
 'Kedar Jadhav': ['Domestic'],
 'Monu Kumar': ['Domestic'],
 'Lungi Ngidi': ['Overseas'],
 'Suresh Raina': ['Domestic'],
 'Ambati Rayudu': ['Domestic'],
 'Mitchell Santner': ['Overseas'],
 'Karn Sharma': ['Domestic'],
 'Shardul Thakur': ['Domestic'],
 'Murali Vijay': ['Domestic'],
 'Shane Watson': ['Overseas']}