In [0]:
import csv
import io
import sys
import traceback

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [0]:
# Per-discipline scraping configuration: which profile label carries the
# playing style, which career table on the profile page to read, and the
# output column names for the positional values of that table's 'T20s' row.
_BATTING_SPEC = {
    'style_label': 'Batting style',
    'style_key': 'batting_style',
    'table_index': 0,
    'stat_columns': ['matches', 'Inns', 'not_out', 'runs', 'highest_score', 'Avg',
                     'Balls_faced', 'SR', 'hundreds', 'fiftys', 'fours', 'sixes', 'ct', 'st'],
}
_BOWLING_SPEC = {
    'style_label': 'Bowling style',
    'style_key': 'bowling_style',
    'table_index': 1,
    'stat_columns': ['matches', 'Inns', 'balls', 'runs', 'wkts', 'bbi',
                     'bbm', 'avg', 'econ', 'SR', 'four_W', 'five_W', 'ten'],
}
_PROFILE_KEYS = ['fullname', 'born', 'playing_role']


def _output_columns(spec):
    """Return the ordered output column names for one discipline."""
    return _PROFILE_KEYS + [spec['style_key']] + spec['stat_columns']


def _fetch_soup(url, timeout):
    """Download ``url`` and return its HTML parsed with the lxml parser."""
    source = requests.get(url, timeout=timeout).text
    return BeautifulSoup(source, 'lxml')


def _extend_with_player_links(links, section):
    """Append the not-yet-collected '...html' hrefs found in ``section`` to ``links``."""
    for anchor in section.find_all('a'):
        href = anchor.get('href')  # anchors without an href are skipped
        # De-duplicate against the collected hrefs (the original compared
        # against the list of anchor tags, which never matched).
        if href and 'html' in href and href not in links:
            links.append(href)


def _match_player_links(match_url, timeout):
    """Return ``(batsmen_links, bowler_links)`` collected from one match scorecard page."""
    batsmen_links, bowler_links = [], []
    main_div = _fetch_soup(match_url, timeout).find('div', 'col-b')
    if main_div is None:
        return batsmen_links, bowler_links

    # One scorecard article per innings; a card missing either section is skipped.
    for card in main_div.find_all('article', 'sub-module scorecard'):
        batsmen_div = card.find('div', 'scorecard-section batsmen')
        bowler_div = card.find('div', 'scorecard-section bowling')
        if batsmen_div is None or bowler_div is None:
            continue
        _extend_with_player_links(batsmen_links, batsmen_div)
        _extend_with_player_links(bowler_links, bowler_div)
    return batsmen_links, bowler_links


def _scrape_player(link, spec, timeout):
    """Scrape one player profile page into a record dict.

    Returns None when the page lacks the profile panel, the expected career
    table, a 'T20s' row, or enough values in that row.
    """
    panel = _fetch_soup(link, timeout).find('div', 'pnl490M')
    if panel is None:
        return None

    # Personal information: every field defaults to "" when its label is absent.
    label_to_key = {
        'Full name': 'fullname',
        'Born': 'born',
        'Playing role': 'playing_role',
        spec['style_label']: spec['style_key'],
    }
    record = {key: "" for key in label_to_key.values()}
    for para in panel.find_all('p', 'ciPlayerinformationtxt'):
        label_tag = para.find('b')
        value_tag = para.find('span')
        if label_tag is None or value_tag is None:
            continue
        key = label_to_key.get(label_tag.text.strip())
        if key is not None:
            record[key] = value_tag.text.strip()

    # Career statistics: positional values of the 'T20s' row of the chosen table.
    tables = panel.find_all('table', 'engineTable')
    if len(tables) <= spec['table_index']:
        return None
    html = tables[spec['table_index']].prettify()
    # Wrap in StringIO: passing literal HTML to read_html is deprecated in recent pandas.
    career = pd.read_html(io.StringIO(html), index_col=0)[0]
    if 'T20s' not in career.index:
        return None
    info = career.loc['T20s'].tolist()
    if len(info) < len(spec['stat_columns']):
        return None

    record.update(zip(spec['stat_columns'], info))
    return record


def _scrape_new_players(links, spec, visited, seen_names, records, timeout):
    """Scrape every link not yet in ``visited`` and append new players to ``records``."""
    for link in links:
        if link in visited:
            continue
        try:
            record = _scrape_player(link, spec, timeout)
        except Exception:
            # Best effort: report the failure and keep going. The link is not
            # marked as visited, so it is retried if it shows up in a later match.
            traceback.print_exc()
            continue

        # Remember the link so the same profile is never requested twice.
        visited.add(link)
        if record is not None and record['fullname'] not in seen_names:
            seen_names.add(record['fullname'])
            records.append(record)


def fetch_player_data(years, timeout=30):
    """
    Fetch player T20 performance data for the players appearing in the given seasons.

    For every year the season results page is downloaded, each match scorecard
    linked from it is visited, and the profile page of every batsman and bowler
    found in the scorecards is scraped once.

    parameters :
    years   : years of ipl (must be a non-empty list)
    timeout : seconds to wait on each HTTP request before giving up

    returns :
    (df_batsmens, df_bowlers) : two DataFrames with one row per distinct full name.
    A failure on an individual page is reported with a traceback and that page
    is skipped, so the frames hold whatever could be collected.

    raises :
    ValueError : if ``years`` is not a list or is empty
    """
    if not isinstance(years, list) or len(years) == 0:
        raise ValueError('Either the year is not passed in a list or the list is empty')

    base_url = ["http://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=",
                ";trophy=117;type=season"]
    link_base = "http://stats.espncricinfo.com"

    # Rows are collected as dicts and turned into DataFrames once at the end
    # (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
    bat_records, bowl_records = [], []
    visited_bat, visited_bowl = set(), set()
    seen_bat_names, seen_bowl_names = set(), set()

    for year in years:
        url = str(year).join(base_url)
        print(f'Collecting player data for year {year}....')

        try:
            data_table = _fetch_soup(url, timeout).find('table', 'engineTable')
        except Exception:
            traceback.print_exc()
            continue
        if data_table is None:
            print(f'No results table found for year {year}; skipping.')
            continue

        # A results row holds several links with the same class; only the
        # ones pointing at a match scorecard are followed.
        match_links = []
        for anchor in data_table.find_all('a', 'data-link'):
            href = str(anchor.get('href', ''))
            if "/ci/engine/match/" in href:
                match_links.append(link_base + href)

        for match_link in tqdm(match_links):
            try:
                batsmen_links, bowler_links = _match_player_links(match_link, timeout)
            except Exception:
                traceback.print_exc()
                continue

            _scrape_new_players(batsmen_links, _BATTING_SPEC, visited_bat,
                                seen_bat_names, bat_records, timeout)
            _scrape_new_players(bowler_links, _BOWLING_SPEC, visited_bowl,
                                seen_bowl_names, bowl_records, timeout)

    df_batsmens = pd.DataFrame(bat_records, columns=_output_columns(_BATTING_SPEC))
    df_bowlers = pd.DataFrame(bowl_records, columns=_output_columns(_BOWLING_SPEC))
    return df_batsmens, df_bowlers

In [28]:
%%time
df_bat,df_bowl = fetch_player_data([2017])

Collecting player data for year 2017....


HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

CPU times: user 22.9 s, sys: 292 ms, total: 23.2 s
Wall time: 2min 38s


In [0]:
# Tag every row of both frames with the season that was scraped.
df_bat['season'], df_bowl['season'] = 2017, 2017

In [0]:
# Directory prefix (note the trailing slash) that output file names are appended to.
# NOTE(review): looks like a Google Drive mount path inside Colab — confirm the
# drive is mounted before running the export cell.
Data_path = "/content/drive/My Drive/data/"

In [0]:
# Write both season tables as CSV files under Data_path, leaving out the row index.
df_bat.to_csv(f"{Data_path}player_batsmen_2017.csv", index=False)
df_bowl.to_csv(f"{Data_path}player_bowler_2017.csv", index=False)