### Notebook use:

* This notebook scrapes batting and bowling data from http://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=2019;trophy=117;type=season

* The above link contains a table of match-level data and, in the last column, a link to the scorecard data, i.e. `<a> T20 </a>`. This link redirects to the batting and bowling data page for a particular match, e.g. https://www.espncricinfo.com/series/8048/scorecard/1175356/chennai-super-kings-vs-royal-challengers-bangalore-1st-match-indian-premier-league-2019

### Approach : 

* First, we collect the links to all the scorecards listed on the base URL, i.e. the `href` of every `<a> T20 </a>` element.
* The function parameters `years` and `no_of_matches` select which seasons, and how many matches of each season, are scraped.

* The function returns two DataFrames: one contains the batting data and the other contains the bowling data for the `no_of_matches` matches that were requested.

In [0]:

import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import sys
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
from tqdm import tqdm_notebook as tqdm

In [0]:
# Column order of the two DataFrames returned by fetch_bating_bowling_data.
_BATTING_COLUMNS = ['match_no', 'match_city', 'year', 'month', 'day', 'team_1', 'team_2',
                    'batsmen', 'wicket_status', 'R', 'B', 'M', 'fours', 'sixes', 'SR']
_BOWLING_COLUMNS = ['match_no', 'match_city', 'year', 'month', 'day', 'team_1', 'team_2',
                    'bowler', 'O', 'M', 'R', 'W', 'ECON', 'zeros', 'fours', 'sixes', 'WD', 'NB']

# The season results URL is built as str(year).join(_RESULTS_URL_PARTS).
_RESULTS_URL_PARTS = ["http://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=",
                      ";trophy=117;type=season"]
# Scorecard hrefs on the results page are joined onto this host.
_STATS_HOST = "http://stats.espncricinfo.com"
# Upper bound on any single HTTP request so a stalled server cannot hang the notebook.
_REQUEST_TIMEOUT_SECONDS = 30


def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed by BeautifulSoup with the lxml parser."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Fail with a clear HTTP error instead of a confusing parse error on an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _scorecard_links(year):
    """Return the absolute scorecard URLs found on the season results page of ``year``."""
    soup = _fetch_soup(str(year).join(_RESULTS_URL_PARTS))
    data_table = soup.find('table', 'engineTable')
    # Every table row carries several 'data-link' anchors; only the ones whose
    # href contains "/ci/engine/match/" lead to a scorecard page.
    hrefs = (str(anchor['href']) for anchor in data_table.find_all('a', 'data-link'))
    return [_STATS_HOST + href for href in hrefs if "/ci/engine/match/" in href]


def _parse_match_header(header_div):
    """Extract the match-level fields shared by every row of one match.

    Returns a dict with the keys match_no, match_city, year, month, day,
    team_1 and team_2 (all strings).
    """
    import re

    overview = header_div.find('div', 'cscore_info-overview').text.strip()
    # Comma-separated: match label, "<series> at <city>", date.
    # Presumably e.g. "1st match (N), Indian Premier League at Mumbai, Apr 7 2018"
    # -- TODO confirm against a live page.
    parts = overview.split(',')

    # NOTE(review): splitting on 's' only isolates the number for labels such as
    # "1st match"; labels without an 's' are kept whole. Behaviour kept as-is --
    # confirm what match_no is meant to hold.
    match_no = parts[0].split('s')[0]

    # Split on the standalone word "at" only. A plain substring split would also
    # cut inside city names that contain "at" (e.g. "Kolkata" -> " Kolk").
    match_city = re.split(r'\bat\b', parts[1])[1]

    # Index 0 is skipped; presumably it is the empty string left by the space
    # that follows the comma.
    date_parts = parts[2].split(' ')

    teams = header_div.find_all('span', 'cscore_name cscore_name--long')
    return {
        'match_no': match_no,
        'match_city': match_city,
        'year': date_parts[3],
        'month': date_parts[1],
        'day': date_parts[2],
        'team_1': teams[0].text.strip(),
        'team_2': teams[1].text.strip(),
    }


def _batting_records(batsmen_div, match_info):
    """Return one record (dict) per batsman row found in ``batsmen_div``."""
    stat_names = ['R', 'B', 'M', 'fours', 'sixes', 'SR']
    records = []
    for row in batsmen_div.find_all('div', 'wrap batsmen'):
        batsman = row.find('div', 'cell batsmen').text.strip()

        # The first child of the commentary cell is either a tag or a bare string.
        first_node = row.find_all('div', 'cell commentary')[0].contents[0]
        try:
            wicket_status = first_node.text.strip()
        except AttributeError:
            wicket_status = first_node.strip()

        values = [cell.text.strip() for cell in row.find_all('div', 'cell runs')]
        if len(values) == 5:
            # Five cells: a blank is inserted at the 'M' position so the other
            # statistics stay aligned with stat_names.
            values.insert(2, '')
        # Pad with blanks / drop extras so exactly six statistics are stored.
        values = (values + [''] * len(stat_names))[:len(stat_names)]

        record = dict(match_info)
        record['batsmen'] = batsman
        record['wicket_status'] = wicket_status
        record.update(zip(stat_names, values))
        records.append(record)
    return records


def _bowling_records(score_card, match_info):
    """Return one record (dict) per bowler row of every bowling table in ``score_card``."""
    records = []
    for bowling_div in score_card.find_all('div', 'scorecard-section bowling'):
        for table_row in bowling_div.find('table').find_all('tr'):
            cells = [cell.text.strip() for cell in table_row.find_all('td')]
            if not cells:
                # Rows without <td> cells carry no bowler data.
                continue
            record = dict(match_info)
            # Cell 1 is not stored (presumably a spacer column -- TODO confirm).
            record.update({
                'bowler': cells[0],
                'O': cells[2], 'M': cells[3], 'R': cells[4], 'W': cells[5], 'ECON': cells[6],
                'zeros': cells[7], 'fours': cells[8], 'sixes': cells[9],
                'WD': cells[10], 'NB': cells[11],
            })
            records.append(record)
    return records


def fetch_bating_bowling_data(years,no_of_matches=1):
    """
    Scrape batting and bowling scorecards from the ESPNcricinfo season results pages.

    Parameters
    ----------
    years : list
        Non-empty list of the seasons (years) to scrape.
    no_of_matches : int, default 1
        Number of matches to scrape for each year, taken from the top of the
        results table. Pass -1 to scrape every match of every requested year.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        (batting, bowling) score-card rows. A tuple is always returned: if a
        download or a page fails to parse, the error is printed, scraping stops,
        and the frames hold the matches that were completely parsed before the
        failure (possibly none).

    Raises
    ------
    ValueError
        If ``years`` is not a list or is an empty list.
    """
    # check if the passed year is in the format of a list or not and if the list is non-empty
    if not isinstance(years, list) or len(years) == 0:
        raise ValueError('Etiher the year is not passed in a list or the list is empty')

    # -1 means "all matches". It is resolved to an open-ended slice per year rather
    # than to the first year's match count, which would cap every later year.
    limit = None if no_of_matches == -1 else no_of_matches

    # Rows are gathered as plain dicts and turned into DataFrames once at the end;
    # growing a DataFrame row by row is quadratic and DataFrame.append no longer
    # exists in pandas 2.x.
    batting_rows = []
    bowling_rows = []

    try:
        for year in years:
            print(f'collecting data for year {year}....')

            for link in tqdm(_scorecard_links(year)[:limit]):
                main_div = _fetch_soup(link).find('div', 'col-b')

                # gp__cricket__gameHeader holds match no / city / date and the team names.
                match_info = _parse_match_header(main_div.find('div', 'gp__cricket__gameHeader'))

                match_batting = []
                match_bowling = []
                # score cards: one <article> per innings
                for score_card in main_div.find_all('article', 'sub-module scorecard'):
                    batsmen_div = score_card.find('div', 'scorecard-section batsmen')
                    if batsmen_div is None:
                        # No batting section: the whole score card is skipped.
                        continue
                    match_batting.extend(_batting_records(batsmen_div, match_info))
                    match_bowling.extend(_bowling_records(score_card, match_info))

                # Commit only once the whole match parsed, so a failure half-way
                # through a page never leaves a partial match in the result.
                batting_rows.extend(match_batting)
                bowling_rows.extend(match_bowling)

    except Exception as e:
        # Best effort: report the problem and fall through to return what was collected.
        print(f'scraping stopped early, returning the data collected so far: {e!r}')

    df_bating_score_card = pd.DataFrame(batting_rows, columns=_BATTING_COLUMNS)
    df_bowling_score_card = pd.DataFrame(bowling_rows, columns=_BOWLING_COLUMNS)
    return df_bating_score_card, df_bowling_score_card

In [0]:
# Seasons to scrape; no_of_matches=-1 asks for every match of each listed season.
year_list = [2018]
scraped_frames = fetch_bating_bowling_data(year_list, no_of_matches=-1)
# The function's result is a (batting, bowling) pair of DataFrames.
df_bat, df_bowl = scraped_frames