### Notebook use:

* This notebook is used to scrape batting and bowling data from http://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=2019;trophy=117;type=season

* The above link contains a table of match-level data and, in its last column, a link to the scorecard data, i.e. `<a>T20</a>`. This link redirects to the batting and bowling data page for a particular match, e.g. https://www.espncricinfo.com/series/8048/scorecard/1175356/chennai-super-kings-vs-royal-challengers-bangalore-1st-match-indian-premier-league-2019

### Approach : 

* First we collect all the scorecard links present on the base URL, i.e. the `href` of every `<a>T20</a>`.
* We then use the function parameters `years` and `no_of_matches` to select the year and the number of matches to scrape.

* The function returns two DataFrames: one contains the batting data for the n (function parameter) selected matches, and the other contains the bowling data for the same n matches.

In [0]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import sys
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
from tqdm import tqdm_notebook as tqdm
import traceback

In [0]:
# Column order of the two result tables returned by fetch_bating_bowling_data.
_BATTING_COLUMNS = ['match_no', 'match_city', 'year', 'month', 'day', 'team_1', 'team_2',
                    'batsmen', 'wicket_status', 'R', 'B', 'M', 'fours', 'sixes', 'SR']
_BOWLING_COLUMNS = ['match_no', 'match_city', 'year', 'month', 'day', 'team_1', 'team_2',
                    'bowler', 'O', 'M', 'R', 'W', 'ECON', 'zeros', 'fours', 'sixes', 'WD', 'NB']


def _fetch_soup(url, timeout):
    """Download `url` and return it parsed with the lxml parser."""
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of with an obscure parsing error later.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _collect_scorecard_links(series_url, link_filter, timeout):
    """Return the scorecard URLs found on the series page, de-duplicated, in page order."""
    panel = _fetch_soup(series_url, timeout).find('div', 'news-pannel')
    if panel is None:
        raise ValueError("no <div class='news-pannel'> found on " + series_url)

    hrefs = (str(anchor.get('href', '')) for anchor in panel.find_all('a', 'potMatchMenuLink'))
    # dict.fromkeys drops repeated URLs while keeping first-seen order, so the
    # same match is never scraped (and stored) twice.
    return list(dict.fromkeys(href for href in hrefs if link_filter in href))


def _parse_match_header(header_div):
    """Extract the per-match attributes shared by every batting/bowling row."""
    overview = header_div.find('div', 'cscore_info-overview').text.strip()
    parts = overview.split(',')

    match_no = parts[0].split('s')[0]

    venue_part = parts[1]
    if ' at ' in venue_part:
        # Split on the word " at " only once: splitting on the bare substring
        # 'at' cuts city names that contain it ("Kolkata" -> " Kolk").
        match_city = venue_part.split(' at ', 1)[1].strip()
    else:
        # Fall back to the original substring split when " at " is absent.
        match_city = venue_part.split('at')[1].strip()

    # Indices 1..3 are used; index 0 is skipped exactly as in the original code.
    date_parts = parts[2].split(' ')

    teams = header_div.find_all('span', 'cscore_name cscore_name--long')
    return {
        'match_no': match_no,
        'match_city': match_city,
        'year': date_parts[3],
        'month': date_parts[1],
        'day': date_parts[2],
        'team_1': teams[0].text.strip(),
        'team_2': teams[1].text.strip(),
    }


def _parse_batting_rows(batsmen_div, match_info):
    """Build one record per 'wrap batsmen' row of a batting section."""
    stat_keys = ['R', 'B', 'M', 'fours', 'sixes', 'SR']
    records = []
    for wrap in batsmen_div.find_all('div', 'wrap batsmen'):
        first_node = wrap.find_all('div', 'cell commentary')[0].contents[0]
        try:
            wicket_status = first_node.text.strip()
        except AttributeError:
            # The first child may be a plain text node without a `.text` attribute.
            wicket_status = first_node.strip()

        stats = [cell.text.strip() for cell in wrap.find_all('div', 'cell runs')]
        if len(stats) == 5:
            # Five cells: leave the 'M' column (index 2) empty so the
            # remaining values line up with fours / sixes / SR.
            stats.insert(2, '')
        # Pad or trim to exactly the six expected values.
        stats = (stats + [''] * len(stat_keys))[:len(stat_keys)]

        record = dict(match_info)
        record['batsmen'] = wrap.find('div', 'cell batsmen').text.strip()
        record['wicket_status'] = wicket_status
        record.update(zip(stat_keys, stats))
        records.append(record)
    return records


def _parse_bowling_rows(score_card, match_info):
    """Build one record per data row of every bowling table in a scorecard."""
    # Index of the <td> holding each column; cell 1 is skipped, as in the original code.
    cell_index = {'bowler': 0, 'O': 2, 'M': 3, 'R': 4, 'W': 5, 'ECON': 6, 'zeros': 7,
                  'fours': 8, 'sixes': 9, 'WD': 10, 'NB': 11}
    records = []
    for bowling_div in score_card.find_all('div', 'scorecard-section bowling'):
        for table_row in bowling_div.find('table').find_all('tr'):
            cells = [cell.text.strip() for cell in table_row.find_all('td')]
            if not cells:
                continue  # rows without any <td> (e.g. a header row) carry no data
            record = dict(match_info)
            record.update((column, cells[i]) for column, i in cell_index.items())
            records.append(record)
    return records


def _scrape_match(link, timeout):
    """Scrape one scorecard page and return (batting_records, bowling_records)."""
    main_div = _fetch_soup(link, timeout).find('div', 'col-b')
    if main_div is None:
        raise ValueError("no <div class='col-b'> found on " + link)

    # The game header holds match number, city, date and the two team names.
    match_info = _parse_match_header(main_div.find('div', 'gp__cricket__gameHeader'))

    batting, bowling = [], []
    for score_card in main_div.find_all('article', 'sub-module scorecard'):
        batsmen_div = score_card.find('div', 'scorecard-section batsmen')
        if batsmen_div is None:
            # A card without a batting section is skipped entirely (bowling too),
            # which is what the original implementation did.
            continue
        batting.extend(_parse_batting_rows(batsmen_div, match_info))
        bowling.extend(_parse_bowling_rows(score_card, match_info))
    return batting, bowling


def fetch_bating_bowling_data(years, no_of_matches=1,
                              series_url="http://www.espncricinfo.com/c/engine/series/313494.html",
                              scorecard_link_filter="https://www.espncricinfo.com/series/8048",
                              timeout=30):
    """
    Scrape batting and bowling scorecard data from the web.

    Parameters
    ----------
    years : list
        Accepted for backward compatibility. It is NOT used to pick the pages:
        the series that gets scraped is determined by `series_url`.
    no_of_matches : int
        Maximum number of matches to scrape. A negative value (e.g. -1)
        scrapes every match found; 0 returns empty tables without any
        network access.
    series_url : str
        Page that lists the scorecard links of one series.
    scorecard_link_filter : str
        Substring a link's href must contain to be treated as a scorecard link.
    timeout : int or float
        Timeout in seconds applied to every HTTP request.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        Batting rows and bowling rows. A tuple is always returned: if the
        series page or a match page cannot be fetched or parsed, the traceback
        is printed and that page is skipped, so the result may be partial or empty.
    """
    batting_records = []
    bowling_records = []

    if no_of_matches != 0:
        try:
            links = _collect_scorecard_links(series_url, scorecard_link_filter, timeout)
        except Exception:
            traceback.print_exc()
            links = []

        if no_of_matches is not None and no_of_matches >= 0:
            links = links[:no_of_matches]

        for link in tqdm(links):
            try:
                match_batting, match_bowling = _scrape_match(link, timeout)
            except Exception:
                # Best effort: report the failing page and keep the matches
                # that were already scraped instead of discarding everything.
                traceback.print_exc()
                continue
            # Records are added only after the whole match parsed, so a failing
            # page never leaves half a match in the result.
            batting_records.extend(match_batting)
            bowling_records.extend(match_bowling)

    # Build each frame once from the collected records; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.x.
    df_bating_score_card = pd.DataFrame(batting_records, columns=_BATTING_COLUMNS)
    df_bowling_score_card = pd.DataFrame(bowling_records, columns=_BOWLING_COLUMNS)
    return df_bating_score_card, df_bowling_score_card

In [0]:
# Scrape the scorecards; per the function's docstring, no_of_matches=-1 asks
# for every match that is found.
df_bat_2008, df_bowl_2008 = fetch_bating_bowling_data(years=[2008], no_of_matches=-1)

HBox(children=(IntProgress(value=0, max=118), HTML(value='')))




In [0]:
# (rows, columns) of the scraped batting and bowling tables
(df_bat_2008.shape, df_bowl_2008.shape)

((895, 15), (685, 18))

In [0]:
## Merge this data to other FullData

# Load the previously cleaned batting and bowling tables that the scraped
# rows will be appended to.
full_batsmen_data, full_bowler_data = (
    pd.read_csv(cleaned_csv)
    for cleaned_csv in (
        'E:/Google_Drive_Contents/data/Batsmen_data_cleaned.csv',
        'E:/Google_Drive_Contents/data/Bowler_data_cleaned.csv',
    )
)


In [0]:
# First two rows of the previously cleaned batting data
full_batsmen_data.head(n=2)

Unnamed: 0,match_no,match_city,year,month,day,team_1,team_2,batsmen,wicket_status,R,B,M,fours,sixes,SR,special_role
0,1,Cape Town,2009,April,18,Mumbai Indians,Chennai Super Kings,ST Jayasuriya,c Hayden b Thushara,26,20.0,,5.0,0.0,130.0,normal player
1,1,Cape Town,2009,April,18,Mumbai Indians,Chennai Super Kings,SR Tendulkar (c),not out,59,49.0,,7.0,0.0,120.4,captain


In [0]:
# First two rows of the freshly scraped batting data, for comparison with the cleaned table
df_bat_2008.head(n=2)

Unnamed: 0,match_no,match_city,year,month,day,team_1,team_2,batsmen,wicket_status,R,B,M,fours,sixes,SR
0,1,Bengaluru,2008,Apr,18,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly (c),c Kallis b Khan,10,12,,2,0,83.33
1,1,Bengaluru,2008,Apr,18,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,not out,158,73,,10,13,216.43


In [0]:
# The cleaned batting table has a 'special_role' column that the scraped one
# lacks; fill it with a placeholder value so both tables share the same columns.
df_bat_2008.loc[:, 'special_role'] = 'temp_data'

In [0]:
# First two rows of the previously cleaned bowling data
full_bowler_data.head(n=2)

Unnamed: 0,match_no,match_city,year,month,day,team_1,team_2,bowler,O,M,R,W,ECON,zeros,fours,sixes,WD,NB
0,1,Cape Town,2009,April,18,Mumbai Indians,Mumbai Indians,MS Gony,4.0,0,32,2,8.0,7,4,0,1,0
1,1,Cape Town,2009,April,18,Mumbai Indians,Mumbai Indians,T Thushara,4.0,0,32,1,8.0,9,5,0,0,0


In [0]:
# First two rows of the freshly scraped bowling data, for comparison with the cleaned table
df_bowl_2008.head(n=2)

Unnamed: 0,match_no,match_city,year,month,day,team_1,team_2,bowler,O,M,R,W,ECON,zeros,fours,sixes,WD,NB
0,1,Bengaluru,2008,Apr,18,Kolkata Knight Riders,Royal Challengers Bangalore,P Kumar,4,0,38,0,9.5,13,3,3,1,0
1,1,Bengaluru,2008,Apr,18,Kolkata Knight Riders,Royal Challengers Bangalore,Z Khan,4,0,38,1,9.5,8,3,2,0,0


In [0]:
# Stack the scraped rows underneath the previously cleaned data and renumber
# the index. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
# NOTE: this cell reassigns its own inputs, so running it a second time adds
# the scraped rows again.
full_batsmen_data = pd.concat([full_batsmen_data, df_bat_2008], ignore_index=True)
full_bowler_data = pd.concat([full_bowler_data, df_bowl_2008], ignore_index=True)

In [0]:
# Write the merged tables to disk; index=False keeps the row index out of the CSV files.
full_batsmen_data.to_csv(
    path_or_buf='E:/Google_Drive_Contents/data/Batsmen_scorecard_data.csv',
    index=False,
)
full_bowler_data.to_csv(
    path_or_buf='E:/Google_Drive_Contents/data/Bowler_scorecard_data.csv',
    index=False,
)