In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import sys
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')
import traceback

In [None]:
def fetch_toss_data(years):
    
    Base_URL = ["http://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=",";trophy=117;type=season"]
    if isinstance(years,list) == False or len(years) == 0:
        raise ValueError('Etiher the year is not passed in a list or the list is empty')
    
    # let's create dataFrames to store data
    columns = ['match_no','team 1','team 2','toss_info']

    df_toss = pd.DataFrame(columns = columns)

    try:


        # loop for all the years
        for year in years:
            # build the URL
            url = "http://www.espncricinfo.com/c/engine/series/313494.html"

            print(f'Collecting player data for year {year}....')

            source = requests.get(url).text

            soup = BeautifulSoup(source, 'lxml') 

            main_div = soup.find('div','news-pannel')

            link_all = main_div.find_all('a','potMatchMenuLink')
            useful_links = []
            # link_base = "http://stats.espncricinfo.com"

            for link in link_all:
                href = str(link['href'])
                if "https://www.espncricinfo.com/series/8048" in href:
                    useful_links.append(href)
                 
            for link in tqdm(useful_links):

                source = requests.get(link).text

                soup = BeautifulSoup(source, 'lxml')

                main_div = soup.find('div','col-b')

                # we will fetch step by step all the data

                # gp__cricket__gameHeader : it contains the following information ->
                # 1. match no, match city, match date
                # 2. team names, their scores,
                # 3. player of the match with team
                # 4. a small match summary : Super Kings won by 7 wickets (with 14 balls remaining)
                
                div_name = "gp__cricket__gameHeader"
                #################### scrape div_name = "gp__cricket__gameHeader" ####################
                div_gp__cricket__gameHeader_data = main_div.find('div',div_name)

                # print(div_gp__cricket__gameHeader_data.prettify())

                match_first_glance_info = div_gp__cricket__gameHeader_data.find('div','cscore_info-overview').text.strip()
                match_first_glance_info_parts = match_first_glance_info.split(',')
                
                match_no = match_first_glance_info_parts[0].split('s')[0] # attribute

                match_city = match_first_glance_info_parts[1].split('at')[1] # attribute

                date_parts = match_first_glance_info_parts[2].split(' ')

                month = date_parts[1] # attribute
                day = date_parts[2] # attribute
                year = date_parts[3] # attribute

                # print(month,day,year)

                # print(div_gp__cricket__gameHeader_data.prettify())

                # mom_details = div_gp__cricket__gameHeader_data.find('a','gp__cricket__player-match__player__detail__link').contents
                
                # mom_player_name = mom_details[0].strip() # attribute
                # mom_team_name = mom_details[1].text.strip() # attribute

                teams = div_gp__cricket__gameHeader_data.find_all('span','cscore_name cscore_name--long')
                # print(teams)
                team_1 = teams[0].text.strip() # attribute
                team_2 = teams[1].text.strip() # attribute
                # print(team_2)

                div_match_detail = main_div.find('div','match-detail-container')

                if div_match_detail is None:
                    continue

                # print(div_match_detail.prettify())

                div_left = div_match_detail.find_all('div','match-detail--left')[1]
                div_right = div_match_detail.find_all('div','match-detail--right')[1]

                toss_info = div_right.find('span').text.strip()
                # print(toss_info)
                # toss_winner_team = toss_info.split(',')[0]
                # toss_decision = toss_info.split(',')[1]

                # # columns = ['match_no','team 1','team 2','toss_winner','toss_decision']
                df_toss = df_toss.append({
                    'match_no':match_no,
                    'team 1':team_1,
                    'team 2':team_2,
                    'toss_info':toss_info
                },ignore_index = True)




                # print(df_toss)

                # return None

            
        return df_toss
    except Exception:
        traceback.print_exc()

In [3]:
df_toss = fetch_toss_data([2008])

Collecting player data for year 2008....


HBox(children=(IntProgress(value=0, max=118), HTML(value='')))




In [4]:
df_toss

Unnamed: 0,match_no,team 1,team 2,toss_info
0,1,Kolkata Knight Riders,Royal Challengers Bangalore,"Royal Challengers Bangalore , elected to field..."
1,2nd match (D/N),Chennai Super Kings,Kings XI Punjab,"Chennai Super Kings , elected to bat first"
2,3rd match (N),Rajasthan Royals,Delhi Daredevils,"Rajasthan Royals , elected to bat first"
3,4th match (D/N),Deccan Chargers,Kolkata Knight Riders,"Deccan Chargers , elected to bat first"
4,5th match (N),Mumbai Indians,Royal Challengers Bangalore,"Mumbai Indians , elected to bat first"
5,6th match (N),Kings XI Punjab,Rajasthan Royals,"Kings XI Punjab , elected to bat first"
6,7th match (N),Deccan Chargers,Delhi Daredevils,"Deccan Chargers , elected to bat first"
7,8th match (N),Chennai Super Kings,Mumbai Indians,"Mumbai Indians , elected to field first"
8,9th match (N),Deccan Chargers,Rajasthan Royals,"Rajasthan Royals , elected to field first"
9,10th match (N),Kings XI Punjab,Mumbai Indians,"Mumbai Indians , elected to field first"


In [None]:
df_toss.to_csv('/content/drive/My Drive/data/toss_data_2008.csv',index=False)