In [0]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
from tqdm import tqdm_notebook as tqdm
import traceback

import json

In [0]:
def fetch_player_data(years):
    """ 
    This function will be used to fetch player performance data for T20 matches
    parameters :
    year : years of ipl(will be list)
    """
    universal_link_set = set()

    player_data_Json_list = [] # store all the data of the player

    try:


        # loop for all the years
        for year in years:
            # build the URL
            url = "http://www.espncricinfo.com/c/engine/series/418064.html"

            print(f'Collecting player data for year {year}....')

            # First we have to parse the batting-bowling data page link from the starter page i.e url

            # Approach : i will first collect all the links present in that table
            #            then filter the links which will lead us to players info data page.
            # hurdles : in the table a particular row contains 5 links all with same class name.

            source = requests.get(url).text

            soup = BeautifulSoup(source, 'lxml') 

            main_div = soup.find('div','news-pannel')

            link_all = main_div.find_all('a','potMatchMenuLink')
            useful_links = []
            # link_base = "http://stats.espncricinfo.com"

            for link in link_all:
                href = str(link['href'])
                if "https://www.espncricinfo.com/series/8048" in href:
                    useful_links.append(href)


            
            

            

            for link in tqdm(useful_links):

                source = requests.get(link).text

                soup = BeautifulSoup(source, 'lxml')

                main_div = soup.find('div','col-b')

                ###############################################################################################

                ############# Now we will Scrape ScoreCards i.e bating and bowling performance for each team #######################


                score_cards = main_div.find_all('article','sub-module scorecard')

                # above score_cards contain 2 score_card i.e. each team

                batsmen_links = []
                bowler_links = []

                for sc in score_cards:
                  batsmen_div = sc.find('div','scorecard-section batsmen')
                  bowler_div =  sc.find('div','scorecard-section bowling')

                  if batsmen_div is None:
                      continue
                  if bowler_div is None:
                      continue

                  """
                  we will collecte all the links that will lead to batsmen and bowler from the batasmen and bowler scorecard div.
                  Then we will scrape all the data needed for player analysis for their T20 performance
                  """
                  
                  b_links = batsmen_div.find_all('a')
                  bow_links = bowler_div.find_all('a')

                  if b_links is None:
                      continue
                  if bow_links is None:
                      continue

                  for b in b_links:
                    try:
                     if 'html' in b['href'] and b['href'] not in b_links:
                       batsmen_links.append(b['href'])
                    except:
                        continue
                   
                  for bo in bow_links:
                    try:
                     if 'html' in bo['href'] and bo['href'] not in bow_links:
                       bowler_links.append(bo['href'])
                    except:
                        continue
                 
                

                # Now that we have all the links avaliable to us for both batsmen and bowler
                # let's scrape their information and put it dataframes

                ## ** Scraping batsmen data from batsmen's link list
                full_list = batsmen_links + bowler_links

                

                for link in full_list:
                    if link not in universal_link_set:

                        source = requests.get(link).text
                        soup = BeautifulSoup(source, 'lxml')

                        main_div = soup.find('div','pnl490M')

                        if main_div is None:
                            continue

                        temp_div = main_div.find_all('div')[2]

                        if temp_div is None:
                            continue

                        player_info_div = temp_div.find_all('div')[0]

                        if player_info_div is None:
                            continue

                        player_info_p = player_info_div.find_all('p','ciPlayerinformationtxt')

                        if player_info_p is None:
                            continue

                        player_obj = dict()

                        for player in player_info_p:
                            # print(player.find('b').text,player.find('span').text)
                            player_obj[player.find('b').text.strip()] = player.find('span').text.strip()
                        # print(player_obj)

                        player_data_Json_list.append(player_obj)

                        # print(player_data_Json_list)
                        universal_link_set.add(link)



        return player_data_Json_list

   

    except Exception:
       traceback.print_exc()

In [8]:
# Scrape player information for the 2010 season. fetch_player_data issues live
# HTTP requests, so this cell needs network access.
json_data = fetch_player_data([2010])

Collecting player data for year 2010....


HBox(children=(IntProgress(value=0, max=120), HTML(value='')))




In [9]:
# Sanity check: how many player records the scrape returned.
len(json_data)

181

In [14]:
# Output location for the scraped 2010 player records.
Data_path = "/content/drive/My Drive/data/"
full_path = f"{Data_path}player_infomation_2010.json"

print(full_path)

/content/drive/My Drive/data/player_infomation_2010.json


In [0]:
# Serialise the scraped player records to the JSON file at full_path.
with open(full_path, mode='w') as json_file:
    json.dump(obj=json_data, fp=json_file)