In [47]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import requests

In [9]:
# Download the decade index page that links to one match-results page per year.
html_content = requests.get(
    "https://stats.espncricinfo.com/ci/engine/records/team/match_results_year.html?class=2;id=202;type=decade",
    timeout=30,  # never let a stalled connection hang the kernel indefinitely
)
# Fail loudly on an HTTP error instead of parsing an error page into an empty link list.
html_content.raise_for_status()
soup = BeautifulSoup(html_content.content, "html.parser")
# The per-year pages are linked through <a class="QuoteSummary"> elements;
# href=True skips any such anchor that has no target, which would otherwise raise KeyError below.
anchor = soup.find_all("a", class_="QuoteSummary", href=True)
# The hrefs are site-relative, so prefix the host to obtain absolute URLs.
yearwise_links = ["https://stats.espncricinfo.com" + a["href"] for a in anchor]
print(yearwise_links)

['https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=2;id=2020;type=year', 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=2;id=2021;type=year', 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=2;id=2022;type=year', 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=2;id=2023;type=year']


# Scrape the matches from 2020 to the present

In [44]:
def scrap_matches_info(links_list):
    """Scrape the match-result table from each page in ``links_list``.

    Every page is expected to contain a ``<table class="engineTable">`` whose
    body rows have seven cells: Team 1, Team 2, Winner, Margin, Ground,
    Match Date and Match id. Rows whose third cell is ``'no result'`` are
    dropped together with the links found in them.

    Parameters
    ----------
    links_list : iterable of str
        URLs of the result pages to download.

    Returns
    -------
    tuple (pandas.DataFrame, list of str)
        * One row per kept match with the columns ``Match`` ("<Team 1> vs
          <Team 2>") followed by the seven table columns.
        * Absolute URLs for every link in a kept row whose href contains
          the substring ``"match"``.

    Raises
    ------
    requests.RequestException
        If a page cannot be downloaded or answers with an HTTP error status.
    ValueError
        If a page has no results table, or a row does not have exactly
        seven cells.
    """
    base_url = "https://stats.espncricinfo.com"
    columns = ['Team 1', 'Team 2', 'Winner', 'Margin', 'Ground', 'Match Date', 'Match id']
    records = []
    match_links = []

    for page_url in links_list:
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find("table", class_="engineTable")
        tbody = table.find("tbody") if table is not None else None
        if tbody is None:
            raise ValueError(f"No results table found at {page_url}")

        for tr in tbody.find_all("tr"):
            cells = []
            # Links are gathered per row so that a skipped row contributes none of them.
            row_links = []
            for td in tr.find_all("td"):
                cells.append(td.text.strip())
                anchor = td.find("a")
                if anchor is not None:
                    href = anchor.get("href", "")
                    if "match" in href:
                        row_links.append(base_url + href)

            if not cells:
                # A <tr> without <td> cells carries no match data.
                continue
            if len(cells) > 2 and cells[2] == 'no result':
                continue
            if len(cells) != len(columns):
                raise ValueError(
                    f"Expected {len(columns)} cells per row at {page_url}, got {len(cells)}: {cells}"
                )

            records.append([cells[0] + " vs " + cells[1]] + cells)
            match_links.extend(row_links)

    # Build the frame once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    df = pd.DataFrame(records, columns=['Match'] + columns)
    return df, match_links

In [54]:
# Scrape every yearly results page: matches_df receives the match table and
# match_links the list of collected per-match URLs.
matches_df,match_links = scrap_matches_info(yearwise_links)

In [63]:
# Persist the match table; index=False keeps the DataFrame index out of the CSV.
matches_df.to_csv("matches_info.csv",index=False)

In [69]:
# Display the last collected match URL as a quick sanity check.
match_links[-1]

'https://stats.espncricinfo.com/ci/engine/match/1358081.html'

# Now scrape each match's player performance

In [67]:
def _table_body_rows(table):
    """Return the stripped text of every cell in the table's <tbody>, one list per <tr>."""
    return [
        [td.text.strip() for td in tr.find_all("td")]
        for tr in table.find("tbody").find_all("tr")
    ]


def scrap_stats_of_matches(match_url):
    """Scrape the batting and bowling scorecards of one match page.

    The function is best-effort: if the page cannot be downloaded or its
    layout is not the expected one, a message is printed and whatever was
    parsed up to that point is returned (possibly two empty frames).

    Parameters
    ----------
    match_url : str
        URL of the match scorecard page.

    Returns
    -------
    tuple (pandas.DataFrame, pandas.DataFrame)
        * Batting frame with the columns ``teamInnings``, ``Batting Position``,
          the eight batting columns and ``Match id``.
        * Bowling frame with the columns ``match``, ``teamInnings`` (the
          bowling side), the eleven bowling columns and ``Match Id``.
    """
    batting_columns = ['Batsman Name','Dismissal','Runs','Balls','M','4s','6s','Strike Rate']
    bowling_columns = ['Bowler Name','Over','Maiden','Runs','Wickets','Economy','dots','fours','sixes','wides','no balls']
    batting_frames = []
    bowling_frames = []

    try:
        html_content = requests.get(match_url, timeout=30)
        html_content.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable match should not abort a scrape over many matches.
        print(f"Skipping {match_url}: request failed ({exc})")
        return pd.DataFrame(), pd.DataFrame()

    soup = BeautifulSoup(html_content.content,"html.parser")
    try:
        sections = soup.find_all("div",class_ = "ds-p-0")
        # Sections 1 and 2 hold the two innings; section 3 holds the match details.
        batting_and_bowling = sections[1:3]
        match_number_div = sections[3]
        # The match number is the last space-separated token of the last matching span.
        match_number = match_number_div.find_all("span",class_ = "ds-text-tight-s ds-font-medium ds-text-typo ds-underline ds-decoration-ui-stroke hover:ds-text-typo-primary hover:ds-decoration-ui-stroke-primary ds-block")[-1].text.split(" ")[-1]
        inningsTeam = soup.find_all("span",class_ = "ds-text-title-xs ds-font-bold ds-capitalize")
        team1 = inningsTeam[0].text
        team2 = inningsTeam[1].text
        # Space added after "Vs": the original produced names such as "India VsAustralia".
        match_name = team1 + " Vs " + team2
        print(f"Processing match: {match_name} with match id: {match_number}")

        for innings_index, div in enumerate(batting_and_bowling):
            batting_team = inningsTeam[innings_index].text
            tables = div.find_all("table")

            # Batting scorecard: keep rows that have at least one non-empty cell.
            batting_data = [cells for cells in _table_body_rows(tables[0]) if any(cells)]
            innings_batting_df = pd.DataFrame(batting_data, columns=batting_columns)
            # Drop the summary rows that are not batsmen; na=False treats a missing name as "no match".
            is_summary_row = innings_batting_df['Batsman Name'].str.contains(
                'Extras|TOTAL|Did not bat|Fall of wickets', na=False
            )
            innings_batting_df = innings_batting_df[~is_summary_row]
            innings_batting_df.insert(0, "teamInnings", batting_team)
            innings_batting_df.insert(1, "Batting Position", range(1, 1+len(innings_batting_df)))
            # Append as the last column (was a hard-coded position 10).
            innings_batting_df.insert(len(innings_batting_df.columns), "Match id", match_number)
            batting_frames.append(innings_batting_df)

            # Bowling scorecard: skip single-cell rows and rows without any cell,
            # which would otherwise become blank bowler records.
            bowling_data = [cells for cells in _table_body_rows(tables[1]) if len(cells) > 1]
            innings_bowling_df = pd.DataFrame(bowling_data, columns=bowling_columns)
            # The bowling side is the team that is not batting in this innings.
            bowling_team = team2 if batting_team == team1 else team1
            innings_bowling_df.insert(0, "match", match_name)
            innings_bowling_df.insert(1, "teamInnings", bowling_team)
            # Append as the last column (was a hard-coded position 13).
            innings_bowling_df.insert(len(innings_bowling_df.columns), "Match Id", match_number)
            bowling_frames.append(innings_bowling_df)
    except Exception as exc:
        # Still best-effort, but report the failure instead of hiding it behind a bare except.
        print(f"Could not fully parse scorecard at {match_url}: {exc!r}")

    # pd.concat raises on an empty list, so fall back to an empty frame.
    batting_df = pd.concat(batting_frames, ignore_index=True) if batting_frames else pd.DataFrame()
    bowling_df = pd.concat(bowling_frames, ignore_index=True) if bowling_frames else pd.DataFrame()
    return batting_df,bowling_df

In [72]:
# Save the ODI match scorecards.
# Collect the per-match frames in lists and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame inside a loop is quadratic.
batting_frames = []
bowling_frames = []

# Loop through the match URLs
for match_url in match_links:
    batting_df, bowling_df = scrap_stats_of_matches(match_url)
    # Matches that could not be scraped yield empty frames; leave them out.
    if not batting_df.empty:
        batting_frames.append(batting_df)
    if not bowling_df.empty:
        bowling_frames.append(bowling_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
combined_batting_df = pd.concat(batting_frames) if batting_frames else pd.DataFrame()
combined_bowling_df = pd.concat(bowling_frames) if bowling_frames else pd.DataFrame()

# Write the combined batting and bowling dataframes to CSV files
combined_batting_df.to_csv('batting_data.csv', index=False)
combined_bowling_df.to_csv('bowling_data.csv', index=False)

Processing match: United Arab Emirates VsOman with match id: 4224
Processing match: United Arab Emirates VsNamibia with match id: 4225
Processing match: Ireland VsWest Indies with match id: 4226
Processing match: Namibia VsOman with match id: 4227
Processing match: Namibia VsUnited Arab Emirates with match id: 4228
Processing match: Ireland VsWest Indies with match id: 4229
Processing match: Ireland VsWest Indies with match id: 4230
Processing match: India VsAustralia with match id: 4231
Processing match: India VsAustralia with match id: 4232
Processing match: Australia VsIndia with match id: 4233
Processing match: England VsSouth Africa with match id: 4234
Processing match: India VsNew Zealand with match id: 4235
Processing match: Oman VsNepal with match id: 4236
Processing match: United States of America VsOman with match id: 4237
Processing match: New Zealand VsIndia with match id: 4239
Processing match: Nepal VsUnited States of America with match id: 4240
Processing match: Nepal Vs

Processing match: United Arab Emirates VsNamibia with match id: 4365
Processing match: Oman VsNamibia with match id: 4366
Processing match: Papua New Guinea VsUnited Arab Emirates with match id: 4367
Processing match: Papua New Guinea VsNepal with match id: 4368
Processing match: United Arab Emirates VsNepal with match id: 4369
Processing match: Bangladesh VsSouth Africa with match id: 4370
Processing match: United Arab Emirates VsPapua New Guinea with match id: 4371
Processing match: Bangladesh VsSouth Africa with match id: 4372
Processing match: United Arab Emirates VsNepal with match id: 4373
Processing match: Papua New Guinea VsNepal with match id: 4374
Processing match: South Africa VsBangladesh with match id: 4375
Processing match: Papua New Guinea VsNepal with match id: 4376
Processing match: Nepal VsPapua New Guinea with match id: 4377
Processing match: Netherlands VsNew Zealand with match id: 4378
Processing match: Australia VsPakistan with match id: 4379
Processing match: Aus

Processing match: New Zealand VsPakistan with match id: 4500
Processing match: India VsSri Lanka with match id: 4501
Processing match: New Zealand VsPakistan with match id: 4502
Processing match: Sri Lanka VsIndia with match id: 4503
Processing match: Pakistan VsNew Zealand with match id: 4504
Processing match: India VsSri Lanka with match id: 4505
Processing match: Ireland VsZimbabwe with match id: 4506
Processing match: India VsNew Zealand with match id: 4507
Processing match: Ireland VsZimbabwe with match id: 4508
Processing match: New Zealand VsIndia with match id: 4509
Processing match: India VsNew Zealand with match id: 4511
Processing match: South Africa VsEngland with match id: 4512
Processing match: England VsSouth Africa with match id: 4513
Processing match: England VsSouth Africa with match id: 4514
Processing match: Namibia VsNepal with match id: 4515
Processing match: Namibia VsScotland with match id: 4516
Processing match: Scotland VsNepal with match id: 4517
Processing m