In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
warnings.filterwarnings("ignore")

# Extract the matches data that happened between 2010-2019

In [3]:
html_text = requests.get("https://stats.espncricinfo.com/ci/engine/records/index.html?id=201;type=decade")
soup = BeautifulSoup(html_text.content,"html.parser")
table = soup.find("table",class_ = "recordsTable")
ul = table.find_all("ul")
a = ul[1].find_all("a")
matches_url = ["https://stats.espncricinfo.com" + anchor['href'] for anchor in a]
print(matches_url)

['https://stats.espncricinfo.com/ci/engine/records/team/match_results_year.html?class=1;id=201;type=decade', 'https://stats.espncricinfo.com/ci/engine/records/team/match_results_year.html?class=2;id=201;type=decade', 'https://stats.espncricinfo.com/ci/engine/records/team/match_results_year.html?class=3;id=201;type=decade']


In [4]:
test_url = matches_url[0]
odi_url = matches_url[1]
t20_url = matches_url[2]

In [5]:
def get_links_by_year(url):
    text_content = requests.get(url)
    soup = BeautifulSoup(text_content.content,"html.parser")
    a = soup.find_all("a",class_ = "QuoteSummary")
    links = ["https://stats.espncricinfo.com" + anchor['href'] for anchor in a]
    return links

In [6]:
test_year_links = get_links_by_year(test_url)
odi_year_links = get_links_by_year(odi_url)
t20_year_links = get_links_by_year(t20_url)

In [14]:
test_year_links[0]

'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=1;id=2010;type=year'

In [7]:
def scrap_matches_by_formats(links_list):
    df = pd.DataFrame()
    match_links = []
    for link in links_list:
        html_text = requests.get(link)
        soup = BeautifulSoup(html_text.content,"html.parser")
        table = soup.find("table",class_ = "engineTable")
        columns = []
        for th in table.find('thead').find_all('th'):
            columns.append(th.text.strip())
        data = []
        for tr in table.find('tbody').find_all('tr'):
            row = []
            for td in tr.find_all('td'):
                row.append(td.text.strip())
                if td.find('a'):
                    link = td.find('a')['href']
                    if 'match' in link:
                        match_links.append("https://stats.espncricinfo.com" + link)
            if row[2] == 'no result': 
                if match_links:
                    match_links.pop() 
                continue 
            data.append(row)
        columns = ['Team 1','Team 2','Winner','Margin','Ground','Match Date','Match id']   
        for row in data:
            match = row[0] + " vs " + row[1]
            row.insert(0, match)
            df = df.append(pd.Series(row, index=['Match'] + columns), ignore_index=True)
    return df, match_links

In [8]:
test_matches_df,test_matches_link = scrap_matches_by_formats(test_year_links)
odi_matches_df,odi_matches_link = scrap_matches_by_formats(odi_year_links)
t20_matches_df,t20_matches_link = scrap_matches_by_formats(t20_year_links)

In [17]:
t20_matches_df.head()

Unnamed: 0,Match,Team 1,Team 2,Winner,Margin,Ground,Match Date,Match id
0,Afghanistan vs Ireland,Afghanistan,Ireland,Ireland,5 wickets,Colombo (PSS),"Feb 1, 2010",T20I # 128
1,Kenya vs Scotland,Kenya,Scotland,Kenya,10 wickets,Nairobi (Gym),"Feb 1, 2010",T20I # 129
2,Canada vs Ireland,Canada,Ireland,Canada,4 runs,Colombo (SSC),"Feb 3, 2010",T20I # 130
3,New Zealand vs Bangladesh,New Zealand,Bangladesh,New Zealand,10 wickets,Hamilton,"Feb 3, 2010",T20I # 131
4,Afghanistan vs Canada,Afghanistan,Canada,Afghanistan,5 wickets,Colombo (SSC),"Feb 4, 2010",T20I # 132


In [19]:
len(t20_matches_link)

875

In [11]:
def save_to_csv(csvfile,df):
    df.to_csv(csvfile,index=False)

In [12]:
save_to_csv("odi_matches_info.csv",odi_matches_df)
save_to_csv("test_matches_info.csv",test_matches_df)
save_to_csv("t20_matches_info.csv",t20_matches_df)


# Extract the batting and bowling stats of every match

In [9]:
def odi_and_t20_matches_stats(match_url):
    batting_df = pd.DataFrame()
    bowling_df = pd.DataFrame()
    
    html_content = requests.get(match_url)
    soup = BeautifulSoup(html_content.content,"html.parser")
    try:
        batting_and_bowling = soup.find_all("div",class_ = "ds-p-0")[1:3]
        match_number_div = soup.find_all("div",class_ = "ds-p-0")[3]
        match_number = match_number_div.find_all("span",class_ = "ds-text-tight-s ds-font-medium ds-text-typo ds-underline ds-decoration-ui-stroke hover:ds-text-typo-primary hover:ds-decoration-ui-stroke-primary ds-block")[-1].text.split(" ")[-1]
    #     print(match_number)
        innings_batting_data = []
        innings_bowling_data = []
        inningsTeam = soup.find_all("span",class_ = "ds-text-title-xs ds-font-bold ds-capitalize")
        team1 = inningsTeam[0].text
        team2 = inningsTeam[1].text
        match_name = team1 + " Vs"  + team2
        print(f"Processing match: {match_name} with match id: {match_number}")
        for div in batting_and_bowling:
            innings_team_name = inningsTeam.pop(0).text
            table = div.find_all("table")
            innings_batting = table[0]
            batting_columns = []
            for th in innings_batting.find("thead").find_all('th'):
                batting_columns.append(th.text.strip())
            batting_data = []
            for tr in innings_batting.find("tbody").find_all('tr'):
                batting_row = []
                for td in tr.find_all("td"):
                    batting_row.append(td.text.strip())
                if any(batting_row):
                    batting_data.append(batting_row) 
            batting_columns = ['Batsman Name','Dismissal','Runs','Balls','M','4s','6s','Strike Rate']
            innings_batting_df = pd.DataFrame(batting_data, columns=batting_columns)
            innings_batting_df = innings_batting_df[~innings_batting_df['Batsman Name'].str.contains('Extras|TOTAL|Did not bat|Fall of wickets')]
            innings_batting_df.insert(0,"Match Name",match_name)
            innings_batting_df.insert(1, "teamInnings", innings_team_name)
            innings_batting_df.insert(2, "Batting Position", range(1, 1+len(innings_batting_df)))
            innings_batting_df.insert(10,"Match id",match_number)
            batting_df = batting_df.append(innings_batting_df, ignore_index=True)

            #bowling scorecard
            innings_bowling = table[1]
            bowling_columns = []
            for th in innings_bowling.find("thead").find_all('th'):
                bowling_columns.append(th.text.strip())
            bowling_data = []
            for tr in innings_bowling.find("tbody").find_all('tr'):
                bowling_row = []
                for td in tr.find_all("td"):
                    bowling_row.append(td.text.strip())
                if len(bowling_row)!=1:
                    bowling_data.append(bowling_row) 
            bowling_columns = ['Bowler Name','Over','Maiden','Runs','Wickets','Economy','dots','fours','sixes','wides','no balls']
            innings_bowling_df = pd.DataFrame(bowling_data, columns=bowling_columns)
            if innings_team_name == team1:
                innings_team_name = team2
            else:
                innings_team_name = team1
            innings_bowling_df.insert(0, "match", match_name)
            innings_bowling_df.insert(1, "teamInnings", innings_team_name)
            innings_bowling_df.insert(13,"Match Id",match_number)
            bowling_df = bowling_df.append(innings_bowling_df, ignore_index=True)
    except:
        pass
        
    return batting_df,bowling_df

In [19]:
odi_matches_link[400]

'https://stats.espncricinfo.com/ci/engine/match/567371.html'

# save the batting and bowling data in two seperate files

In [25]:
#saving odi matches scorecard
combined_odi_batting_df = pd.DataFrame()
combined_odi_bowling_df = pd.DataFrame()

# Loop through the match URLs
for match_url in odi_matches_link:
    batting_df, bowling_df = odi_and_t20_matches_stats(match_url)
    combined_odi_batting_df = combined_odi_batting_df.append(batting_df)
    combined_odi_bowling_df = combined_odi_bowling_df.append(bowling_df)

# Write the combined batting and bowling dataframes to CSV files
combined_odi_batting_df.to_csv('odi_batting_data.csv', index=False)
combined_odi_bowling_df.to_csv('odi_bowling_data.csv', index=False)

Processing match: Bangladesh VsSri Lanka with match id: 2937


KeyboardInterrupt: 

In [201]:
combined_odi_bowling_df.head()

Unnamed: 0,match,teamInnings,Bowler Name,Over,Maiden,Runs,Wickets,Economy,dots,fours,sixes,wides,no balls,Match Id
0,Bangladesh VsSri Lanka,Sri Lanka,Nuwan Kulasekara,10,1,46,1,4.6,28,2,0,1,0,2937
1,Bangladesh VsSri Lanka,Sri Lanka,Chanaka Welegedara,8,0,39,0,4.87,27,5,0,1,0,2937
2,Bangladesh VsSri Lanka,Sri Lanka,Tillakaratne Dilshan,3,0,16,1,5.33,11,1,0,1,0,2937
3,Bangladesh VsSri Lanka,Sri Lanka,Suranga Lakmal,9,1,63,2,7.0,32,8,2,5,0,2937
4,Bangladesh VsSri Lanka,Sri Lanka,Suraj Randiv,10,0,51,2,5.1,28,3,1,2,0,2937


# saving the t20 batting anf bowling data in csvs

In [10]:
#saving t20 matches scorecard
combined_t20_batting_df = pd.DataFrame()
combined_t20_bowling_df = pd.DataFrame()

# Loop through the match URLs
for match_url in t20_matches_link:
    batting_df, bowling_df = odi_and_t20_matches_stats(match_url)
    combined_t20_batting_df = combined_t20_batting_df.append(batting_df)
    combined_t20_bowling_df = combined_t20_bowling_df.append(bowling_df)

 # Write the combined batting and bowling dataframes to CSV files
combined_t20_batting_df.to_csv('t20_batting_data.csv', index=False)
combined_t20_bowling_df.to_csv('t20_bowling_data.csv', index=False)

Processing match: Afghanistan VsIreland with match id: 128
Processing match: Scotland VsKenya with match id: 129
Processing match: Canada VsIreland with match id: 130
Processing match: Bangladesh VsNew Zealand with match id: 131
Processing match: Canada VsAfghanistan with match id: 132
Processing match: Scotland VsKenya with match id: 133
Processing match: Australia VsPakistan with match id: 134
Processing match: Afghanistan VsIreland with match id: 135
Processing match: Canada VsNetherlands with match id: 136
Processing match: Afghanistan VsScotland with match id: 137
Processing match: Canada VsKenya with match id: 138
Processing match: Kenya VsNetherlands with match id: 139
Processing match: Ireland VsScotland with match id: 140
Processing match: Afghanistan VsNetherlands with match id: 141
Processing match: Ireland VsNetherlands with match id: 142
Processing match: Ireland VsAfghanistan with match id: 143
Processing match: Pakistan VsEngland with match id: 144
Processing match: Engl

Processing match: West Indies VsAustralia with match id: 270
Processing match: Pakistan VsNew Zealand with match id: 271
Processing match: India VsEngland with match id: 272
Processing match: Bangladesh VsPakistan with match id: 274
Processing match: New Zealand VsSri Lanka with match id: 275
Processing match: West Indies VsEngland with match id: 276
Processing match: South Africa VsPakistan with match id: 277
Processing match: India VsAustralia with match id: 278
Processing match: New Zealand VsEngland with match id: 279
Processing match: West Indies VsSri Lanka with match id: 280
Processing match: South Africa VsAustralia with match id: 281
Processing match: Pakistan VsIndia with match id: 282
Processing match: West Indies VsNew Zealand with match id: 283
Processing match: Sri Lanka VsEngland with match id: 284
Processing match: Pakistan VsAustralia with match id: 285
Processing match: India VsSouth Africa with match id: 286
Processing match: Sri Lanka VsPakistan with match id: 287
P

Processing match: South Africa VsWest Indies with match id: 413
Processing match: South Africa VsWest Indies with match id: 414
Processing match: South Africa VsWest Indies with match id: 415
Processing match: Pakistan VsBangladesh with match id: 416
Processing match: Zimbabwe VsPakistan with match id: 417
Processing match: Zimbabwe VsPakistan with match id: 418
Processing match: Ireland VsScotland with match id: 419
Processing match: Ireland VsScotland with match id: 421
Processing match: England VsNew Zealand with match id: 423
Processing match: Netherlands VsNepal with match id: 424
Processing match: Netherlands VsNepal with match id: 425
Processing match: Netherlands VsNepal with match id: 426
Processing match: Netherlands VsNepal with match id: 427
Processing match: South Africa VsBangladesh with match id: 428
Processing match: South Africa VsBangladesh with match id: 429
Processing match: United Arab Emirates VsScotland with match id: 430
Processing match: Afghanistan VsNetherlan

Processing match: Sri Lanka VsSouth Africa with match id: 554
Processing match: New Zealand VsEngland with match id: 555
Processing match: India VsWest Indies with match id: 556
Processing match: England VsWest Indies with match id: 557
Processing match: Zimbabwe VsIndia with match id: 558
Processing match: Zimbabwe VsIndia with match id: 559
Processing match: India VsZimbabwe with match id: 560
Processing match: Sri Lanka VsEngland with match id: 561
Processing match: West Indies VsIndia with match id: 562
Processing match: Hong Kong VsIreland with match id: 564
Processing match: Australia VsSri Lanka with match id: 565
Processing match: England VsPakistan with match id: 566
Processing match: Sri Lanka VsAustralia with match id: 567
Processing match: West Indies VsPakistan with match id: 568
Processing match: Pakistan VsWest Indies with match id: 569
Processing match: West Indies VsPakistan with match id: 570
Processing match: Afghanistan VsUnited Arab Emirates with match id: 571
Proc

Processing match: Afghanistan VsIreland with match id: 696
Processing match: Afghanistan VsIreland with match id: 697
Processing match: South Africa VsZimbabwe with match id: 698
Processing match: Zimbabwe VsSouth Africa with match id: 699
Processing match: United Arab Emirates VsAustralia with match id: 700
Processing match: Pakistan VsAustralia with match id: 701
Processing match: Pakistan VsAustralia with match id: 702
Processing match: England VsSri Lanka with match id: 703
Processing match: Pakistan VsAustralia with match id: 704
Processing match: Pakistan VsNew Zealand with match id: 705
Processing match: New Zealand VsPakistan with match id: 706
Processing match: West Indies VsIndia with match id: 707
Processing match: Pakistan VsNew Zealand with match id: 708
Processing match: India VsWest Indies with match id: 709
Processing match: West Indies VsIndia with match id: 710
Processing match: South Africa VsAustralia with match id: 711
Processing match: Australia VsIndia with match

Processing match: United Arab Emirates VsNetherlands with match id: 845
Processing match: West Indies VsIndia with match id: 846
Processing match: Netherlands VsUnited Arab Emirates with match id: 847
Processing match: Finland VsSpain with match id: 848
Processing match: Finland VsSpain with match id: 849
Processing match: Finland VsSpain with match id: 850
Processing match: Bermuda VsUnited States of America with match id: 851
Processing match: Canada VsCayman Islands with match id: 852
Processing match: Namibia VsBotswana with match id: 853
Processing match: Cayman Islands VsUnited States of America with match id: 855
Processing match: Namibia VsBotswana with match id: 856
Processing match: Cayman Islands VsBermuda with match id: 857
Processing match: United States of America VsCanada with match id: 858
Processing match: Namibia VsBotswana with match id: 859
Processing match: Cayman Islands VsCanada with match id: 860
Processing match: United States of America VsBermuda with match id

Processing match: Namibia VsOman with match id: 987
Processing match: Sri Lanka VsAustralia with match id: 988
Processing match: Scotland VsUnited Arab Emirates with match id: 989
Processing match: Oman VsHong Kong with match id: 990
Processing match: Oman VsScotland with match id: 991
Processing match: New Zealand VsEngland with match id: 992
Processing match: Sri Lanka VsAustralia with match id: 993
Processing match: Netherlands VsIreland with match id: 994
Processing match: Papua New Guinea VsNamibia with match id: 995
Processing match: Ireland VsNamibia with match id: 996
Processing match: Papua New Guinea VsNetherlands with match id: 997
Processing match: New Zealand VsEngland with match id: 998
Processing match: India VsBangladesh with match id: 1000
Processing match: New Zealand VsEngland with match id: 1001
Processing match: Pakistan VsAustralia with match id: 1002
Processing match: Mozambique VsMalawi with match id: 1003
Processing match: Mozambique VsMalawi with match id: 100

In [27]:
combined_t20_batting_df.head()

Unnamed: 0,Match Name,teamInnings,Batting Position,Batsman Name,Dismissal,Runs,Balls,M,4s,6s,Match id,Strike Rate
0,Afghanistan VsIreland,Afghanistan,1,Karim Sadiq,st †NJ O'Brien b Johnston,3,6,10,0,0,128,50.0
1,Afghanistan VsIreland,Afghanistan,2,Shafiqullah,lbw b Johnston,23,16,27,3,1,128,143.75
2,Afghanistan VsIreland,Afghanistan,3,Mohammad Shahzad †,c KJ O'Brien b Johnston,10,8,8,2,0,128,125.0
3,Afghanistan VsIreland,Afghanistan,4,Nawroz Mangal (c),c Wilson b Dockrell,27,25,31,5,0,128,108.0
4,Afghanistan VsIreland,Afghanistan,5,Asghar Afghan,c White b Johnston,1,5,7,0,0,128,20.0


In [15]:
test_matches_df.head()

Unnamed: 0,Match,Team 1,Team 2,Winner,Margin,Ground,Match Date,Match id
0,Australia vs Pakistan,Australia,Pakistan,Australia,36 runs,Sydney,"Jan 3-6, 2010",Test # 1945
1,South Africa vs England,South Africa,England,drawn,,Cape Town,"Jan 3-7, 2010",Test # 1946
2,Australia vs Pakistan,Australia,Pakistan,Australia,231 runs,Hobart,"Jan 14-18, 2010",Test # 1947
3,South Africa vs England,South Africa,England,South Africa,inns & 74 runs,Johannesburg,"Jan 14-17, 2010",Test # 1948
4,Bangladesh vs India,Bangladesh,India,India,113 runs,Chattogram,"Jan 17-21, 2010",Test # 1949


In [210]:
test_matches_link[14]

'https://stats.espncricinfo.com/ci/engine/match/426402.html'

# Extract the batting and bowling stats for test matches

In [100]:
def scrap_test_matches_scorecards(matches_url):
    batting_dfs = []  # list of dataframes for batting tables
    bowling_dfs = []
    html_content = requests.get(matches_url)
    soup = BeautifulSoup(html_content.content,"html.parser")
    div_info = soup.find_all("div",class_ = "ds-w-full ds-bg-fill-content-prime ds-overflow-hidden ds-rounded-xl ds-border ds-border-line ds-mb-4")
    match_number = []
    for div in div_info:
        tbodys = div.find_all("tbody")
        if tbodys:
            for tbody in tbodys:
                trs = tbody.find_all("tr")
                for tr in trs:
                    td = tr.find_all("td")
                    if len(td) == 2:
                        if td[0].text == "Match number":
                            match_number.append(td[1].text)
#     print(match_number)
    for div in div_info:
        batting_tables = div.find_all("table",class_ = "ds-w-full ds-table ds-table-md ds-table-auto ci-scorecard-table")
        bowling_tables = div.find_all("table",class_ = "ds-w-full ds-table ds-table-md ds-table-auto")
        inningsTeam = div.find_all("span",class_ = "ds-text-title-xs ds-font-bold ds-capitalize")
        innings_team_names = []
        for team in inningsTeam:
            team_name = team.text.strip()
            if team_name:
                innings_team_names.append(team_name)
        if batting_tables:
            innings_batting_dfs = []  # list of dataframes for innings batting data
            for table in batting_tables:
                innings_team_name = inningsTeam.pop().text
                batting_columns = []
                for th in table.find("thead").find_all('th'):
                    batting_columns.append(th.text.strip())
                batting_data = []
                for tr in table.find("tbody").find_all('tr'):
                    batting_row = []
                    for td in tr.find_all("td"):
                        batting_row.append(td.text.strip())
                    if any(batting_row):
                        batting_data.append(batting_row)
                batting_columns = ['Batsman Name','Dismissal','Runs','Balls','M','4s','6s','Strike Rate']
                innings_batting_df = pd.DataFrame(batting_data, columns=batting_columns)
                innings_batting_df = innings_batting_df[~innings_batting_df['Batsman Name'].str.contains('Extras|TOTAL|Did not bat|Fall of wickets')]
                innings_batting_df['Match Number'] = match_number[0]
                innings_batting_dfs.append(innings_batting_df)
                innings_batting_df.insert(0, "teamInnings", innings_team_name)
                innings_batting_df.insert(1, "Batting Position", range(1, 1+len(innings_batting_df)))
            innings_batting_df = pd.concat(innings_batting_dfs, ignore_index=True)  # concatenate all innings batting dataframes
            batting_dfs.append(innings_batting_df)
        if bowling_tables:
            innings_bowling_dfs = []
            for table in bowling_tables:
#                 inningsteam_name = inningsTeam.pop().text
                bowling_columns = []
                for th in table.find("thead").find_all('th'):
                    batting_columns.append(th.text.strip())
                bowling_data = []
                for tr in table.find("tbody").find_all('tr'):
                    bowling_row = []
                    for td in tr.find_all("td"):
                        bowling_row.append(td.text.strip())
                    if len(bowling_row)!=1:
                        bowling_data.append(bowling_row)
                bowling_columns = ['Bowler Name','Over','Maiden','Runs','Wickets','Economy','dots','fours','sixes','wides','no balls']
                innings_bowling_df = pd.DataFrame(bowling_data, columns=bowling_columns)
                innings_bowling_df['Match Number'] = match_number[0]
                innings_bowling_team_name = list(filter(None, innings_team_names))
                if innings_bowling_team_name:
                    innings_bowling_df.insert(0, "teamInnings", innings_bowling_team_name[0])
                innings_bowling_dfs.append(innings_bowling_df)
            innings_bowling_dfs = pd.concat(innings_batting_dfs, ignore_index=True)  # concatenate all innings batting dataframes
            bowling_dfs.append(innings_bowling_df) 
    print(f"Processing match :{match_number[0]}")
    return pd.concat(batting_dfs, ignore_index=True),pd.concat(bowling_dfs, ignore_index=True)  # concatenate all batting dataframes


In [101]:
scrap_test_matches_scorecards("https://stats.espncricinfo.com/ci/engine/match/426402.html")

Processing match :Test no. 1959


(   teamInnings  Batting Position         Batsman Name  \
 0      England                 1   Andrew Strauss (c)   
 1      England                 2        Alastair Cook   
 2      England                 3       Jonathan Trott   
 3      England                 4      Kevin Pietersen   
 4      England                 5             Ian Bell   
 5      England                 6          Eoin Morgan   
 6      England                 7         Matt Prior †   
 7      England                 8         Graeme Swann   
 8      England                 9        Ajmal Shahzad   
 9      England                10       James Anderson   
 10     England                11          Steven Finn   
 11  Bangladesh                 1          Tamim Iqbal   
 12  Bangladesh                 2          Imrul Kayes   
 13  Bangladesh                 3      Junaid Siddique   
 14  Bangladesh                 4        Jahurul Islam   
 15  Bangladesh                 5    Mohammad Ashraful   
 16  Banglades

In [102]:
combined_test_batting_df = pd.DataFrame()
combined_test_bowling_df = pd.DataFrame()

# Loop through the match URLs
for match_url in test_matches_link:
    batting_df, bowling_df = scrap_test_matches_scorecards(match_url)
    combined_test_batting_df = combined_test_batting_df.append(batting_df)
    combined_test_bowling_df = combined_test_bowling_df.append(bowling_df)

# Write the combined batting and bowling dataframes to CSV files
combined_test_batting_df.to_csv('test_batting_data.csv', index=False)
combined_test_bowling_df.to_csv('test_bowling_data.csv', index=False)

Processing match :Test no. 1945
Processing match :Test no. 1946
Processing match :Test no. 1947
Processing match :Test no. 1948
Processing match :Test no. 1949
Processing match :Test no. 1950
Processing match :Test no. 1951
Processing match :Test no. 1952
Processing match :Test no. 1953
Processing match :Test no. 1954
Processing match :Test no. 1955
Processing match :Test no. 1956
Processing match :Test no. 1957
Processing match :Test no. 1958
Processing match :Test no. 1959
Processing match :Test no. 1960
Processing match :Test no. 1961
Processing match :Test no. 1962
Processing match :Test no. 1963
Processing match :Test no. 1964
Processing match :Test no. 1965
Processing match :Test no. 1966
Processing match :Test no. 1967
Processing match :Test no. 1968
Processing match :Test no. 1969
Processing match :Test no. 1970
Processing match :Test no. 1971
Processing match :Test no. 1972
Processing match :Test no. 1973
Processing match :Test no. 1974
Processing match :Test no. 1975
Processi

Processing match :Test no. 2201
Processing match :Test no. 2202
Processing match :Test no. 2203
Processing match :Test no. 2204
Processing match :Test no. 2205
Processing match :Test no. 2206
Processing match :Test no. 2207
Processing match :Test no. 2208
Processing match :Test no. 2209
Processing match :Test no. 2210
Processing match :Test no. 2211
Processing match :Test no. 2212
Processing match :Test no. 2213
Processing match :Test no. 2214
Processing match :Test no. 2215
Processing match :Test no. 2216
Processing match :Test no. 2217
Processing match :Test no. 2218
Processing match :Test no. 2219
Processing match :Test no. 2220
Processing match :Test no. 2221
Processing match :Test no. 2222
Processing match :Test no. 2223
Processing match :Test no. 2224
Processing match :Test no. 2225
Processing match :Test no. 2226
Processing match :Test no. 2227
Processing match :Test no. 2228
Processing match :Test no. 2229
Processing match :Test no. 2230
Processing match :Test no. 2231
Processi

# Edit the bowling data of test

In [158]:
test_info_df = pd.read_csv("test_matches_info.csv")
test_info_df['Match id'] = test_info_df['Match id'].str.replace("Test #","").str.strip()
test_info_df.head()

Unnamed: 0,Match,Team 1,Team 2,Winner,Margin,Ground,Match Date,Match id
0,Australia vs Pakistan,Australia,Pakistan,Australia,36 runs,Sydney,"Jan 3-6, 2010",1945
1,South Africa vs England,South Africa,England,drawn,,Cape Town,"Jan 3-7, 2010",1946
2,Australia vs Pakistan,Australia,Pakistan,Australia,231 runs,Hobart,"Jan 14-18, 2010",1947
3,South Africa vs England,South Africa,England,South Africa,inns & 74 runs,Johannesburg,"Jan 14-17, 2010",1948
4,Bangladesh vs India,Bangladesh,India,India,113 runs,Chattogram,"Jan 17-21, 2010",1949


In [149]:
test_matches_link[0]

'https://stats.espncricinfo.com/ci/engine/match/406200.html'

In [163]:
bowling_df = pd.read_csv("test_bowling_data.csv")
bowling_df.head()

Unnamed: 0,teamInnings,Bowler Name,Over,Maiden,Runs,Wickets,Economy,dots,fours,sixes,wides,no balls,Match Number
0,Australia,Mohammad Asif,20.0,6,41,6,2.05,101,5,0,0,0,Test no. 1945
1,Australia,Mohammad Sami,12.0,4,27,3,2.25,59,4,0,0,1,Test no. 1945
2,Australia,Umar Gul,10.2,0,38,1,3.67,47,6,0,1,0,Test no. 1945
3,Australia,Danish Kaneria,2.0,0,18,0,9.0,5,2,1,0,0,Test no. 1945
4,Pakistan,Doug Bollinger,21.5,5,72,4,3.29,105,8,0,1,6,Test no. 1945


In [170]:
bowling_df = bowling_df.rename(columns={'Match Number': 'Match id'})
bowling_df['Match id'] = bowling_df['Match id'].str.replace("Test no. ","").str.strip()

In [176]:
bowling_df = bowling_df.drop(columns=['match_name_x','Match','match_name_y'],axis = 'columns')

In [177]:
bowling_df.head()

Unnamed: 0,teamInnings,Bowler Name,Over,Maiden,Runs,Wickets,Economy,dots,fours,sixes,wides,no balls,Match id
0,Australia,Mohammad Asif,20.0,6,41,6,2.05,101,5,0,0,0,1945
1,Australia,Mohammad Sami,12.0,4,27,3,2.25,59,4,0,0,1,1945
2,Australia,Umar Gul,10.2,0,38,1,3.67,47,6,0,1,0,1945
3,Australia,Danish Kaneria,2.0,0,18,0,9.0,5,2,1,0,0,1945
4,Pakistan,Doug Bollinger,21.5,5,72,4,3.29,105,8,0,1,6,1945


In [178]:
match_df = test_info_df[['Match id', 'Match']]
match_df = match_df.rename(columns={'Match': 'match_name'})
bowling_df = pd.merge(bowling_df, match_df, on='Match id', how='left')

In [180]:
bowling_df[['team1', 'team2']] = bowling_df['match_name'].str.split(' vs ', expand=True)

In [182]:
bowling_df['teamInnings'] = np.where(bowling_df['teamInnings'] == bowling_df['team1'], bowling_df['team2'], bowling_df['team1'])

# save the edited bowling dataframe in the csv

In [185]:
bowling_df.to_csv("test_bowling_data.csv",index=False)

# Edit the test batting data

In [186]:
batting_df = pd.read_csv("test_batting_data.csv")
batting_df.head()

Unnamed: 0,teamInnings,Batting Position,Batsman Name,Dismissal,Runs,Balls,M,4s,6s,Strike Rate,Match Number
0,Australia,1,Shane Watson,c †Kamran Akmal b Mohammad Sami,6,24,39,1,0,25.0,Test no. 1945
1,Australia,2,Phillip Hughes,c Faisal Iqbal b Mohammad Sami,0,10,15,0,0,0.0,Test no. 1945
2,Australia,3,Ricky Ponting (c),c Umar Gul b Mohammad Sami,0,1,1,0,0,0.0,Test no. 1945
3,Australia,4,Michael Hussey,c Misbah-ul-Haq b Mohammad Asif,28,52,96,4,0,53.84,Test no. 1945
4,Australia,5,Michael Clarke,b Mohammad Asif,3,30,50,0,0,10.0,Test no. 1945


In [187]:
batting_df = batting_df.rename(columns={'Match Number': 'Match id'})
batting_df['Match id'] = batting_df['Match id'].str.replace("Test no. ","").str.strip()

In [190]:
batting_df = pd.merge(batting_df, match_df, on='Match id', how='left')

In [192]:
batting_df.to_csv("test_batting_data.csv",index=False)