In [1]:
import bs4, re
import bson
import requests
import pandas as pd
import numpy as np
from urllib3 import PoolManager

In [2]:
t20_teams = ["afghanistan","australia","bangladesh","england","india","ireland","namibia","netherlands","new zealand",\
             "oman","pakistan","papua new guinea","scotland","sri lanka","west indies"]

In [3]:
def extract_batter_data(table):
    data = [] 
    for row in table.find_all('tr'):
        cols = row.find_all('th')
        cols = [ele.text.strip().lower() for ele in cols]
        data.append([ele for ele in cols if ele])
        cols = row.find_all('td')
        cols = [ele.text.strip().lower() for ele in cols]
        data.append([ele for ele in cols if ele])
    df = pd.DataFrame(data[1:],columns=["player"]+data[0])
    return df

def extract_bowler_data(table):
    data = [] 
    for row in table.find_all('tr'):
        cols = row.find_all('th')
        cols = [ele.text.strip().lower() for ele in cols]
        data.append([ele for ele in cols if ele])
        cols = row.find_all('td')
        cols = [ele.text.strip().lower() for ele in cols]
        data.append([ele for ele in cols if ele])
    df = pd.DataFrame(data[1:],columns=data[0])
    return df

def extract_match_data(table):
    data = [] 
    for row in table.find_all('tr'):
        cols = row.find_all('th')
        cols = [ele.text.strip().lower() for ele in cols]
        data.append([ele for ele in cols if ele])
        cols = row.find_all('td')
        cols = [ele.text.strip().lower() for ele in cols]
        data.append([ele for ele in cols if ele])
    df = pd.DataFrame(data[1:],columns=data[0])
    return df

def restructure_dataframe(df):
    df.dropna(how='all',inplace=True)
    temp1 = df[df['start date'].notnull()]
    temp2 = df[df['start date'].isnull() & df.ground.notnull()]
    temp3 = df[df['start date'].isnull() & df.ground.isnull()]
    temp3.insert(2,'column3',0)
    temp3.drop(['ground','start date'],axis=1,inplace=True)
    temp1.drop(['br'],axis=1,inplace=True)
    temp2.drop(['start date'],axis=1,inplace=True)
    temp2.columns = temp1.columns
    temp3.columns = temp2.columns
    df = pd.concat([temp1,temp2,temp3])
    df = df.sort_index()
    df = df.reset_index(drop=True)
    return df

def restructure_dataframe_old(df):
    temp1 = df[df['start date'].notnull()]
    temp2 = df[df['start date'].isnull()&df.ground.notnull()]
    temp3 = df[(df.result != 'won')&(df.winner.notnull())]
    temp3.insert(2,'column3',0)
    temp3.drop(['ground','start date'],axis=1,inplace=True)
    temp1.drop(['br'],axis=1,inplace=True)
    temp2.drop(['start date'],axis=1,inplace=True)
    temp2.columns = temp1.columns
    temp3.columns = temp2.columns
    df = pd.concat([temp1,temp2,temp3])
    df = df.sort_index()
    df = df.reset_index(drop=True)
    return df

def batter_data(df):
    #print(df)
    playerdf = df[df.sr.notnull()]
    extradf = df[df.player.isin(["extras","total"])]
    extradf = extradf.set_index(["player"])
    #print(extradf.loc['total']['batting'])
    playerdf[['overs','run_rate']] = re.findall(r"[-+]?\d*\.\d+|\d+",extradf.loc['total']['batting'])[:2]
    value_split = extradf.loc['total']['r'].split("/")
    if len(value_split)==1:
        value_split.append(10)
    playerdf[['total_run','wickets']] = value_split
    return playerdf

def parse_scorecard_batter(tag):
    try:
        response = requests.get("https://stats.espncricinfo.com"+tag.get("href")).text
        soup = bs4.BeautifulSoup(response, "lxml")
        table = soup.find_all("table", {"class": "table batsman"})
        df = pd.DataFrame()
        for itr in range(2):
            tempdf = extract_batter_data(table[itr])
            tempdf = batter_data(tempdf)
            tempdf = tempdf.reset_index(drop=True)
            tempdf['team'] = 'team%s'%(itr+1)
            df= pd.concat([df,tempdf])
        return True, df.reset_index(drop=True)
    except:
        return False, None
    
def parse_scorecard_bowler(tag):
    try:
        response = requests.get("https://stats.espncricinfo.com"+tag.get("href")).text
        soup = bs4.BeautifulSoup(response, "lxml")
        table = soup.find_all("table", {"class": "table bowler"})
        df = pd.DataFrame()
        for itr in range(2):
            tempdf = extract_bowler_data(table[itr])
            tempdf = tempdf[tempdf.bowling.notnull()]
            tempdf = tempdf.reset_index(drop=True)
            tempdf['team'] = 'team%s'%(2-itr)
            df= pd.concat([df,tempdf])
        return True, df.reset_index(drop=True)
    except:
        return False, None

In [4]:
ids = ['']+[i for i in range(2,29)] #player records.
url = "https://stats.espncricinfo.com/ci/engine/stats/index.html?class=3;{}template=results;type=aggregate;view=results"

all_tags = []
for item in ids[24:]:
    #print(item)
    val = url.format("page={};".format(item)) if item else url.format(item)
    response = requests.get(val)
    soup = bs4.BeautifulSoup(response.text, "lxml")
    table = soup.find_all("table", {"class":"engineTable"})[2]
    df=extract_match_data(table)
    df=restructure_dataframe(df)
    df['id'] = [bson.objectid.ObjectId() for itr in range(df.shape[0])]
    tag_list = [a_tag for a_tag in soup.find_all('a') if a_tag.text == 'Match scorecard']
    #print(df.shape)
    #print(len(tag_list))
    for i in range(len(tag_list)):
        flag1, bat_df = parse_scorecard_batter(tag_list[i])
        flag2, bow_df = parse_scorecard_bowler(tag_list[i])
        if (flag1 == False) or (flag2 == False): continue
        print(i)
        print(df.iloc[i]['id'])
        bat_df['id'], bow_df['id']= df.iloc[i]['id'], df.iloc[i]['id']
        bat_df.to_csv(r"C:\Users\Alok\Downloads\dataset\espn_data\bat\bat_{}.csv".format(df.iloc[i]['id']))
        bow_df.to_csv(r"C:\Users\Alok\Downloads\dataset\espn_data\bowl\bow_{}.csv".format(df.iloc[i]['id']))
    df.to_csv(r"C:\Users\Alok\Downloads\dataset\espn_data\page_{}.csv".format(item))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


0
616dc65965ff3226702006ed
1
616dc65965ff3226702006ee
2
616dc65965ff3226702006ef
3
616dc65965ff3226702006f0
4
616dc65965ff3226702006f1
5
616dc65965ff3226702006f2
6
616dc65965ff3226702006f3
7
616dc65965ff3226702006f4
8
616dc65965ff3226702006f5
9
616dc65965ff3226702006f6
10
616dc65965ff3226702006f7
11
616dc65965ff3226702006f8
12
616dc65965ff3226702006f9
13
616dc65965ff3226702006fa
14
616dc65965ff3226702006fb
15
616dc65965ff3226702006fc
16
616dc65965ff3226702006fd
17
616dc65965ff3226702006fe
18
616dc65965ff3226702006ff
19
616dc65965ff322670200700
20
616dc65965ff322670200701
21
616dc65965ff322670200702
22
616dc65965ff322670200703
23
616dc65965ff322670200704
24
616dc65965ff322670200705
25
616dc65965ff322670200706
26
616dc65965ff322670200707
27
616dc65965ff322670200708
28
616dc65965ff322670200709
29
616dc65965ff32267020070a
30
616dc65965ff32267020070b
31
616dc65965ff32267020070c
32
616dc65965ff32267020070d
33
616dc65965ff32267020070e
34
616dc65965ff32267020070f
35
616dc65965ff322670200710
36