In [1]:
# import libraries
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
import random
import re
import os
# display options
pd.options.display.max_columns=999
pd.options.display.max_rows = 999

project_root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)
file_path = os.path.join(project_root_dir, "data")
os.makedirs(file_path, exist_ok=True)

# function for loading data 
def load_data(filename, file_path=file_path):
    csv_path = os.path.join(file_path, filename)
    return pd.read_csv(csv_path)

# function for saving data as csv file
def save_dataframe(df, filename, file_path=file_path):
    """
    This function takes a dataframe and save it as a csv file.
    
    df: dataframe to save
    filename: Name to use for the csv file eg: 'my_file.csv'
    file_path = where to save the file
    """
    path = os.path.join(file_path, filename)
    df.to_csv(path, index=False)
    
def combine_all_years_data(function, year_list):
    """
    Common function for combining data for all the years for a 
    given table from ipl website or any other. All table have
    different functions to get the data from the websites.
    """
    try:
        # create an empty list to hold all the dataframes
        df_list = []
        # loop through each year and extract the data
        for year in year_list:
            # call the function to get the data for that year
            df = function(year)
            # append the data to the df list
            df_list.append(df)
            # add some random pause
            time.sleep(1 + 2*random.random())

        # concat all the dataframes
        df = pd.concat(df_list, ignore_index=True)
        
    except Exception as e:
        print(e)
        print(year)
    
    #return the dataframe
    return df 


# change the value for which you want to scrape the data defaults to 2008-2019
year_list = [year for year in range(2019,2007,-1)]   

# Maidens & Dots

### Dot Balls

In [2]:
url = "https://www.iplt20.com/stats/2019/most-dot-balls"
response = requests.get(url)
dots_html = response.text
dots_soup = bs(dots_html)
dots_table_data = dots_soup.find(class_ = "js-table")
# get the column names
col_names = []
for header in dots_table_data.find_all('th'):
    col_names.append(header.text.strip())
    
a_list = []
for data in dots_table_data.find_all('td'):
    a_list.append(' '.join(data.text.split()))

n = 13
final = [a_list[i:i + n] for i in range(0, len(a_list), n)]
df = pd.DataFrame(final)
df.columns = col_names
df.head()

Unnamed: 0,POS,PLAYER,Mat,Inns,Ov,Runs,Wkts,Dots,Avg,Econ,SR,4w,5w
0,1,Deepak Chahar,17,17,64.3,482,22,190,21.9,7.47,17.59,0,0
1,2,Jasprit Bumrah,16,16,61.4,409,19,169,21.52,6.63,19.47,0,0
2,3,Bhuvneshwar Kumar,15,15,59.0,461,13,168,35.46,7.81,27.23,0,0
3,4,Rashid Khan,15,15,60.0,377,17,166,22.17,6.28,21.17,0,0
4,5,Imran Tahir,17,17,64.2,431,26,149,16.57,6.69,14.84,2,0


In [3]:
# select only player name and Dots data
df = df[['PLAYER','Dots']]
# convert data type
df['Dots'] = pd.to_numeric(df['Dots'], errors='coerce').fillna(0)

In [4]:
def get_dot_balls_data(year):
    """This function gets the dot balls data for a particular year."""
    url = "https://www.iplt20.com/stats/{}/most-dot-balls".format(year)
    response = requests.get(url)
    dots_html = response.text
    dots_soup = bs(dots_html)
    dots_table_data = dots_soup.find(class_ = "js-table")
    # get the column names
    col_names = []
    for header in dots_table_data.find_all('th'):
        col_names.append(header.text.strip())

    a_list = []
    for data in dots_table_data.find_all('td'):
        a_list.append(' '.join(data.text.split()))

    n = 13
    final = [a_list[i:i + n] for i in range(0, len(a_list), n)]
    df = pd.DataFrame(final)
    df.columns = col_names
    
    # select only player name and Dots data
    df = df[['PLAYER','Dots']]
    # convert data type
    df['Dots'] = pd.to_numeric(df['Dots'], errors='coerce').fillna(0)
    return df
    

In [5]:
dot_balls_2019 = get_dot_balls_data(2019)

In [6]:
dot_balls_2019.head()

Unnamed: 0,PLAYER,Dots
0,Deepak Chahar,190
1,Jasprit Bumrah,169
2,Bhuvneshwar Kumar,168
3,Rashid Khan,166
4,Imran Tahir,149


### Maidens

In [7]:
url = "https://www.iplt20.com/stats/2019/most-maidens"
response = requests.get(url)
maidens_html = response.text
maidens_soup = bs(maidens_html)
maidens_table_data = maidens_soup.find(class_ = "js-table")
maidens_table_data

<div class="js-table">
<table class="table table--scroll-on-tablet top-players">
<tr class="top-players__header" data-widget="scroll-boundary">
<th class="top-players__freeze" title="Position">POS</th>
<th class="top-players__player top-players__freeze">PLAYER</th>
<th class="top-players__m top-players__padded" title="Matches">
                                Mat
                            </th>
<th class="top-players__inns" title="Innings">
                                Inns
                            </th>
<th class="top-players__ov" title="Overs">
                                Ov
                            </th>
<th class="top-players__r" title="Runs">
                                Runs
                            </th>
<th class="top-players__w" title="Wickets">
                                Wkts
                            </th>
<th class="top-players__maid" title="Maiden overs">
                                Maid
                            </th>
<th class="top-playe

In [8]:
# get the column names
col_names = []
for header in maidens_table_data.find_all('th'):
    col_names.append(header.text.strip())
col_names

['POS',
 'PLAYER',
 'Mat',
 'Inns',
 'Ov',
 'Runs',
 'Wkts',
 'Maid',
 'Avg',
 'Econ',
 'SR',
 '4w',
 '5w']

In [9]:
a_list = []
for data in maidens_table_data.find_all('td'):
    a_list.append(' '.join(data.text.split()))

n = 13
final = [a_list[i:i + n] for i in range(0, len(a_list), n)]
df = pd.DataFrame(final)
df.columns = col_names
df.head()

Unnamed: 0,POS,PLAYER,Mat,Inns,Ov,Runs,Wkts,Maid,Avg,Econ,SR,4w,5w
0,1,Jofra Archer,11,11,43.0,291,11,2,26.45,6.76,23.45,0,0
1,2,Deepak Chahar,17,17,64.3,482,22,2,21.9,7.47,17.59,0,0
2,3,Rashid Khan,15,15,60.0,377,17,1,22.17,6.28,21.17,0,0
3,4,Ravindra Jadeja,16,16,54.0,343,15,1,22.86,6.35,21.6,0,0
4,5,Imran Tahir,17,17,64.2,431,26,1,16.57,6.69,14.84,2,0


In [10]:
# select only player name and maid column
df = df[['PLAYER','Maid']]
df['Maid'] = pd.to_numeric(df['Maid'], errors='coerce').fillna(0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   PLAYER  18 non-null     object
 1   Maid    18 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 416.0+ bytes


In [11]:
def get_maidens_data(year):
    """This function gets the player name and maidens
    data for a particular year.
    """
    try:
        url = "https://www.iplt20.com/stats/{}/most-maidens".format(year)
        response = requests.get(url)
        maidens_html = response.text
        maidens_soup = bs(maidens_html)
        maidens_table_data = maidens_soup.find(class_ = "js-table")
        # get the column names
        col_names = []
        for header in maidens_table_data.find_all('th'):
            col_names.append(header.text.strip())

        a_list = []
        for data in maidens_table_data.find_all('td'):
            a_list.append(' '.join(data.text.split()))

        n = 13
        final = [a_list[i:i + n] for i in range(0, len(a_list), n)]
        df = pd.DataFrame(final)
        df.columns = col_names

        # select only player name and maid column
        df = df[['PLAYER','Maid']]
        # change data type
        df['Maid'] = pd.to_numeric(df['Maid'], errors='coerce').fillna(0)
        
    except Exception as e:
        print(e)
        print(year)
    
    return df
       

In [12]:
maidens_2019 = get_maidens_data(2019)

In [13]:
maidens_2019.head()

Unnamed: 0,PLAYER,Maid
0,Jofra Archer,2
1,Deepak Chahar,2
2,Rashid Khan,1
3,Ravindra Jadeja,1
4,Imran Tahir,1


In [14]:
def get_dots_maidens(year):
    """
    Combine the dots, maidens and data into a single df.
    """
    try:
        dots_df = get_dot_balls_data(year)
        maidens_df = get_maidens_data(year)
        #hats_df = get_hat_tricks_data(year)

        df = pd.merge(left=dots_df, right=maidens_df,how='left', on=['PLAYER'])
        #df = pd.merge(left=df, right=hats_df,how='left',on=['PLAYER'])
        # fill missing values
        df.fillna(0, inplace=True)
    except Exception as e:
        print(e)
        print(year)
    
    return df

In [15]:
test_df = get_dots_maidens(2019)

In [16]:
test_df.head()

Unnamed: 0,PLAYER,Dots,Maid
0,Deepak Chahar,190,2.0
1,Jasprit Bumrah,169,1.0
2,Bhuvneshwar Kumar,168,1.0
3,Rashid Khan,166,1.0
4,Imran Tahir,149,1.0


# Most Wickets

In [17]:
url = "https://www.iplt20.com/stats/2019/most-wickets"
response = requests.get(url)
bowling_html = response.text
bowling_soup = bs(bowling_html)
bowling_soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta content="IPLT20.com - Indian Premier League Official Website" name="twitter:title"/>
<meta content="ipl, iplt20, indian premier league, ipl cricket, ipl match, ipl live, ipl score, ipl scorecard, ipl stats, ipl schedule, ipl results, ipl points table, ipl teams, ipl videos, ipl teams, ipl news, BCCI IPL" name="keywords"/>
<meta content="website" property="og:type"/>
<meta content="Visit IPLT20.com the official IPLT20 website for minute-to-minute LIVE updates." name="description"/>
<meta content="Visit IPLT20.com the official IPLT20 website for minute-to-minute LIVE updates." name="twitter:description"/>
<meta content="IPLT20.com - Indian Premier League Official Website" property="og:title"/>
<title>IPLT20.com - Indian Premier League Official Website</title>
<meta content="Visit IPLT20.com the official IPLT20 website for minute-to-minute LIVE updates." property="og:description"/>
<meta charset="utf-8"/>
<meta content="width=device-width, ini

In [18]:
# get the table data
bowling_table_data = bowling_soup.find(class_ = "js-table")
bowling_table_data

<div class="js-table">
<table class="table table--scroll-on-tablet top-players">
<tr class="top-players__header" data-widget="scroll-boundary">
<th class="top-players__freeze" title="Position">POS</th>
<th class="top-players__player top-players__freeze">PLAYER</th>
<th class="top-players__m top-players__padded" title="Matches">
                                Mat
                            </th>
<th class="top-players__inns" title="Innings">
                                Inns
                            </th>
<th class="top-players__ov" title="Overs">
                                Ov
                            </th>
<th class="top-players__r" title="Runs">
                                Runs
                            </th>
<th class="top-players__w" title="Wickets">
                                Wkts
                            </th>
<th class="top-players__bbi" title="Bowling Figures">
                                BBI
                            </th>
<th class="top-play

In [19]:
# get the column names
col_names = []
for header in bowling_table_data.find_all('th'):
    col_names.append(header.text.strip())
col_names

['POS',
 'PLAYER',
 'Mat',
 'Inns',
 'Ov',
 'Runs',
 'Wkts',
 'BBI',
 'Avg',
 'Econ',
 'SR',
 '4w',
 '5w']

In [20]:
for data in bowling_table_data.find_all('td')[:20]:
    print(' '.join(data.text.split()))

1
Imran Tahir
17
17
64.2
431
26
4/12
16.57
6.69
14.84
2
0
2
Kagiso Rabada
12
12
47
368
25


In [21]:
a_list = []
for data in bowling_table_data.find_all('td'):
    a_list.append(' '.join(data.text.split()))

n = 13
final = [a_list[i:i + n] for i in range(0, len(a_list), n)]
df = pd.DataFrame(final)
df.columns = col_names
df.head()

Unnamed: 0,POS,PLAYER,Mat,Inns,Ov,Runs,Wkts,BBI,Avg,Econ,SR,4w,5w
0,1,Imran Tahir,17,17,64.2,431,26,4/12,16.57,6.69,14.84,2,0
1,2,Kagiso Rabada,12,12,47.0,368,25,4/21,14.72,7.82,11.28,2,0
2,3,Deepak Chahar,17,17,64.3,482,22,3/20,21.9,7.47,17.59,0,0
3,4,Shreyas Gopal,14,14,48.0,347,20,3/12,17.35,7.22,14.4,0,0
4,5,Jasprit Bumrah,16,16,61.4,409,19,3/20,21.52,6.63,19.47,0,0


In [22]:
# Add the nationality of each player in the dataframe
nationality_list = []
for index, data in enumerate(bowling_table_data.find_all('tr')[1:]):
    try:
        nationality_list.append(data['data-nationality'])
    except Exception as e:
        print(e)
        print(index)
        # add none 
        nationality_list.append(None)
df['Nationality'] = nationality_list

In [23]:
df.sample(10)

Unnamed: 0,POS,PLAYER,Mat,Inns,Ov,Runs,Wkts,BBI,Avg,Econ,SR,4w,5w,Nationality
17,18,Chris Morris,9,9,33.0,306,13,3/22,23.53,9.27,15.23,0,0,Overseas
22,23,Dwayne Bravo,12,12,41.1,330,11,3/33,30.0,8.01,22.45,0,0,Overseas
12,13,Ravichandran Ashwin,14,14,55.0,400,15,3/23,26.66,7.27,22.0,0,0,Indian
30,31,Keemo Paul,8,8,27.1,237,9,3/17,26.33,8.72,18.11,0,0,Overseas
78,79,Kulwant Khejroliya,2,2,5.0,47,1,1/29,47.0,9.4,30.0,0,0,Indian
60,61,Arshdeep Singh,3,3,10.0,109,3,2/43,36.33,10.9,20.0,0,0,Indian
33,34,Shardul Thakur,10,9,30.0,281,8,2/18,35.12,9.36,22.5,0,0,Indian
11,12,Ravindra Jadeja,16,16,54.0,343,15,3/9,22.86,6.35,21.6,0,0,Indian
29,30,Jaydev Unadkat,11,11,37.2,398,10,2/26,39.8,10.66,22.4,0,0,Indian
9,10,Harbhajan Singh,11,11,44.0,312,16,3/20,19.5,7.09,16.5,0,0,Indian


In [24]:
# Add the player link for more info in the dataframe
base_url = "https://www.iplt20.com"
player_link_list = []

# get all the links and add it to the list
for data in bowling_table_data.find_all('a'):
    player_link_list.append(base_url + data['href'])

# create a column with None value
df[14] = None
# iterate through each row and create a player name pattern
for index, row in df.iterrows():
    player_name = row['PLAYER'].replace(' ','-')
    player_regex = re.compile(r"{}".format(player_name),re.IGNORECASE)
    for item in player_link_list:
        # if the pattern matches any links
        if player_regex.search(item) != None:
            # then append it to that row of the df
            df.iloc[index,14] = item
# rename the column            
df.rename(columns={14:'Player Link'}, inplace=True)


# extract the player team name from the link and add to the df
team_regex = r"teams/(\w+-\w+-?\w+)"
df['Team'] = df['Player Link'].str.extract(team_regex, flags=re.IGNORECASE)
df['Team'] = df['Team'].apply(lambda x : str(x).title().replace('-',' '))

In [25]:
# convert data types from string to numeric
df['POS'] = pd.to_numeric(df['POS'], errors='coerce').fillna(0)
df['Mat'] = pd.to_numeric(df['Mat'], errors='coerce').fillna(0)
df['Inns'] = pd.to_numeric(df['Inns'], errors='coerce').fillna(0)
df['Ov'] = pd.to_numeric(df['Ov'], errors='coerce').fillna(0)
df['Runs'] = pd.to_numeric(df['Runs'], errors='coerce').fillna(0)
df['Wkts'] = pd.to_numeric(df['Wkts'], errors='coerce').fillna(0)
df['BBI'] = pd.to_numeric(df['BBI'], errors='coerce').fillna(0)
df['Avg'] = pd.to_numeric(df['Avg'], errors='coerce').fillna(0)
df['Econ'] = pd.to_numeric(df['Econ'], errors='coerce').fillna(0)
df['SR'] = pd.to_numeric(df['SR'], errors='coerce').fillna(0)
df['4w'] = pd.to_numeric(df['4w'], errors='coerce').fillna(0)
df['5w'] = pd.to_numeric(df['5w'], errors='coerce').fillna(0)

In [26]:
df.head()

Unnamed: 0,POS,PLAYER,Mat,Inns,Ov,Runs,Wkts,BBI,Avg,Econ,SR,4w,5w,Nationality,Player Link,Team
0,1,Imran Tahir,17,17,64.2,431,26,0.0,16.57,6.69,14.84,2,0,Overseas,https://www.iplt20.com/teams/chennai-super-kin...,Chennai Super Kings
1,2,Kagiso Rabada,12,12,47.0,368,25,0.0,14.72,7.82,11.28,2,0,Overseas,https://www.iplt20.com/teams/delhi-capitals/sq...,Delhi Capitals
2,3,Deepak Chahar,17,17,64.3,482,22,0.0,21.9,7.47,17.59,0,0,Indian,https://www.iplt20.com/teams/chennai-super-kin...,Chennai Super Kings
3,4,Shreyas Gopal,14,14,48.0,347,20,0.0,17.35,7.22,14.4,0,0,Indian,https://www.iplt20.com/teams/rajasthan-royals/...,Rajasthan Royals
4,5,Jasprit Bumrah,16,16,61.4,409,19,0.0,21.52,6.63,19.47,0,0,Indian,https://www.iplt20.com/teams/mumbai-indians/sq...,Mumbai Indians


In [27]:
def get_bowling_data(year):
    try:
        url = "https://www.iplt20.com/stats/{}/most-wickets".format(year)
        response = requests.get(url)
        bowling_html = response.text
        bowling_soup = bs(bowling_html)
        
        # get the table data
        bowling_table_data = bowling_soup.find(class_ = "js-table")
        
        # get the column names
        col_names = []
        for header in bowling_table_data.find_all('th'):
            col_names.append(header.text.strip())
            
        a_list = []
        for data in bowling_table_data.find_all('td'):
            a_list.append(' '.join(data.text.split()))

        n = 13
        final = [a_list[i:i + n] for i in range(0, len(a_list), n)]
        df = pd.DataFrame(final)
        df.columns = col_names
        
        # Add the nationality of each player in the dataframe
        nationality_list = []
        for index, data in enumerate(bowling_table_data.find_all('tr')[1:]):
            try:
                nationality_list.append(data['data-nationality'])
            except Exception as e:
                print(e)
                print(index)
                # add none 
                nationality_list.append(None)
        df['Nationality'] = nationality_list
        
        
        # Add the player link for more info in the dataframe
        base_url = "https://www.iplt20.com"
        player_link_list = []

        # get all the links and add it to the list
        for data in bowling_table_data.find_all('a'):
            player_link_list.append(base_url + data['href'])

        # create a column with None value
        df[14] = None
        # iterate through each row and create a player name pattern
        for index, row in df.iterrows():
            player_name = row['PLAYER'].replace(' ','-')
            player_regex = re.compile(r"{}".format(player_name),re.IGNORECASE)
            for item in player_link_list:
                # if the pattern matches any links
                if player_regex.search(item) != None:
                    # then append it to that row of the df
                    df.iloc[index,14] = item
        # rename the column            
        df.rename(columns={14:'Player Link'}, inplace=True)


        # extract the player team name from the link and add to the df
        team_regex = r"teams/(\w+-\w+-?\w+)"
        df['Team'] = df['Player Link'].str.extract(team_regex, flags=re.IGNORECASE)
        df['Team'] = df['Team'].apply(lambda x : str(x).title().replace('-',' '))
        
        # convert data types from string to numeric
        df['POS'] = pd.to_numeric(df['POS'], errors='coerce').fillna(0)
        df['Mat'] = pd.to_numeric(df['Mat'], errors='coerce').fillna(0)
        df['Inns'] = pd.to_numeric(df['Inns'], errors='coerce').fillna(0)
        df['Ov'] = pd.to_numeric(df['Ov'], errors='coerce').fillna(0)
        df['Runs'] = pd.to_numeric(df['Runs'], errors='coerce').fillna(0)
        df['Wkts'] = pd.to_numeric(df['Wkts'], errors='coerce').fillna(0)
        df['BBI'] = pd.to_numeric(df['BBI'], errors='coerce').fillna(0)
        df['Avg'] = pd.to_numeric(df['Avg'], errors='coerce').fillna(0)
        df['Econ'] = pd.to_numeric(df['Econ'], errors='coerce').fillna(0)
        df['SR'] = pd.to_numeric(df['SR'], errors='coerce').fillna(0)
        df['4w'] = pd.to_numeric(df['4w'], errors='coerce').fillna(0)
        df['5w'] = pd.to_numeric(df['5w'], errors='coerce').fillna(0)
        
        # extract the dots balls and maidens data
        df2 = get_dots_maidens(year) 
        
        # combine both the dataframes
        df = pd.merge(left=df, right=df2,how='left',on=['PLAYER'])
        # fill missing values
        df.fillna(0, inplace=True)
        
        # add season year
        df['Season'] = year
        
        
    except Exception as e:
        print(e)
        print(year)
        
        
        
    # return dataframe
    return df   
    

In [28]:
bowling_2019_df = get_bowling_data(2019)

In [29]:
bowling_2019_df.head()

Unnamed: 0,POS,PLAYER,Mat,Inns,Ov,Runs,Wkts,BBI,Avg,Econ,SR,4w,5w,Nationality,Player Link,Team,Dots,Maid,Season
0,1,Imran Tahir,17,17,64.2,431,26,0.0,16.57,6.69,14.84,2,0,Overseas,https://www.iplt20.com/teams/chennai-super-kin...,Chennai Super Kings,149.0,1.0,2019
1,2,Kagiso Rabada,12,12,47.0,368,25,0.0,14.72,7.82,11.28,2,0,Overseas,https://www.iplt20.com/teams/delhi-capitals/sq...,Delhi Capitals,113.0,0.0,2019
2,3,Deepak Chahar,17,17,64.3,482,22,0.0,21.9,7.47,17.59,0,0,Indian,https://www.iplt20.com/teams/chennai-super-kin...,Chennai Super Kings,190.0,2.0,2019
3,4,Shreyas Gopal,14,14,48.0,347,20,0.0,17.35,7.22,14.4,0,0,Indian,https://www.iplt20.com/teams/rajasthan-royals/...,Rajasthan Royals,107.0,1.0,2019
4,5,Jasprit Bumrah,16,16,61.4,409,19,0.0,21.52,6.63,19.47,0,0,Indian,https://www.iplt20.com/teams/mumbai-indians/sq...,Mumbai Indians,169.0,1.0,2019


In [30]:
# bowling_df = combine_all_years_data(get_bowling_data, year_list)

In [31]:
# bowling_df.shape

In [32]:
# bowling_df.sample(10)

In [33]:
# save_dataframe(bowling_df, 'bowling.csv', file_path)

## Team Win Losses

In [34]:
win_losses = pd.read_html("https://en.wikipedia.org/wiki/List_of_Indian_Premier_League_records_and_statistics")
win_losses

[                    0                                       1
 0           Countries                                   India
 1       Administrator                                    BCCI
 2              Format                                     T20
 3       First edition                                    2008
 4      Latest edition                                    2020
 5        Next edition                                    2021
 6   Tournament format  Double round-robin league and Playoffs
 7     Number of teams                                       8
 8    Current champion              Mumbai Indians (4th title)
 9     Most successful               Mumbai Indians (4 titles)
 10          Most runs                   Virat Kohli (5430)[1]
 11       Most wickets                 Lasith Malinga (170)[2]
 12                 TV                    List of broadcasters
 13            Website                              iplt20.com,
                                          Tournaments 

In [35]:
win_losses[3]

Unnamed: 0,Team,Span,Matches,Won,Lost,No Result,Tied and won,Tied and lost,Win %,Titles
0,CSK,"2008–2015, 2018–2020",168,101,66,0,1,1,60.77,3
1,DC,2008–2020,180,77,98,2,1,2,44.10,0
2,KXIP,2008–2020,179,81,95,2,1,0,46.08,0
3,KKR,2008–2020,180,93,84,0,3,0,52.50,2
4,MI,2008–2020,190,108,79,2,1,0,57.63,4
5,RR,"2008–2015, 2018–2020",149,75,69,2,1,1,52.04,1
6,RCB,2008–2020,184,84,93,2,1,4,47.50,0
7,SRH,2013–2020,111,58,51,1,1,0,53.15,1
8,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...,Last updated: 28 September 2020[3] Note: Tie+W...


In [36]:
win_losses_df = win_losses[3]

In [37]:
win_losses_df.drop([8], inplace=True)

In [38]:
win_losses_df

Unnamed: 0,Team,Span,Matches,Won,Lost,No Result,Tied and won,Tied and lost,Win %,Titles
0,CSK,"2008–2015, 2018–2020",168,101,66,0,1,1,60.77,3
1,DC,2008–2020,180,77,98,2,1,2,44.1,0
2,KXIP,2008–2020,179,81,95,2,1,0,46.08,0
3,KKR,2008–2020,180,93,84,0,3,0,52.5,2
4,MI,2008–2020,190,108,79,2,1,0,57.63,4
5,RR,"2008–2015, 2018–2020",149,75,69,2,1,1,52.04,1
6,RCB,2008–2020,184,84,93,2,1,4,47.5,0
7,SRH,2013–2020,111,58,51,1,1,0,53.15,1


In [39]:
val_dict = {"CSK": "Chennai Super Kings",
           "DC": "Delhi Capitals",
           "KXIP": "Kings XI Punjab",
           "KKR": "Kolkata Knight Riders",
           "MI": "Mumbai Indians",
           "RR": "Rajasthan Royals",
           "RCB": "Royal Challengers Banglore",
           "SRH": "Sunrisers Hyderabad"}

win_losses_df["Team"] = win_losses_df["Team"].map(val_dict)

In [40]:
win_losses_df

Unnamed: 0,Team,Span,Matches,Won,Lost,No Result,Tied and won,Tied and lost,Win %,Titles
0,Chennai Super Kings,"2008–2015, 2018–2020",168,101,66,0,1,1,60.77,3
1,Delhi Capitals,2008–2020,180,77,98,2,1,2,44.1,0
2,Kings XI Punjab,2008–2020,179,81,95,2,1,0,46.08,0
3,Kolkata Knight Riders,2008–2020,180,93,84,0,3,0,52.5,2
4,Mumbai Indians,2008–2020,190,108,79,2,1,0,57.63,4
5,Rajasthan Royals,"2008–2015, 2018–2020",149,75,69,2,1,1,52.04,1
6,Royal Challengers Banglore,2008–2020,184,84,93,2,1,4,47.5,0
7,Sunrisers Hyderabad,2013–2020,111,58,51,1,1,0,53.15,1


In [41]:
win_losses_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 0 to 7
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Team           8 non-null      object
 1   Span           8 non-null      object
 2   Matches        8 non-null      object
 3   Won            8 non-null      object
 4   Lost           8 non-null      object
 5   No Result      8 non-null      object
 6   Tied and won   8 non-null      object
 7   Tied and lost  8 non-null      object
 8   Win %          8 non-null      object
 9   Titles         8 non-null      object
dtypes: object(10)
memory usage: 704.0+ bytes


In [42]:
print(win_losses_df.columns.to_list())

['Team', 'Span', 'Matches', 'Won', 'Lost', 'No Result', 'Tied and won', 'Tied and lost', 'Win\xa0%', 'Titles']


In [43]:
win_losses_df.rename(columns={'Win\xa0%':'Win %'}, inplace=True)

cols_list = ['Matches', 'Won', 'Lost', 'No Result', 'Tied and won', 'Tied and lost','Win %', 'Titles']

for col in cols_list:
    win_losses_df[col] = pd.to_numeric(win_losses_df[col], errors='coerce').fillna(0)

In [44]:
win_losses_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 0 to 7
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Team           8 non-null      object 
 1   Span           8 non-null      object 
 2   Matches        8 non-null      int64  
 3   Won            8 non-null      int64  
 4   Lost           8 non-null      int64  
 5   No Result      8 non-null      int64  
 6   Tied and won   8 non-null      int64  
 7   Tied and lost  8 non-null      int64  
 8   Win %          8 non-null      float64
 9   Titles         8 non-null      int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 704.0+ bytes


In [45]:
# save_dataframe(win_losses_df, 'wins_losses.csv', file_path)

In [46]:
def get_wins_losses_data():
    win_losses = pd.read_html("https://en.wikipedia.org/wiki/List_of_Indian_Premier_League_records_and_statistics")
    # select the win losses table
    win_losses_df = win_losses[3]
    # drop the last 
    win_losses_df.drop(win_losses_df.index[-1], inplace=True)
    #change names of the teams
    val_dict = {"CSK": "Chennai Super Kings",
           "DC": "Delhi Capitals",
           "KXIP": "Kings XI Punjab",
           "KKR": "Kolkata Knight Riders",
           "MI": "Mumbai Indians",
           "RR": "Rajasthan Royals",
           "RCB": "Royal Challengers Banglore",
           "SRH": "Sunrisers Hyderabad"}

    win_losses_df["Team"] = win_losses_df["Team"].map(val_dict)
    # rename the column
    win_losses_df.rename(columns={'Win\xa0%':'Win %'}, inplace=True)
    # columns list
    cols_list = ['Matches', 'Won', 'Lost', 'No Result', 'Tied and won', 'Tied and lost','Win %', 'Titles']
    # convert data types
    for col in cols_list:
        win_losses_df[col] = pd.to_numeric(win_losses_df[col], errors='coerce').fillna(0)
        
    return win_losses_df

## Bowling All Time

In [47]:
def get_bowling_data_all_time():
    try:
        url = "https://www.iplt20.com/stats/all-time/most-wickets"
        response = requests.get(url)
        bowling_html = response.text
        bowling_soup = bs(bowling_html)
        
        # get the table data
        bowling_table_data = bowling_soup.find(class_ = "js-table")
        
        # get the column names
        col_names = []
        for header in bowling_table_data.find_all('th'):
            col_names.append(header.text.strip())
            
        a_list = []
        for data in bowling_table_data.find_all('td'):
            a_list.append(' '.join(data.text.split()))

        n = 13
        final = [a_list[i:i + n] for i in range(0, len(a_list), n)]
        df = pd.DataFrame(final)
        df.columns = col_names
        
        # Add the nationality of each player in the dataframe
        nationality_list = []
        for index, data in enumerate(bowling_table_data.find_all('tr')[1:]):
            try:
                nationality_list.append(data['data-nationality'])
            except Exception as e:
                print(e)
                print(index)
                # add none 
                nationality_list.append(None)
        df['Nationality'] = nationality_list
        
        
        # Add the player link for more info in the dataframe
        base_url = "https://www.iplt20.com"
        player_link_list = []

        # get all the links and add it to the list
        for data in bowling_table_data.find_all('a'):
            player_link_list.append(base_url + data['href'])

        # create a column with None value
        df[14] = None
        # iterate through each row and create a player name pattern
        for index, row in df.iterrows():
            player_name = row['PLAYER'].replace(' ','-')
            player_regex = re.compile(r"{}".format(player_name),re.IGNORECASE)
            for item in player_link_list:
                # if the pattern matches any links
                if player_regex.search(item) != None:
                    # then append it to that row of the df
                    df.iloc[index,14] = item
        # rename the column            
        df.rename(columns={14:'Player Link'}, inplace=True)


        # extract the player team name from the link and add to the df
        team_regex = r"teams/(\w+-\w+-?\w+)"
        df['Team'] = df['Player Link'].str.extract(team_regex, flags=re.IGNORECASE)
        df['Team'] = df['Team'].apply(lambda x : str(x).title().replace('-',' '))
        
        # convert data types from string to numeric
        df['POS'] = pd.to_numeric(df['POS'], errors='coerce').fillna(0)
        df['Mat'] = pd.to_numeric(df['Mat'], errors='coerce').fillna(0)
        df['Inns'] = pd.to_numeric(df['Inns'], errors='coerce').fillna(0)
        df['Ov'] = pd.to_numeric(df['Ov'], errors='coerce').fillna(0)
        df['Runs'] = pd.to_numeric(df['Runs'].str.replace(',',''), errors='coerce').fillna(0)
        df['Wkts'] = pd.to_numeric(df['Wkts'], errors='coerce').fillna(0)
        #df['BBI'] = pd.to_numeric(df['BBI'], errors='coerce').fillna(0)
        df['Avg'] = pd.to_numeric(df['Avg'], errors='coerce').fillna(0)
        df['Econ'] = pd.to_numeric(df['Econ'], errors='coerce').fillna(0)
        df['SR'] = pd.to_numeric(df['SR'], errors='coerce').fillna(0)
        df['4w'] = pd.to_numeric(df['4w'], errors='coerce').fillna(0)
        df['5w'] = pd.to_numeric(df['5w'], errors='coerce').fillna(0)
        
        
    except Exception as e:
        print(e)
        print(year)
        
        
        
    # return dataframe
    return df   
    

In [48]:
bowling_all_time = get_bowling_data_all_time()

In [49]:
bowling_all_time.head()

Unnamed: 0,POS,PLAYER,Mat,Inns,Ov,Runs,Wkts,BBI,Avg,Econ,SR,4w,5w,Nationality,Player Link,Team
0,1,Lasith Malinga,122,122,471.1,3366,170,5/13,19.8,7.14,16.62,6,1,Overseas,https://www.iplt20.com/teams/mumbai-indians/sq...,Mumbai Indians
1,2,Amit Mishra,149,149,524.5,3857,159,5/17,24.25,7.34,19.8,3,1,Indian,https://www.iplt20.com/teams/delhi-capitals/sq...,Delhi Capitals
2,3,Piyush Chawla,160,159,532.4,4181,154,4/17,27.14,7.84,20.75,2,0,Indian,https://www.iplt20.com/teams/chennai-super-kin...,Chennai Super Kings
3,4,Harbhajan Singh,160,157,562.2,3967,150,5/18,26.44,7.05,22.49,1,1,Indian,https://www.iplt20.com/teams/chennai-super-kin...,Chennai Super Kings
4,5,Dwayne Bravo,134,131,431.0,3617,147,4/22,24.6,8.39,17.59,2,0,Overseas,https://www.iplt20.com/teams/chennai-super-kin...,Chennai Super Kings


In [50]:
bowling_all_time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   POS          100 non-null    int64  
 1   PLAYER       100 non-null    object 
 2   Mat          100 non-null    int64  
 3   Inns         100 non-null    int64  
 4   Ov           100 non-null    float64
 5   Runs         100 non-null    int64  
 6   Wkts         100 non-null    int64  
 7   BBI          100 non-null    object 
 8   Avg          100 non-null    float64
 9   Econ         100 non-null    float64
 10  SR           100 non-null    float64
 11  4w           100 non-null    int64  
 12  5w           100 non-null    int64  
 13  Nationality  100 non-null    object 
 14  Player Link  88 non-null     object 
 15  Team         100 non-null    object 
dtypes: float64(4), int64(7), object(5)
memory usage: 12.6+ KB


In [51]:
# save_dataframe(bowling_all_time, 'bowling_all_time.csv', file_path)