In [0]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import sys
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')

In [0]:
def fetch_match_data(years,no_of_matches=1):

    # create DataFrames to store match information
    columns_for_match_level_data = ['Team 1','Team 2','Winner','Margin','Ground','Match Date']

    IPL_matches_df = pd.DataFrame(columns = columns_for_match_level_data)

    all_team_links_dict = dict()
    
    Base_URL = ["http://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=",";trophy=117;type=season"]
    if isinstance(years,list) == False or len(years) == 0:
        raise ValueError('Etiher the year is not passed in a list or the list is empty')
    
    try:


        # loop for all the years
        for year in tqdm(years):
            # build the URL
            url = str(year).join(Base_URL)

            print(f'collecting match level data for year {year}....')
            source = requests.get(url).text
            soup = BeautifulSoup(source, 'lxml')
            data_table = soup.find('table','engineTable')
            # print(str(data_table))
            all_links = data_table.find_all('a')
            df_temp = pd.read_html(str(data_table))[0]
            IPL_matches_df = IPL_matches_df.append(df_temp,ignore_index = True)
            # http://stats.espncricinfo.com/ci/content/team/335974.html
            for link in all_links:
                if "/ci/content/team/" in link['href']:
                    all_team_links_dict[link.text] = 'http://stats.espncricinfo.com'+str(link['href'])
                    
        return IPL_matches_df,all_team_links_dict

    except Exception as e:
        print(e)

In [3]:
# fetch_match_data([2019])
df,team_links = fetch_match_data([2009,2011,2012,2013,2014,2015,2016,2017,2018,2019])

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

collecting match level data for year 2009....
collecting match level data for year 2011....
collecting match level data for year 2012....
collecting match level data for year 2013....
collecting match level data for year 2014....
collecting match level data for year 2015....
collecting match level data for year 2016....
collecting match level data for year 2017....
collecting match level data for year 2018....
collecting match level data for year 2019....



In [4]:
team_links

{'Capitals': 'http://stats.espncricinfo.com/ci/content/team/335975.html',
 'Chargers': 'http://stats.espncricinfo.com/ci/content/team/335980.html',
 'Daredevils': 'http://stats.espncricinfo.com/ci/content/team/335975.html',
 'Guj Lions': 'http://stats.espncricinfo.com/ci/content/team/968725.html',
 'KKR': 'http://stats.espncricinfo.com/ci/content/team/335971.html',
 'Kings XI': 'http://stats.espncricinfo.com/ci/content/team/335973.html',
 'Kochi': 'http://stats.espncricinfo.com/ci/content/team/474668.html',
 'Mum Indians': 'http://stats.espncricinfo.com/ci/content/team/335978.html',
 'RCB': 'http://stats.espncricinfo.com/ci/content/team/335970.html',
 'Royals': 'http://stats.espncricinfo.com/ci/content/team/335977.html',
 'Sunrisers': 'http://stats.espncricinfo.com/ci/content/team/628333.html',
 'Super Kings': 'http://stats.espncricinfo.com/ci/content/team/335974.html',
 'Supergiant': 'http://stats.espncricinfo.com/ci/content/team/968721.html',
 'Supergiants': 'http://stats.espncricinf

In [0]:
def get_team_full_names(team_links):
    team_name_dict = dict()
    for team,link in tqdm(team_links.items()):
        try:
            source = requests.get(link).text
            soup = BeautifulSoup(source, 'lxml')
            full_team_name = soup.find('span','ClubhouseHeader__DisplayName')
            team_name_dict[team] = full_team_name.text
        except Exception as e:
            print(e)
    return team_name_dict

In [6]:
Full_team_names_dict = get_team_full_names(team_links)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [7]:
Full_team_names_dict

{'Capitals': 'Delhi Capitals',
 'Chargers': 'Deccan Chargers',
 'Daredevils': 'Delhi Capitals',
 'Guj Lions': 'Gujarat Lions',
 'KKR': 'Kolkata Knight Riders',
 'Kings XI': 'Kings XI Punjab',
 'Kochi': 'Kochi Tuskers Kerala',
 'Mum Indians': 'Mumbai Indians',
 'RCB': 'Royal Challengers Bangalore',
 'Royals': 'Rajasthan Royals',
 'Sunrisers': 'Sunrisers Hyderabad',
 'Super Kings': 'Chennai Super Kings',
 'Supergiant': 'Rising Pune Supergiant',
 'Supergiants': 'Rising Pune Supergiant',
 'Warriors': 'Pune Warriors'}

In [0]:
def convert_to_full_name(team_short_name):
    if team_short_name in Full_team_names_dict.keys():
        return Full_team_names_dict[team_short_name]
    else:
        return team_short_name

df['Team 1'] = df['Team 1'].apply(convert_to_full_name)
df['Team 2'] = df['Team 2'].apply(convert_to_full_name)

In [0]:
df['Winner'] = df['Winner'].apply(convert_to_full_name)

In [10]:
df = df.drop('Scorecard',axis = 1)
df

Unnamed: 0,Ground,Margin,Match Date,Team 1,Team 2,Winner
0,Cape Town,19 runs,"Apr 18, 2009",Chennai Super Kings,Mumbai Indians,Mumbai Indians
1,Cape Town,75 runs,"Apr 18, 2009",Royal Challengers Bangalore,Rajasthan Royals,Royal Challengers Bangalore
2,Cape Town,10 wickets,"Apr 19, 2009",Delhi Capitals,Kings XI Punjab,Delhi Capitals
3,Cape Town,8 wickets,"Apr 19, 2009",Deccan Chargers,Kolkata Knight Riders,Deccan Chargers
4,Port Elizabeth,92 runs,"Apr 20, 2009",Royal Challengers Bangalore,Chennai Super Kings,Chennai Super Kings
...,...,...,...,...,...,...
640,Mumbai,9 wickets,"May 5, 2019",Mumbai Indians,Kolkata Knight Riders,Mumbai Indians
641,Chennai,6 wickets,"May 7, 2019",Mumbai Indians,Chennai Super Kings,Mumbai Indians
642,Visakhapatnam,2 wickets,"May 8, 2019",Delhi Capitals,Sunrisers Hyderabad,Delhi Capitals
643,Visakhapatnam,6 wickets,"May 10, 2019",Chennai Super Kings,Delhi Capitals,Chennai Super Kings


In [0]:
# Now we have to collect data for year 2010 and 2008 from howstats.com

def collect_Match_level_data_2008_2010():
    link_2008 = 'http://www.howstat.com/cricket/Statistics/IPL/MatchList.asp?s=2008'
    link_2010 = 'http://www.howstat.com/cricket/Statistics/IPL/MatchList.asp?s=2010'
    try:
        source_2008 = requests.get(link_2008).text
        source_2010 = requests.get(link_2010).text
        soup_2008 = BeautifulSoup(source_2008, 'lxml')
        soup_2010 = BeautifulSoup(source_2010, 'lxml')  

        table_2008 = soup_2008.find('table','TableLined')
        table_2010 = soup_2010.find('table','TableLined')

        # print(len(table_2008))
        # print(table_2008.prettify())
        

        df_2008 = pd.read_html(str(table_2008))[0]
        df_2008 = df_2008.drop(6,axis = 1)
        df_2008.columns = ['season','match','Match Date','teams','Ground','result']
        df_2008 = df_2008.drop(0)
        df_2008 = df_2008.drop(df_2008.shape[0])
        df_2010 = pd.read_html(str(table_2010))[0]
        df_2010 = df_2010.drop(6,axis = 1)
        df_2010.columns = ['season','match','Match Date','teams','Ground','result']
        df_2010 = df_2010.drop(0)
        df_2010 = df_2010.drop(df_2010.shape[0])

        return df_2008,df_2010
    except Exception as e:
        print(e)

In [0]:
df_2008,df_2010 = collect_Match_level_data_2008_2010()

In [13]:
df_2010.head(2)

Unnamed: 0,season,match,Match Date,teams,Ground,result
1,2010,1st,12/03/2010,Deccan v. Kolkata,Mumbai,Kolkata by 11 Runs
2,2010,2nd,13/03/2010,Mumbai v. Rajasthan,Mumbai,Mumbai by 4 Runs


In [0]:
# we have to convert this dataFrame into the same format as others

for df_ in [df_2008,df_2010]:
    df_.drop(['season','match'],axis=1,inplace=True)
    df_['Margin'] = df_['result'].apply(lambda x: ' '.join(x.strip().split(' ')[2:]))
    df_['Winner'] = df_['result'].apply(lambda x: (x.strip().split(' ')[0].strip()))
    df_['Team 1'] = df_['teams'].apply(lambda x: (x.strip().split('v')[0]))
    df_['Team 2'] = df_['teams'].apply(lambda x: (x.strip().split('v')[1]))
    df_.drop(['teams','result'],axis = 1,inplace=True)
    

In [15]:
df_2010.head(2)

Unnamed: 0,Match Date,Ground,Margin,Winner,Team 1,Team 2
1,12/03/2010,Mumbai,11 Runs,Kolkata,Deccan,. Kolkata
2,13/03/2010,Mumbai,4 Runs,Mumbai,Mumbai,. Rajasthan


In [16]:
df_2008 = df_2008[['Ground','Margin','Match Date','Team 1','Team 2','Winner']]
df_2010 = df_2010[['Ground','Margin','Match Date','Team 1','Team 2','Winner']]

df_2010.head(3)

Unnamed: 0,Ground,Margin,Match Date,Team 1,Team 2,Winner
1,Mumbai,11 Runs,12/03/2010,Deccan,. Kolkata,Kolkata
2,Mumbai,4 Runs,13/03/2010,Mumbai,. Rajasthan,Mumbai
3,"Mohali, Chandigarh",5 Wickets,13/03/2010,Punjab,. Delhi,Delhi


In [0]:
df_2010['Team 2'] = df_2010['Team 2'].apply(lambda x: x.split(' ')[1])
df_2008['Team 2'] = df_2008['Team 2'].apply(lambda x: x.split(' ')[1])

In [18]:
df_2010.head(2)

Unnamed: 0,Ground,Margin,Match Date,Team 1,Team 2,Winner
1,Mumbai,11 Runs,12/03/2010,Deccan,Kolkata,Kolkata
2,Mumbai,4 Runs,13/03/2010,Mumbai,Rajasthan,Mumbai


In [0]:
def add_full_team_name(name):
    name_to_rt = None
    for t in Full_team_names_dict.values():
        if name in t:
            name_to_rt = t
    if name_to_rt != None:
        return name_to_rt
    else:
        return name
    

# for df_ in [df_2008,df_2010]:

df_2008['Team 1'] = df_2008['Team 1'].apply(add_full_team_name)
df_2008['Team 2'] = df_2008['Team 2'].apply(add_full_team_name)
df_2008['Winner'] = df_2008['Winner'].apply(add_full_team_name)

df_2010['Team 1'] = df_2010['Team 1'].apply(add_full_team_name)
df_2010['Team 2'] = df_2010['Team 2'].apply(add_full_team_name)
df_2010['Winner'] = df_2010['Winner'].apply(add_full_team_name)


In [20]:
df_2010.tail(5)

Unnamed: 0,Ground,Margin,Match Date,Team 1,Team 2,Winner
56,Kolkata,9 Wickets,19/04/2010,Kolkata Knight Riders,Mumbai Indians,Kolkata Knight Riders
57,Mumbai,35 Runs,21/04/2010,Bangalore,Mumbai Indians,Mumbai Indians
58,Mumbai,38 Runs,22/04/2010,Chennai Super Kings,Deccan Chargers,Chennai Super Kings
59,Mumbai,9 Wickets,24/04/2010,Bangalore,Deccan Chargers,Royal Challengers Bangalore
60,Mumbai,22 Runs,25/04/2010,Chennai Super Kings,Mumbai Indians,Chennai Super Kings


In [21]:
df.tail()

Unnamed: 0,Ground,Margin,Match Date,Team 1,Team 2,Winner
640,Mumbai,9 wickets,"May 5, 2019",Mumbai Indians,Kolkata Knight Riders,Mumbai Indians
641,Chennai,6 wickets,"May 7, 2019",Mumbai Indians,Chennai Super Kings,Mumbai Indians
642,Visakhapatnam,2 wickets,"May 8, 2019",Delhi Capitals,Sunrisers Hyderabad,Delhi Capitals
643,Visakhapatnam,6 wickets,"May 10, 2019",Chennai Super Kings,Delhi Capitals,Chennai Super Kings
644,Hyderabad (Deccan),1 run,"May 12, 2019",Mumbai Indians,Chennai Super Kings,Mumbai Indians


In [0]:
df = df.append(df_2010,ignore_index = True)
df = df.append(df_2008,ignore_index = True)

In [23]:
df.tail()

Unnamed: 0,Ground,Margin,Match Date,Team 1,Team 2,Winner
759,"Bengaluru, Bangalore",9 Wickets,28/05/2008,Bangalore,Mumbai Indians,Mumbai Indians
760,"Mohali, Chandigarh",41 Runs,28/05/2008,Punjab,Rajasthan Royals,Kings XI Punjab
761,Mumbai,105 Runs,30/05/2008,Delhi Capitals,Rajasthan Royals,Rajasthan Royals
762,Mumbai,9 Wickets,31/05/2008,Punjab,Chennai Super Kings,Chennai Super Kings
763,Mumbai,3 Wickets,01/06/2008,Chennai Super Kings,Rajasthan Royals,Rajasthan Royals


In [24]:
df.shape

(764, 6)

In [0]:
df.to_csv('/content/drive/My Drive/data/Full_Match_Data.csv')