In [345]:
import urllib3, os
import pandas as pd
# Module-level urllib3 connection pool; Twenty20Data.get_year_wise_data sends its GET requests through it.
http = urllib3.PoolManager()
from bs4 import BeautifulSoup

class Twenty20Data:
    """Scrape year-by-year Twenty20 match lists from howstat.com into CSV files.

    For every configured year the match-list page is downloaded, its
    ``TableLined`` table is converted to a DataFrame, cleaned by
    ``preprocess_df`` and written to ``<path>/T20_Data_<year>.csv``.
    """

    def __init__(self, path=r"C:\Users\Alok\Downloads\dataset\cricket",
                 start_year=2005, end_year=2021):
        """Store the output folder, the URL template and the per-year ranges.

        Args:
            path: directory the CSV files are written to. Defaults to the
                location that used to be hard-coded.
            start_year: first year to scrape (inclusive).
            end_year: last year to scrape (inclusive).
        """
        self.path = path
        self.url = "http://howstat.com/cricket/Statistics/Matches/MatchList_T20.asp?Group=%s&v=M&Range=%s"
        # Each entry is (YYYY0101YYYY1231, "YYYY"): the value for the URL's
        # ``Group`` parameter followed by the value for ``Range``.
        self.date_range = [("%s0101%s1231" % (year, year), str(year))
                           for year in range(start_year, end_year + 1)]

    def get_year_wise_data(self):
        """Download every year in ``self.date_range`` and save one CSV per year.

        Raises:
            RuntimeError: if a request does not return HTTP status 200.
            ValueError: if a page has no ``TableLined`` table or the table
                has no usable rows (raised by ``extract``).
        """
        # Create the output folder up front so a missing directory does not
        # surface only after the first download has completed.
        if self.path:
            os.makedirs(self.path, exist_ok=True)
        for date_span, year in self.date_range:
            print("Scraping data for year %s !!"%year)
            response = http.request('GET', self.url % (date_span, year))
            if response.status != 200:
                raise RuntimeError("Request for year %s failed with HTTP status %s"
                                   % (year, response.status))
            # errors='replace' keeps a stray non-UTF-8 byte from aborting the run.
            html = response.data.decode('utf-8', errors='replace')
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table", {"class": "TableLined"})
            df = self.extract(table)
            df.to_csv(os.path.join(self.path, "T20_Data_%s.csv" % year), index=False)
        print("Data saved to given path!")

    def find_value_column(self, row):
        """Return the winning team for one match row, or ``"draw"``.

        Args:
            row: object exposing ``team1``, ``team2`` and ``result`` attributes
                (for example a row Series of the preprocessed frame).

        Returns:
            The team whose name occurs in ``row.result``; ``"draw"`` when
            neither name occurs. Non-string values (such as a missing cell)
            never match.
        """
        result = row.result
        if not isinstance(result, str):
            return "draw"
        team1, team2 = row.team1, row.team2
        team1_found = isinstance(team1, str) and team1 in result
        team2_found = isinstance(team2, str) and team2 in result
        # When team1's name is contained in team2's name (e.g. "ireland" and
        # "northern ireland"), a hit on team2 always produces a hit on team1
        # as well, so the longer name is the real match.
        if team1_found and team2_found and team1 in team2:
            return team2
        if team1_found:
            return team1
        if team2_found:
            return team2
        return "draw"

    def get_teams(self, row):
        """Split ``row.countries`` on ``"v."`` and return the stripped parts as a list."""
        return [item.strip() for item in row.countries.split("v.")]

    def preprocess_df(self, df):
        """Add ``team1``, ``team2`` and ``winner`` columns and drop unused ones.

        The input frame is left unmodified; a new frame is returned.

        Args:
            df: frame with at least the columns ``Serial``, ``countries``,
                ``ground``, ``result`` and ``&nbsp``.

        Returns:
            A new DataFrame without ``Serial``, ``&nbsp`` and ``ground`` and
            with ``team1``, ``team2`` and ``winner`` appended.
        """
        # Split with plain str.partition instead of Series.str.split:
        # pandas treats a multi-character pattern as a regular expression, so
        # the '.' in " v. " acted as a wildcard, and the two-column assignment
        # failed whenever the split did not yield exactly two columns.
        first_teams, second_teams = [], []
        for countries in df["countries"]:
            if isinstance(countries, str):
                first, separator, second = countries.partition(" v. ")
                first_teams.append(first)
                second_teams.append(second if separator else None)
            else:
                first_teams.append(None)
                second_teams.append(None)
        frame = df.assign(team1=first_teams, team2=second_teams)
        # A list built row by row also works for an empty frame, where
        # DataFrame.apply(axis=1) would not return a column-shaped result.
        frame["winner"] = [self.find_value_column(row) for _, row in frame.iterrows()]
        return frame.drop(["Serial", "&nbsp", "ground"], axis=1)

    def extract(self, table):
        """Convert the scraped match table into a preprocessed DataFrame.

        The text of every ``td`` cell is stripped and lower-cased, and empty
        cells are discarded. The first non-empty row supplies the column
        names (prefixed with ``"Serial"``); the remaining rows are the data.

        Args:
            table: parsed table element exposing ``find_all``.

        Returns:
            The DataFrame produced by ``preprocess_df``.

        Raises:
            ValueError: if ``table`` is None or contains no non-empty rows.
        """
        if table is None:
            raise ValueError("No match table found in the page")
        rows = []
        for table_row in table.find_all('tr'):
            cells = [cell.text.strip().lower() for cell in table_row.find_all('td')]
            cells = [text for text in cells if text]
            # Rows without any text (spacers, rows without td cells) carry no data.
            if cells:
                rows.append(cells)
        if not rows:
            raise ValueError("Match table contains no rows")
        header, records = rows[0], rows[1:]
        df = pd.DataFrame(records, columns=["Serial"] + header)
        return self.preprocess_df(df)

In [346]:
# Build the scraper; the constructor only stores the output path, URL template and per-year ranges.
t20 = Twenty20Data()

In [347]:
# Fetch every year in t20.date_range over HTTP and write one CSV per year into t20.path.
t20.get_year_wise_data()

Scraping data for year 2005 !!
Scraping data for year 2006 !!
Scraping data for year 2007 !!
Scraping data for year 2008 !!
Scraping data for year 2009 !!
Scraping data for year 2010 !!
Scraping data for year 2011 !!
Scraping data for year 2012 !!
Scraping data for year 2013 !!
Scraping data for year 2014 !!
Scraping data for year 2015 !!
Scraping data for year 2016 !!
Scraping data for year 2017 !!
Scraping data for year 2018 !!
Scraping data for year 2019 !!
Scraping data for year 2020 !!
Scraping data for year 2021 !!
Data saved to given path!
