In [11]:
from bs4 import BeautifulSoup
import pandas as pd
import requests as req
import collections
from collections import OrderedDict

In [12]:
bat_stats = ['Matches played', 'Innings batted', 'Not outs', 'Runs scored', 'Highest inns score', 
                'Batting average', 'Balls faced', 'Batting strike rate', 'Hundreds scored', 
                'Fifties scored', 'Boundary fours', 'Boundary sixes', 'Catches taken', 'Stumpings made']
bowl_stats = ['Matches played', 'Innings bowled', 'Balls bowled', 'Runs conceded', 'Wickets taken',
                'Bowling average', 'Economy rate','Bowling strike rate', '4 wickets in inning', 
                '5 wickets in inning', '10 wickets in match']

In [13]:
def combine_stats(urls):
    """
    Given a list of Cricinfo URLs, combines all stats into a one-row DataFrame
    
    Parameters:
    urls (list of str): the list of Cricinfo URLs
    
    Returns:
    DataFrame: the sum of all 15 players' stats in a DataFrame
    
    """
    all_stats = []
    for url in urls:
        all_stats.append(get_player_stats(url))
    counter = collections.Counter()
    num_batters = 0
    num_bowlers = 0
    for stat in all_stats: 
        if bat_stats[5] in stat:
            num_batters += 1
        else:
            num_bowlers += 1
        counter.update(stat)
    newdict = dict(counter)
    for key in newdict.keys():
        if key in bat_stats:
            newdict[key] = newdict[key]/num_batters
        else:
            newdict[key] = newdict[key]/num_bowlers
        newdict[key] = [newdict[key]]
    print(newdict)
    return pd.DataFrame.from_dict(newdict)

In [14]:
def get_player_stats(url):
    """
    Given the URL of the Cricinfo stats page of a player, ouputs their ODI stats as a dict.
    
    If they are listed as a batsman, only their batting/fielding stats will be included.
    If they are listed as a bowler, only their bowling stats will be included.
    If they are listed as an allrounder, both will be included.
    Best innings bowling and best match bowling will not be included.
    
    Parameters:
    url (str): the Cricinfo URL of the player
    
    Returns:
    dict: the relevant ODI stats of the player
    
    """
    
    bs = BeautifulSoup(req.get(url).text)
    paragraphs = bs.find_all('p', {'class': 'ciPlayerinformationtxt'})
    role = ''
    for par in paragraphs:
        if 'Playing role' in par.b.string:
            role = par.span.string.lower()
            
    tables = bs.find_all('table',{'class':'engineTable'}, limit=2)
    stats = OrderedDict()
    if 'batsman' in role or 'allrounder' in role:
        bat_stat_vals = []
        for row in tables[0].find_all('tr'):
            left_column = row.find('td')
            if left_column != None and left_column.get_text() == 'ODIs':
                for stat in row.find_all('td'):
                    value = stat.get_text()
                    if value == 'ODIs' or '/' in value:
                        continue
                    if '*' in value:
                        value = value.replace('*','')
                    value = float(value)
                    bat_stat_vals.append(value)
        if len(bat_stat_vals) == 0:
            return dict()
        stats.update(dict(zip(bat_stats, bat_stat_vals)))
        
    if 'bowler' in role or 'allrounder' in role:
        bowl_stat_vals = []
        for row in tables[1].find_all('tr'):
            left_column = row.find('td')
            if left_column != None and left_column.get_text() == 'ODIs':
                for stat in row.find_all('td'):
                    value = stat.get_text()
                    if value == 'ODIs' or '/' in value or '-' in value:
                        continue
                    if '*' in value:
                        value = value.replace('*','')
                    value = float(value)
                    bowl_stat_vals.append(value)
        if len(bowl_stat_vals) == 0:
            return dict()
        stats.update(dict(zip(bowl_stats, bowl_stat_vals)))
    
    return stats

In [15]:
urls = ['http://www.espncricinfo.com/india/content/player/253802.html',
       'http://www.espncricinfo.com/india/content/player/34102.html',
       'http://www.espncricinfo.com/india/content/player/28235.html',
       'http://www.espncricinfo.com/india/content/player/28081.html',
       'http://www.espncricinfo.com/india/content/player/290716.html',
       'http://www.espncricinfo.com/india/content/player/625371.html',
       'http://www.espncricinfo.com/india/content/player/326016.html',
       'http://www.espncricinfo.com/india/content/player/559235.html',
       'http://www.espncricinfo.com/india/content/player/430246.html',
       'http://www.espncricinfo.com/india/content/player/625383.html',
       'http://www.espncricinfo.com/india/content/player/481896.html',
       'http://www.espncricinfo.com/india/content/player/477021.html',
       'http://www.espncricinfo.com/india/content/player/30045.html',
       'http://www.espncricinfo.com/india/content/player/422108.html',
       'http://www.espncricinfo.com/india/content/player/234675.html']
india_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/pakistan/content/player/227760.html',
       'http://www.espncricinfo.com/pakistan/content/player/1158100.html',
       'http://www.espncricinfo.com/pakistan/content/player/259551.html',
       'http://www.espncricinfo.com/pakistan/content/player/512191.html',
       'http://www.espncricinfo.com/pakistan/content/player/568276.html',
       'http://www.espncricinfo.com/pakistan/content/player/39950.html',
       'http://www.espncricinfo.com/pakistan/content/player/348144.html',
       'http://www.espncricinfo.com/pakistan/content/player/42657.html',
       'http://www.espncricinfo.com/pakistan/content/player/41434.html',
       'http://www.espncricinfo.com/pakistan/content/player/318788.html',
       'http://www.espncricinfo.com/pakistan/content/player/922943.html',
       'http://www.espncricinfo.com/pakistan/content/player/227758.html',
       'http://www.espncricinfo.com/pakistan/content/player/681305.html',
       'http://www.espncricinfo.com/pakistan/content/player/681117.html',
       'http://www.espncricinfo.com/pakistan/content/player/1072470.html']
pakistan_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/england/content/player/24598.html',
       'http://www.espncricinfo.com/england/content/player/8917.html',
       'http://www.espncricinfo.com/england/content/player/297433.html',
       'http://www.espncricinfo.com/england/content/player/308967.html',
       'http://www.espncricinfo.com/england/content/player/662973.html',
       'http://www.espncricinfo.com/england/content/player/12454.html',
       'http://www.espncricinfo.com/england/content/player/249866.html',
       'http://www.espncricinfo.com/england/content/player/19264.html',
       'http://www.espncricinfo.com/england/content/player/244497.html',
       'http://www.espncricinfo.com/england/content/player/303669.html',
       'http://www.espncricinfo.com/england/content/player/298438.html',
       'http://www.espncricinfo.com/england/content/player/311158.html',
       'http://www.espncricinfo.com/england/content/player/308251.html',
       'http://www.espncricinfo.com/england/content/player/247235.html',
       'http://www.espncricinfo.com/england/content/player/351588.html']
england_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/afghanistan/content/player/318340.html',
       'http://www.espncricinfo.com/afghanistan/content/player/793457.html',
       'http://www.espncricinfo.com/afghanistan/content/player/533956.html',
       'http://www.espncricinfo.com/afghanistan/content/player/320652.html',
       'http://www.espncricinfo.com/afghanistan/content/player/440970.html',
       'http://www.espncricinfo.com/afghanistan/content/player/524049.html',
       'http://www.espncricinfo.com/afghanistan/content/player/318339.html',
       'http://www.espncricinfo.com/afghanistan/content/player/25913.html',
       'http://www.espncricinfo.com/afghanistan/content/player/793463.html',
       'http://www.espncricinfo.com/afghanistan/content/player/516561.html',
       'http://www.espncricinfo.com/afghanistan/content/player/440963.html',
       'http://www.espncricinfo.com/afghanistan/content/player/311427.html',
       'http://www.espncricinfo.com/afghanistan/content/player/974109.html',
       'http://www.espncricinfo.com/afghanistan/content/player/352048.html',
       'http://www.espncricinfo.com/afghanistan/content/player/419873.html']
afghanistan_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/australia/content/player/5334.html',
       'http://www.espncricinfo.com/australia/content/player/272477.html',
       'http://www.espncricinfo.com/australia/content/player/326434.html',
       'http://www.espncricinfo.com/australia/content/player/261354.html',
       'http://www.espncricinfo.com/australia/content/player/489889.html',
       'http://www.espncricinfo.com/australia/content/player/215155.html',
       'http://www.espncricinfo.com/australia/content/player/272279.html',
       'http://www.espncricinfo.com/australia/content/player/6683.html',
       'http://www.espncricinfo.com/australia/content/player/325026.html',
       'http://www.espncricinfo.com/australia/content/player/774223.html',
       'http://www.espncricinfo.com/australia/content/player/267192.html',
       'http://www.espncricinfo.com/australia/content/player/311592.html',
       'http://www.espncricinfo.com/australia/content/player/325012.html',
       'http://www.espncricinfo.com/australia/content/player/219889.html',
       'http://www.espncricinfo.com/australia/content/player/379504.html']
australia_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/bangladesh/content/player/373538.html',
       'http://www.espncricinfo.com/bangladesh/content/player/56143.html',
       'http://www.espncricinfo.com/bangladesh/content/player/269237.html',
       'http://www.espncricinfo.com/bangladesh/content/player/550133.html',
       'http://www.espncricinfo.com/bangladesh/content/player/629070.html',
       'http://www.espncricinfo.com/bangladesh/content/player/629063.html',
       'http://www.espncricinfo.com/bangladesh/content/player/300619.html',
       'http://www.espncricinfo.com/bangladesh/content/player/330902.html',
       'http://www.espncricinfo.com/bangladesh/content/player/410763.html',
       'http://www.espncricinfo.com/bangladesh/content/player/56025.html',
       'http://www.espncricinfo.com/bangladesh/content/player/56007.html',
       'http://www.espncricinfo.com/bangladesh/content/player/56194.html',
       'http://www.espncricinfo.com/bangladesh/content/player/536936.html',
       'http://www.espncricinfo.com/bangladesh/content/player/436677.html',
       'http://www.espncricinfo.com/bangladesh/content/player/56029.html']
bangladesh_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/newzealand/content/player/277906.html',
       'http://www.espncricinfo.com/newzealand/content/player/559066.html',
       'http://www.espncricinfo.com/newzealand/content/player/232364.html',
       'http://www.espncricinfo.com/newzealand/content/player/38699.html',
       'http://www.espncricinfo.com/newzealand/content/player/55395.html',
       'http://www.espncricinfo.com/newzealand/content/player/493773.html',
       'http://www.espncricinfo.com/newzealand/content/player/226492.html',
       'http://www.espncricinfo.com/newzealand/content/player/506612.html',
       'http://www.espncricinfo.com/newzealand/content/player/388802.html',
       'http://www.espncricinfo.com/newzealand/content/player/232359.html',
       'http://www.espncricinfo.com/newzealand/content/player/355269.html',
       'http://www.espncricinfo.com/newzealand/content/player/539511.html',
       'http://www.espncricinfo.com/newzealand/content/player/502714.html',
       'http://www.espncricinfo.com/newzealand/content/player/277912.html',
       'http://www.espncricinfo.com/newzealand/content/player/440516.html']
newzealand_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/southafrica/content/player/44828.html',
       'http://www.espncricinfo.com/southafrica/content/player/379143.html',
       'http://www.espncricinfo.com/southafrica/content/player/321777.html',
       'http://www.espncricinfo.com/southafrica/content/player/44932.html',
       'http://www.espncricinfo.com/southafrica/content/player/43906.html',
       'http://www.espncricinfo.com/southafrica/content/player/600498.html',
       'http://www.espncricinfo.com/southafrica/content/player/337790.html',
       'http://www.espncricinfo.com/southafrica/content/player/327830.html',
       'http://www.espncricinfo.com/southafrica/content/player/540316.html',
       'http://www.espncricinfo.com/southafrica/content/player/550215.html',
       'http://www.espncricinfo.com/southafrica/content/player/47492.html',
       'http://www.espncricinfo.com/southafrica/content/player/542023.html',
       'http://www.espncricinfo.com/southafrica/content/player/481979.html',
       'http://www.espncricinfo.com/southafrica/content/player/40618.html',
       'http://www.espncricinfo.com/southafrica/content/player/379145.html']
southafrica_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/srilanka/content/player/227772.html',
       'http://www.espncricinfo.com/srilanka/content/player/49619.html',
       'http://www.espncricinfo.com/srilanka/content/player/49758.html',
       'http://www.espncricinfo.com/srilanka/content/player/784369.html',
       'http://www.espncricinfo.com/srilanka/content/player/300631.html',
       'http://www.espncricinfo.com/srilanka/content/player/465793.html',
       'http://www.espncricinfo.com/srilanka/content/player/629074.html',
       'http://www.espncricinfo.com/srilanka/content/player/328026.html',
       'http://www.espncricinfo.com/srilanka/content/player/49764.html',
       'http://www.espncricinfo.com/srilanka/content/player/233514.html',
       'http://www.espncricinfo.com/srilanka/content/player/222354.html',
       'http://www.espncricinfo.com/srilanka/content/player/49700.html',
       'http://www.espncricinfo.com/srilanka/content/player/301236.html',
       'http://www.espncricinfo.com/srilanka/content/player/370040.html',
       'http://www.espncricinfo.com/srilanka/content/player/324358.html']
srilanka_stats = combine_stats(urls)
urls = ['http://www.espncricinfo.com/westindies/content/player/431901.html',
       'http://www.espncricinfo.com/westindies/content/player/391485.html',
       'http://www.espncricinfo.com/westindies/content/player/315594.html',
       'http://www.espncricinfo.com/westindies/content/player/604302.html',
       'http://www.espncricinfo.com/westindies/content/player/914567.html',
       'http://www.espncricinfo.com/westindies/content/player/581379.html',
       'http://www.espncricinfo.com/westindies/content/player/670025.html',
       'http://www.espncricinfo.com/westindies/content/player/670013.html',
       'http://www.espncricinfo.com/westindies/content/player/495551.html',
       'http://www.espncricinfo.com/westindies/content/player/446101.html',
       'http://www.espncricinfo.com/westindies/content/player/230553.html',
       'http://www.espncricinfo.com/westindies/content/player/277472.html',
       'http://www.espncricinfo.com/westindies/content/player/51880.html',
       'http://www.espncricinfo.com/westindies/content/player/276298.html',
       'http://www.espncricinfo.com/westindies/content/player/457249.html']
westindies_stats = combine_stats(urls)
combined_stats = [india_stats, pakistan_stats, england_stats, afghanistan_stats, australia_stats, bangladesh_stats,
                 newzealand_stats, southafrica_stats, srilanka_stats, westindies_stats]
df = pd.concat(combined_stats)
df.index = ['India', 'Pakistan', 'England', 'Afghanistan', 'Australia', 'Bangladesh', 'New Zealand', 'South Africa',
           'Sri Lanka', 'West Indies']
display(df)

{'Matches played': [157.3], 'Innings batted': [110.0], 'Not outs': [23.1], 'Runs scored': [4089.4], 'Highest inns score': [128.8], 'Batting average': [40.327], 'Balls faced': [4561.3], 'Batting strike rate': [91.667], 'Hundreds scored': [9.2], 'Fifties scored': [21.8], 'Boundary fours': [372.8], 'Boundary sixes': [74.2], 'Catches taken': [73.0], 'Stumpings made': [12.7], 'Innings bowled': [106.4], 'Balls bowled': [5220.2], 'Runs conceded': [4365.0], 'Wickets taken': [144.4], 'Bowling average': [66.916], 'Economy rate': [9.198], 'Bowling strike rate': [77.0], '4 wickets in inning': [5.0], '5 wickets in inning': [1.2], '10 wickets in match': [0.0]}
{'Matches played': [92.0], 'Innings batted': [69.9090909090909], 'Not outs': [10.818181818181818], 'Runs scored': [2223.090909090909], 'Highest inns score': [114.63636363636364], 'Batting average': [39.55], 'Balls faced': [2690.4545454545455], 'Batting strike rate': [85.58909090909088], 'Hundreds scored': [4.0], 'Fifties scored': [12.363636363

Unnamed: 0,Matches played,Innings batted,Not outs,Runs scored,Highest inns score,Batting average,Balls faced,Batting strike rate,Hundreds scored,Fifties scored,...,Innings bowled,Balls bowled,Runs conceded,Wickets taken,Bowling average,Economy rate,Bowling strike rate,4 wickets in inning,5 wickets in inning,10 wickets in match
India,157.3,110.0,23.1,4089.4,128.8,40.327,4561.3,91.667,9.2,21.8,...,106.4,5220.2,4365.0,144.4,66.916,9.198,77.0,5.0,1.2,0.0
Pakistan,92.0,69.909091,10.818182,2223.090909,114.636364,39.55,2690.454545,85.589091,4.0,12.363636,...,102.75,4736.25,3788.75,115.5,71.73,10.6475,81.2,3.0,1.25,0.0
England,112.181818,75.0,11.090909,2544.636364,114.636364,34.098182,2638.272727,91.858182,5.545455,13.909091,...,123.0,5830.25,5518.5,155.25,77.1725,11.86,78.55,7.0,1.75,0.0
Afghanistan,94.222222,59.666667,5.555556,1590.0,104.444444,29.146667,2132.444444,76.084444,1.444444,9.444444,...,75.666667,3359.333333,2574.166667,99.166667,41.308333,7.133333,51.566667,2.833333,1.166667,0.0
Australia,101.75,67.625,5.25,2487.875,131.75,39.115,2710.75,91.815,5.75,13.5,...,45.0,2107.857143,1856.0,63.142857,38.095714,6.112857,42.8,2.857143,0.857143,0.0
Bangladesh,122.090909,82.181818,12.090909,2384.636364,102.363636,31.041818,2976.454545,81.051818,2.818182,15.090909,...,179.5,8313.25,6721.25,210.25,66.355,10.215,79.1,6.5,1.75,0.0
New Zealand,128.555556,86.222222,13.0,2986.0,125.111111,35.52,3464.333333,92.472222,5.777778,17.222222,...,73.666667,3583.666667,3204.666667,103.333333,44.14,7.235,49.066667,3.166667,2.0,0.0
South Africa,163.285714,101.714286,15.857143,3779.571429,138.714286,48.001429,4194.857143,88.97,8.714286,19.285714,...,64.25,2819.0,2347.625,83.875,32.22,5.7125,37.8125,2.875,0.875,0.0
Sri Lanka,100.545455,58.818182,8.909091,1504.636364,94.909091,28.145455,1771.090909,88.938182,1.090909,9.272727,...,182.0,7353.75,6573.25,203.0,110.5475,13.825,118.35,4.75,3.25,0.0
West Indies,97.555556,71.444444,7.444444,2232.0,117.0,29.236667,2618.111111,81.802222,4.444444,10.777778,...,91.833333,4011.166667,3529.166667,102.0,84.775,9.488333,88.8,3.0,1.5,0.0


In [16]:
df.to_csv('Team_data.csv')

In [17]:
# encode = {'team1': {'India':1, 'Pakistan':2, 'England':3, 'Afghanistan':4, 'Australia':5, 'Bangladesh':6, 'New Zealand':7, 'South Africa':8, 'Sri Lanka':9, 'West Indies':10}, 
#          'team2': {'India':1, 'Pakistan':2, 'England':3, 'Afghanistan':4, 'Australia':5, 'Bangladesh':6, 'New Zealand':7, 'South Africa':8, 'Sri Lanka':9, 'West Indies':10},
# 'toss_winner': {'India':1, 'Pakistan':2, 'England':3, 'Afghanistan':4, 'Australia':5, 'Bangladesh':6, 'New Zealand':7, 'South Africa':8, 'Sri Lanka':9, 'West Indies':10},
# 'winner': {'India':1, 'Pakistan':2, 'England':3, 'Afghanistan':4, 'Australia':5, 'Bangladesh':6, 'New Zealand':7, 'South Africa':8, 'Sri Lanka':9, 'West Indies':10}
# matches.replace(encode, inplace=True)

In [18]:
filename = 'ContinousDataset.csv'
file = open(filename)

df2 = pd.read_csv(
    filepath_or_buffer = file,
#     names = ['Scorecard', 'Team1', 'Team2', 'Ground', 'Match Date', 'Winner', 'Venue_Team1', 'Venue_Team2'],
)

FileNotFoundError: [Errno 2] No such file or directory: 'ContinousDataset.csv'

In [None]:
df2.rename(index=str, columns={"Team 1": "Team1", "Team 2": "Team2"}, inplace=True)
match_date= list(df2['Match Date'])
new_match_date = []
for date in match_date:
    new_match_date.append(date[-4:])
df2['Match Date']=new_match_date
df2 = df2[2358:]
df2.head(10)



In [None]:
count_match=0
count=0
for i in range(len(df2['Team1'])):
    if((df2['Team1'].iloc[i]=='India' or df2['Team2'].iloc[i]=='India') and df2['Host_Country'].iloc[i]=='India'):
        count_match+=1
        if(df2['Winner'].iloc[i]=='India'):
            count+=1
        

In [None]:
count/count_match  #percentage of home wins

In [None]:
# encode = {'Team1': {'India':1, 'Pakistan':2, 'England':3, 'Afghanistan':4, 'Australia':5, 'Bangladesh':6, 
#                      'New Zealand':7, 'South Africa':8, 'Sri Lanka':9, 'West Indies':10, 'Kenya':0, 'Scotland':0, 
#                      'P.N.G':0, 'Zimbabwe':0, 'Canada':0, 'Hong Kong':0, 'U.A.E.':0, 'Ireland':0, 'Namibia':0, 
#                      'East Africa':0, 'Bermuda':0, 'Netherlands':0, 'U.S.A':0}, 
#           'Team2': {'India':1, 'Pakistan':2, 'England':3, 'Afghanistan':4, 'Australia':5, 'Bangladesh':6, 
#                      'New Zealand':7, 'South Africa':8, 'Sri Lanka':9, 'West Indies':10, 'Kenya':0, 'Scotland':0, 
#                      'P.N.G':0, 'Zimbabwe':0, 'Canada':0, 'Hong Kong':0, 'U.A.E.':0, 'Ireland':0, 'Namibia':0, 
#                      'East Africa':0, 'Bermuda':0, 'Netherlands':0, 'U.S.A':0},
#           'Winner': {'India':1, 'Pakistan':2, 'England':3, 'Afghanistan':4, 'Australia':5, 'Bangladesh':6, 
#                      'New Zealand':7, 'South Africa':8, 'Sri Lanka':9, 'West Indies':10, 'Kenya':0, 'Scotland':0, 
#                      'P.N.G':0, 'Zimbabwe':0, 'Canada':0, 'Hong Kong':0, 'U.A.E.':0, 'Ireland':0, 'Namibia':0, 
#                      'East Africa':0, 'Bermuda':0, 'Netherlands':0, 'U.S.A':0},
#           'Host_Country': {'India':1, 'Pakistan':2, 'England':3, 'Afghanistan':4, 'Australia':5, 'Bangladesh':6, 
#                      'New Zealand':7, 'South Africa':8, 'Sri Lanka':9, 'West Indies':10, 'Kenya':0, 'Scotland':0, 
#                      'P.N.G':0, 'Zimbabwe':0, 'Canada':0, 'Hong Kong':0, 'U.A.E.':0, 'Ireland':0, 'Namibia':0, 
#                      'East Africa':0, 'Bermuda':0, 'Netherlands':0, 'U.S.A':0}}
# df2.replace(encode, inplace=True)
          
          

In [None]:
df2= df2[df2.Team1 != 0]
df2= df2[df2.Team2 != 0]
df2= df2[df2.Host_Country != 0]
team1 = list(df2['Team1'])
# pd.get_dummies(team1)

In [None]:
filename = ''
file = open(filename)

df2 = pd.read_csv(
    filepath_or_buffer = file,
#     names = ['Scorecard', 'Team1', 'Team2', 'Ground', 'Match Date', 'Winner', 'Venue_Team1', 'Venue_Team2'],
)

In [None]:
x_data = df2[:1186]
x_data= x_data[['Team1', 'Team2', 'Host_Country', 'Winner', 'Match Date']]
x_data['Match Date'] = df2['Match Date'].astype('int32')
y_data = df2['Winner'][:1186]
