In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# Importing to DFs

In [2]:
all_df = pd.read_csv('IPL_Since2022_Cleaned.csv')

all_df.pop('Unnamed: 0');

all_df.fillna(0, inplace=True)

In [3]:
all_df[all_df['wides']<1].groupby('striker').agg({"runs_off_bat":'sum', 'ball':'count'}).sort_values('runs_off_bat', ascending=False).nlargest(2, 'runs_off_bat')

Unnamed: 0_level_0,runs_off_bat,ball
striker,Unnamed: 1_level_1,Unnamed: 2_level_1
V Kohli,6624,5129
S Dhawan,6244,4942


# Cleaning

In [4]:
# Creating df for only runs that have come off the bat

all_df_no_wides = all_df[all_df['wides']==0]

In [5]:
# Converting "ball" column to string to split it and then converting overs and ball number back to "int"

all_df_no_wides['ball'] = all_df_no_wides['ball'].astype('str');

all_df_no_wides[['Over', 'Ball']] = all_df_no_wides.ball.str.split('.', expand=True);

all_df_no_wides['Over'] = all_df_no_wides['Over'].astype('int');

all_df_no_wides['Ball'] = all_df_no_wides['Ball'].astype('int');

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_df_no_wides['ball'] = all_df_no_wides['ball'].astype('str');
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_df_no_wides[['Over', 'Ball']] = all_df_no_wides.ball.str.split('.', expand=True);
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_df_no_wides[['Over', 'Ball']] = all_df_no_wides.ba

In [6]:
all_df_no_wides.head(2)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,Over,Ball
0,335982,2008,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,0.0,0.0,1.0,0.0,0,0,0.0,0.0,0,1
1,335982,2008,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0,2


# Getting venue records on overall level (sum total)

In [7]:
venue_overs_overall = all_df_no_wides.groupby(['venue', 'Over']).agg({'runs_off_bat':'sum', 'ball':'count'}).reset_index().sort_values(['venue', 'Over'])

In [8]:
venue_overs_overall.Over.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

In [9]:
# Changing start from 0 to 1

venue_overs_overall['Over'] = venue_overs_overall['Over'] + 1 

In [10]:
venue_overs_overall.Over.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20], dtype=int64)

In [11]:
venue_overs_overall.shape

(980, 4)

In [12]:
# phase definitions

powerplay = list(range(1,7))
middle = list(range(7,16))
death = list(range(16,21))

In [13]:
# Labelling overs by phase

powerplay_venue_df = venue_overs_overall[venue_overs_overall['Over'].isin(powerplay)]

powerplay_venue_df['Over_type'] = 'powerplay'

powerplay_venue_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  powerplay_venue_df['Over_type'] = 'powerplay'


(294, 5)

In [14]:
# Labelling overs by phase

middle_venue_df = venue_overs_overall[venue_overs_overall['Over'].isin(middle)]

middle_venue_df['Over_type'] = 'middle overs'

middle_venue_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  middle_venue_df['Over_type'] = 'middle overs'


(441, 5)

In [15]:
# Labelling overs by phase

death_venue_df = venue_overs_overall[venue_overs_overall['Over'].isin(death)]

death_venue_df['Over_type'] = 'death overs'

death_venue_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_venue_df['Over_type'] = 'death overs'


(245, 5)

In [16]:
venue_overs_overall_labelled = pd.concat([powerplay_venue_df, middle_venue_df, death_venue_df])

In [17]:
venue_overs_overall_labelled.shape

(980, 5)

In [18]:
venue_overs_overall_labelled.head(5)

Unnamed: 0,venue,Over,runs_off_bat,ball,Over_type
0,Arun Jaitley Stadium,1,160,168,powerplay
1,Arun Jaitley Stadium,2,220,168,powerplay
2,Arun Jaitley Stadium,3,260,168,powerplay
3,Arun Jaitley Stadium,4,252,168,powerplay
4,Arun Jaitley Stadium,5,213,168,powerplay


# Subgrouping by year, venue, over

In [19]:
venue_overs_by_year = all_df_no_wides.groupby(['season', 'venue', 'Over']).agg({'runs_off_bat':'sum', 'ball':'count'}).reset_index().sort_values(['venue', 'Over'])

In [20]:
venue_overs_by_year.shape

(3000, 5)

In [21]:
venue_overs_by_year.head()

Unnamed: 0,season,venue,Over,runs_off_bat,ball
2300,2018,Arun Jaitley Stadium,0,74,84
2500,2019,Arun Jaitley Stadium,0,86,84
2301,2018,Arun Jaitley Stadium,1,111,84
2501,2019,Arun Jaitley Stadium,1,109,84
2302,2018,Arun Jaitley Stadium,2,138,84


In [22]:
venue_overs_by_year['Over'] = venue_overs_by_year['Over']+1

In [23]:
# Labelling overs by phase

powerplay_venue_df = venue_overs_by_year[venue_overs_by_year['Over'].isin(powerplay)]

powerplay_venue_df['Over_type'] = 'powerplay'

powerplay_venue_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  powerplay_venue_df['Over_type'] = 'powerplay'


(900, 6)

In [24]:
# Labelling overs by phase

middle_venue_df = venue_overs_by_year[venue_overs_by_year['Over'].isin(middle)]

middle_venue_df['Over_type'] = 'middle overs'

middle_venue_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  middle_venue_df['Over_type'] = 'middle overs'


(1350, 6)

In [25]:
# Labelling overs by phase

death_venue_df = venue_overs_by_year[venue_overs_by_year['Over'].isin(death)]

death_venue_df['Over_type'] = 'death overs'

death_venue_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_venue_df['Over_type'] = 'death overs'


(750, 6)

In [26]:
venue_overs_by_year_labelled = pd.concat([powerplay_venue_df, middle_venue_df, death_venue_df])

In [27]:
venue_overs_by_year_labelled.shape

(3000, 6)

In [28]:
np.sort(venue_overs_by_year_labelled.season.unique())

array([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019, 2020, 2021, 2022], dtype=int64)

In [29]:
# venue_overs_by_year_labelled.pop('Over')

In [30]:
venue_overs_by_year_labelled.head()

Unnamed: 0,season,venue,Over,runs_off_bat,ball,Over_type
2300,2018,Arun Jaitley Stadium,1,74,84,powerplay
2500,2019,Arun Jaitley Stadium,1,86,84,powerplay
2301,2018,Arun Jaitley Stadium,2,111,84,powerplay
2501,2019,Arun Jaitley Stadium,2,109,84,powerplay
2302,2018,Arun Jaitley Stadium,3,138,84,powerplay


In [31]:
# Further grouping by Over_type (phase of the innings)

venue_overs_by_year_labelled_v2 = venue_overs_by_year_labelled.groupby(['season', 'venue', 'Over_type']).agg({'runs_off_bat': 'sum', 'ball':'sum'}).reset_index()

In [32]:
venue_overs_by_year_labelled_v2.head()

Unnamed: 0,season,venue,Over_type,runs_off_bat,ball
0,2008,Dr DY Patil Sports Academy,death overs,206,155
1,2008,Dr DY Patil Sports Academy,middle overs,538,418
2,2008,Dr DY Patil Sports Academy,powerplay,320,289
3,2008,Eden Gardens,death overs,416,285
4,2008,Eden Gardens,middle overs,785,718


#### Testing 

In [33]:
Bangalore_2016 = venue_overs_by_year[(venue_overs_by_year['season']==2016) & (venue_overs_by_year['venue']=='M Chinnaswamy Stadium')]

In [34]:
Bangalore_2016['Venue_RpO'] = Bangalore_2016['runs_off_bat']/Bangalore_2016['ball']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bangalore_2016['Venue_RpO'] = Bangalore_2016['runs_off_bat']/Bangalore_2016['ball']


In [35]:
Bangalore_2016.head()

Unnamed: 0,season,venue,Over,runs_off_bat,ball,Venue_RpO
1940,2016,M Chinnaswamy Stadium,1,95,108,0.87963
1941,2016,M Chinnaswamy Stadium,2,104,108,0.962963
1942,2016,M Chinnaswamy Stadium,3,124,108,1.148148
1943,2016,M Chinnaswamy Stadium,4,164,108,1.518519
1944,2016,M Chinnaswamy Stadium,5,182,108,1.685185


# Further subgrouping by player

In [36]:
player_venue_overs_overall = all_df_no_wides.groupby(['season', 'venue', 'Over', 'striker']).agg({'runs_off_bat':'sum', 'ball':'count'}).reset_index().sort_values(['season', 'venue', 'Over'])

In [37]:
player_venue_overs_overall.shape

(56951, 6)

In [38]:
player_venue_overs_overall.head(5)

Unnamed: 0,season,venue,Over,striker,runs_off_bat,ball
0,2008,Dr DY Patil Sports Academy,0,AC Gilchrist,9,5
1,2008,Dr DY Patil Sports Academy,0,G Gambhir,1,5
2,2008,Dr DY Patil Sports Academy,0,GC Smith,2,4
3,2008,Dr DY Patil Sports Academy,0,L Ronchi,5,4
4,2008,Dr DY Patil Sports Academy,0,Niraj Patel,1,5


In [39]:
player_venue_overs_overall['Over'] = player_venue_overs_overall['Over']+1

In [40]:
# player_venue_overs_overall[(player_venue_overs_overall['venue']=='M.Chinnaswamy Stadium') & (player_venue_overs_overall['striker']=='AD Hales')]['runs_off_bat'].sum()

In [41]:
# player_venue_overs_overall[(player_venue_overs_overall['venue']=='M.Chinnaswamy Stadium') & (player_venue_overs_overall['striker']=='AD Hales')]['ball'].sum()

In [42]:
powerplay_df = player_venue_overs_overall[player_venue_overs_overall['Over'].isin(powerplay)]

powerplay_df['Over_type'] = 'powerplay'

powerplay_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  powerplay_df['Over_type'] = 'powerplay'


(15235, 7)

In [43]:
middle_df = player_venue_overs_overall[player_venue_overs_overall['Over'].isin(middle)]

middle_df['Over_type'] = 'middle overs'

middle_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  middle_df['Over_type'] = 'middle overs'


(26741, 7)

In [44]:
death_df = player_venue_overs_overall[player_venue_overs_overall['Over'].isin(death)]

death_df['Over_type'] = 'death overs'

death_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_df['Over_type'] = 'death overs'


(14975, 7)

In [45]:
player_venue_overs_grouped = pd.concat([powerplay_df, middle_df, death_df])

In [46]:
player_venue_overs_grouped.head(5)

Unnamed: 0,season,venue,Over,striker,runs_off_bat,ball,Over_type
0,2008,Dr DY Patil Sports Academy,1,AC Gilchrist,9,5,powerplay
1,2008,Dr DY Patil Sports Academy,1,G Gambhir,1,5,powerplay
2,2008,Dr DY Patil Sports Academy,1,GC Smith,2,4,powerplay
3,2008,Dr DY Patil Sports Academy,1,L Ronchi,5,4,powerplay
4,2008,Dr DY Patil Sports Academy,1,Niraj Patel,1,5,powerplay


In [47]:
player_venue_overs_grouped_v2 = player_venue_overs_grouped.groupby(['season', 'venue','striker', 'Over_type']).agg({'runs_off_bat': 'sum', 'ball':'sum'}).reset_index()

In [48]:
# player_venue_overs_grouped_v2[(player_venue_overs_grouped_v2['season']==2018) & (player_venue_overs_grouped_v2['venue']=='M.Chinnaswamy Stadium') & (player_venue_overs_grouped_v2['striker']=='AB de Villiers')]

# Cleaning repeating venue names (optional)

In [49]:
np.sort(all_df.venue.unique())

array(['Arun Jaitley Stadium', 'Arun Jaitley Stadium, Delhi',
       'Barabati Stadium', 'Brabourne Stadium',
       'Brabourne Stadium, Mumbai', 'Buffalo Park',
       'De Beers Diamond Oval', 'Dr DY Patil Sports Academy',
       'Dr DY Patil Sports Academy, Mumbai',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Dubai International Cricket Stadium', 'Eden Gardens',
       'Eden Gardens, Kolkata', 'Feroz Shah Kotla', 'Green Park',
       'Himachal Pradesh Cricket Association Stadium',
       'Holkar Cricket Stadium', 'JSCA International Stadium Complex',
       'Kingsmead', 'M Chinnaswamy Stadium', 'M.Chinnaswamy Stadium',
       'MA Chidambaram Stadium', 'MA Chidambaram Stadium, Chepauk',
       'MA Chidambaram Stadium, Chepauk, Chennai',
       'Maharashtra Cricket Association Stadium',
       'Maharashtra Cricket Association Stadium, Pune',
       'Narendra Modi Stadium, Ahmedabad', 'Nehru Stadium',
       'New Wanderers Stadium', 'Newlands', 'OUTsurance Oval

# Merging DFs

In [50]:
# player_venue_overs_grouped_v2[(player_venue_overs_grouped_v2['striker']=='AB de Villiers') & (player_venue_overs_grouped_v2['season']==2016) & (player_venue_overs_grouped_v2['venue']=='M Chinnaswamy Stadium')]

In [51]:
# Matches count per player per season per venue

matches_by_venue = all_df_no_wides.groupby(['season','venue', 'striker']).agg({'runs_off_bat':'sum', 'Ball':'count', 'match_id':pd.Series.nunique}).reset_index()

#Renaming Columns

matches_by_venue.rename({'match_id': 'matched_played'}, axis='columns', inplace=True)

matches_by_venue_needed_cols = matches_by_venue[['season', 'venue', 'striker', 'matched_played']]

In [52]:
# matches_by_venue[(matches_by_venue['striker']=='AB de Villiers') & (matches_by_venue['season']==2016) & (matches_by_venue['venue']=='M Chinnaswamy Stadium')]

In [53]:
matches_by_venue_needed_cols.head()

Unnamed: 0,season,venue,striker,matched_played
0,2008,Dr DY Patil Sports Academy,AB de Villiers,1
1,2008,Dr DY Patil Sports Academy,AC Gilchrist,1
2,2008,Dr DY Patil Sports Academy,AM Nayar,3
3,2008,Dr DY Patil Sports Academy,AM Rahane,1
4,2008,Dr DY Patil Sports Academy,CK Kapugedera,1


In [54]:
venue_overs_by_year_labelled_v2.rename({'runs_off_bat':'runs_off_bat_venue', 'ball':'ball_venue'}, axis='columns', inplace=True)

In [55]:
venue_overs_by_year_labelled_v2

Unnamed: 0,season,venue,Over_type,runs_off_bat_venue,ball_venue
0,2008,Dr DY Patil Sports Academy,death overs,206,155
1,2008,Dr DY Patil Sports Academy,middle overs,538,418
2,2008,Dr DY Patil Sports Academy,powerplay,320,289
3,2008,Eden Gardens,death overs,416,285
4,2008,Eden Gardens,middle overs,785,718
...,...,...,...,...,...
445,2022,"Narendra Modi Stadium, Ahmedabad",middle overs,245,216
446,2022,"Narendra Modi Stadium, Ahmedabad",powerplay,172,144
447,2022,"Wankhede Stadium, Mumbai",death overs,1911,1097
448,2022,"Wankhede Stadium, Mumbai",middle overs,2822,2272


In [56]:
# # Merging venue df (labelled by overs phase) and players df

player_venue_by_season_merged = pd.merge(player_venue_overs_grouped_v2, venue_overs_by_year_labelled_v2, on=['season', 'venue', 'Over_type'], how='inner')

In [57]:
player_venue_by_season_merged.head()

Unnamed: 0,season,venue,striker,Over_type,runs_off_bat,ball,runs_off_bat_venue,ball_venue
0,2008,Dr DY Patil Sports Academy,AB de Villiers,powerplay,21,17,320,289
1,2008,Dr DY Patil Sports Academy,AC Gilchrist,powerplay,47,26,320,289
2,2008,Dr DY Patil Sports Academy,AM Rahane,powerplay,0,7,320,289
3,2008,Dr DY Patil Sports Academy,G Gambhir,powerplay,1,9,320,289
4,2008,Dr DY Patil Sports Academy,GC Smith,powerplay,5,9,320,289


In [58]:
player_venue_by_season_merged.shape

(14396, 8)

In [59]:
# Merging with DF that has matches played by each player in each venue in each season

player_venue_by_season_merged_v2 = pd.merge(player_venue_by_season_merged, matches_by_venue_needed_cols, on=['season', 'venue', 'striker'], how='inner')

In [60]:
player_venue_by_season_merged_v2.shape

(14396, 9)

In [61]:
player_venue_by_season_merged_v2.head()

Unnamed: 0,season,venue,striker,Over_type,runs_off_bat,ball,runs_off_bat_venue,ball_venue,matched_played
0,2008,Dr DY Patil Sports Academy,AB de Villiers,powerplay,21,17,320,289,1
1,2008,Dr DY Patil Sports Academy,AC Gilchrist,powerplay,47,26,320,289,1
2,2008,Dr DY Patil Sports Academy,AC Gilchrist,middle overs,62,21,538,418,1
3,2008,Dr DY Patil Sports Academy,AM Rahane,powerplay,0,7,320,289,1
4,2008,Dr DY Patil Sports Academy,G Gambhir,powerplay,1,9,320,289,1


In [62]:
# Computing RpO for both players and Venues

player_venue_by_season_merged_v2['Venue_RpO'] = (player_venue_by_season_merged_v2['runs_off_bat_venue']/player_venue_by_season_merged_v2['ball_venue']) * 6

player_venue_by_season_merged_v2['Player_RpO'] = (player_venue_by_season_merged_v2['runs_off_bat']/player_venue_by_season_merged_v2['ball']) * 6

In [63]:
player_venue_by_season_merged_v2.head()

Unnamed: 0,season,venue,striker,Over_type,runs_off_bat,ball,runs_off_bat_venue,ball_venue,matched_played,Venue_RpO,Player_RpO
0,2008,Dr DY Patil Sports Academy,AB de Villiers,powerplay,21,17,320,289,1,6.643599,7.411765
1,2008,Dr DY Patil Sports Academy,AC Gilchrist,powerplay,47,26,320,289,1,6.643599,10.846154
2,2008,Dr DY Patil Sports Academy,AC Gilchrist,middle overs,62,21,538,418,1,7.722488,17.714286
3,2008,Dr DY Patil Sports Academy,AM Rahane,powerplay,0,7,320,289,1,6.643599,0.0
4,2008,Dr DY Patil Sports Academy,G Gambhir,powerplay,1,9,320,289,1,6.643599,0.666667


In [64]:
# Getting total matches played by a player in each season

matches_per_season = all_df_no_wides.groupby(['season', 'venue', 'striker']).agg({'runs_off_bat':'sum', 'match_id':pd.Series.nunique}).reset_index().groupby(['season', 'striker']).agg({'match_id':'sum'}).reset_index()

matches_per_season.rename({'match_id': 'total_matches_played_season'}, axis='columns', inplace=True)

In [65]:
matches_per_season.head()

Unnamed: 0,season,striker,total_matches_played_season
0,2008,A Chopra,5
1,2008,A Kumble,4
2,2008,A Mishra,3
3,2008,A Mukund,1
4,2008,A Nehra,3


In [66]:
player_venue_by_season_merged_v3 = pd.merge(player_venue_by_season_merged_v2, matches_per_season, on=['season','striker'], how='left')

In [67]:
player_venue_by_season_merged_v3.head(2)

Unnamed: 0,season,venue,striker,Over_type,runs_off_bat,ball,runs_off_bat_venue,ball_venue,matched_played,Venue_RpO,Player_RpO,total_matches_played_season
0,2008,Dr DY Patil Sports Academy,AB de Villiers,powerplay,21,17,320,289,1,6.643599,7.411765,6
1,2008,Dr DY Patil Sports Academy,AC Gilchrist,powerplay,47,26,320,289,1,6.643599,10.846154,14


In [68]:
# Renaming columns

player_venue_by_season_merged_v3.rename({'matched_played':'matches_played_venue'}, axis='columns', inplace=True)

In [69]:
player_venue_by_season_merged_v3.head(2)

Unnamed: 0,season,venue,striker,Over_type,runs_off_bat,ball,runs_off_bat_venue,ball_venue,matches_played_venue,Venue_RpO,Player_RpO,total_matches_played_season
0,2008,Dr DY Patil Sports Academy,AB de Villiers,powerplay,21,17,320,289,1,6.643599,7.411765,6
1,2008,Dr DY Patil Sports Academy,AC Gilchrist,powerplay,47,26,320,289,1,6.643599,10.846154,14


In [70]:
player_venue_by_season_merged_v3.shape

(14396, 12)

In [71]:
# Calculating Weighted matches

player_venue_by_season_merged_v3['Weighted Matches'] = round(player_venue_by_season_merged_v3['matches_played_venue']/player_venue_by_season_merged_v3['total_matches_played_season'], 4)

# Calculating RpO difference

player_venue_by_season_merged_v3['Run Diff.'] = round(player_venue_by_season_merged_v3['Player_RpO'] - player_venue_by_season_merged_v3['Venue_RpO'], 4)

# Calculating Weighted RpO

player_venue_by_season_merged_v3['Weighted RpO'] = round(player_venue_by_season_merged_v3['Run Diff.']*player_venue_by_season_merged_v3['Weighted Matches'], 4)

In [72]:
player_venue_by_season_merged_v3.head(2)

Unnamed: 0,season,venue,striker,Over_type,runs_off_bat,ball,runs_off_bat_venue,ball_venue,matches_played_venue,Venue_RpO,Player_RpO,total_matches_played_season,Weighted Matches,Run Diff.,Weighted RpO
0,2008,Dr DY Patil Sports Academy,AB de Villiers,powerplay,21,17,320,289,1,6.643599,7.411765,6,0.1667,0.7682,0.1281
1,2008,Dr DY Patil Sports Academy,AC Gilchrist,powerplay,47,26,320,289,1,6.643599,10.846154,14,0.0714,4.2026,0.3001


In [73]:
player_venue_by_season_merged_v3['Name'] = list(zip(player_venue_by_season_merged_v3.striker, player_venue_by_season_merged_v3.season))

In [74]:
player_venue_by_season_merged_v3.rename({'striker':'Player'}, axis='columns', inplace=True)

In [75]:
player_venue_by_season_merged_v3.head(2)

Unnamed: 0,season,venue,Player,Over_type,runs_off_bat,ball,runs_off_bat_venue,ball_venue,matches_played_venue,Venue_RpO,Player_RpO,total_matches_played_season,Weighted Matches,Run Diff.,Weighted RpO,Name
0,2008,Dr DY Patil Sports Academy,AB de Villiers,powerplay,21,17,320,289,1,6.643599,7.411765,6,0.1667,0.7682,0.1281,"(AB de Villiers, 2008)"
1,2008,Dr DY Patil Sports Academy,AC Gilchrist,powerplay,47,26,320,289,1,6.643599,10.846154,14,0.0714,4.2026,0.3001,"(AC Gilchrist, 2008)"


In [76]:
player_venue_by_season_merged_v3.shape

(14396, 16)

# Getting Number of outs per season

In [77]:
all_df_no_wides.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'Over', 'Ball'],
      dtype='object')

In [78]:
outs_per_season = pd.DataFrame(all_df.groupby('season')['player_dismissed'].value_counts()).rename({'player_dismissed':'Outs'}, axis=1).reset_index().iloc[1:, :]

outs_per_season.rename({'player_dismissed':'Player'}, axis='columns', inplace=True)

In [79]:
outs_per_season.head(2)

Unnamed: 0,season,Player,Outs
1,2008,YK Pathan,14
2,2008,AC Gilchrist,13


In [80]:
# # Joining this on venue-player DF

# player_venue_by_season_merged_v4 = pd.merge(player_venue_by_season_merged_v3, outs_per_season, on=['season', 'Player'], how='left')

In [81]:
# player_venue_by_season_merged_v4.head(2)

# Runs per season for every player

In [82]:
player_total_runs_per_season = all_df_no_wides.groupby(['season', 'striker']).agg({'runs_off_bat':'sum'}).reset_index()

In [83]:
player_total_runs_per_season.head(2)

Unnamed: 0,season,striker,runs_off_bat
0,2008,A Chopra,42
1,2008,A Kumble,13


In [84]:
player_total_runs_per_season['Name'] = list(zip(player_total_runs_per_season.striker, player_total_runs_per_season.season))

In [85]:
player_total_runs_per_season.head(2)

Unnamed: 0,season,striker,runs_off_bat,Name
0,2008,A Chopra,42,"(A Chopra, 2008)"
1,2008,A Kumble,13,"(A Kumble, 2008)"


In [86]:
# over_500_runs = player_total_runs_per_season[player_total_runs_per_season['runs_off_bat']>=500]

In [87]:
# over_500_runs.head(2)

# Getting necessary values and plotting

In [88]:
over_500_runs = player_total_runs_per_season[player_total_runs_per_season['runs_off_bat']>=500]

over_500_runs.head(2)

Unnamed: 0,season,striker,runs_off_bat,Name
39,2008,G Gambhir,534,"(G Gambhir, 2008)"
115,2008,SE Marsh,616,"(SE Marsh, 2008)"


In [89]:
needed_values = np.array(over_500_runs.Name)

In [90]:
np.shape(needed_values)

(65,)

In [93]:
season_runs_above_500_NVRR = player_venue_by_season_merged_v3[player_venue_by_season_merged_v3['Name'].isin(needed_values)]

In [94]:
season_runs_above_500_NVRR.head(2)

Unnamed: 0,season,venue,Player,Over_type,runs_off_bat,ball,runs_off_bat_venue,ball_venue,matches_played_venue,Venue_RpO,Player_RpO,total_matches_played_season,Weighted Matches,Run Diff.,Weighted RpO,Name
4,2008,Dr DY Patil Sports Academy,G Gambhir,powerplay,1,9,320,289,1,6.643599,0.666667,14,0.0714,-5.9769,-0.4268,"(G Gambhir, 2008)"
27,2008,Dr DY Patil Sports Academy,ST Jayasuriya,powerplay,50,39,320,289,3,6.643599,7.692308,14,0.2143,1.0487,0.2247,"(ST Jayasuriya, 2008)"


In [95]:
req_df = season_runs_above_500_NVRR.groupby(['season','Player', 'Name']).agg({"runs_off_bat":'sum', 'ball':'sum','Weighted RpO':'sum'}).reset_index()

In [96]:
req_df_v2 = pd.merge(req_df, outs_per_season, on=['Player', 'season'], how='left')

In [97]:
req_df_v2['BpD'] = round(req_df_v2['ball']/req_df_v2['Outs'],2)

In [98]:
# req_df_v2.sort_values('Weighted RpO', ascending=False).to_csv('above_500_weighted_overs_sorted.csv')

In [99]:
# season_runs_above_500_NVRR.to_csv('above_500_weighted_overs_sorted_over_wise.csv')

In [100]:
# fig = px.scatter(req_df_v2, x = 'BpD', y='Weighted RpO', text='Name')
# fig.update_traces(textposition='top center')
# #fig.add_hline(y=0)
# #fig.add_vline(x=20)
# # #fig.update_layout(title_text='Life Expectency', title_x=0.5)
# fig.show()

In [235]:
# req_df_phase_wise = season_runs_above_500_NVRR.groupby(['season','Player', 'Name', 'Over_type']).agg({"runs_off_bat":'sum', 'ball':'sum','Weighted RpO':'sum'}).reset_index()

In [236]:
# req_df_phase_wise.sort_values('Weighted RpO', ascending=False).to_csv('above_500_weighted_overs_sorted_2.csv')

In [112]:
# death_since_2018 = venue_overs_by_year_labelled_v2[(venue_overs_by_year_labelled_v2['Over_type']=='death overs') & (venue_overs_by_year_labelled_v2['season'].isin([2018,2019,2020,2021,2022]))]

In [113]:
# death_since_2018.agg({'runs_off_bat_venue':'sum', 'ball_venue':'sum'})

In [125]:
# venue_overs_by_year_labelled_v2[(venue_overs_by_year_labelled_v2['Over_type']=='death overs') & (venue_overs_by_year_labelled_v2['season'].isin([2018,2019,2020,2022]))].agg({'runs_off_bat_venue':'sum', 'ball_venue':'sum'})

In [126]:
# 21815/13556*6