In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Bat ML

In [3]:
bat_df = pd.read_csv("../input/south-africa-cricket-data-from-2000-to-2020/bat.csv")
match_df = pd.read_csv("../input/south-africa-cricket-data-from-2000-to-2020/mat.csv", parse_dates=["match_date"])
opposition_df = pd.read_csv("../input/south-africa-cricket-data-from-2000-to-2020/opposition.csv")
player_df = pd.read_csv("../input/south-africa-cricket-data-from-2000-to-2020/player.csv")
ground_df = pd.read_csv("../input/south-africa-cricket-data-from-2000-to-2020/ground.csv")

In [4]:
bat_df.shape

(3934, 8)

In [5]:
bat_df.describe()

Unnamed: 0,player,mat,runs,ball,M,_4s,_6s,strike_rate
count,3934.0,3934.0,3934.0,3934.0,3934.0,3934.0,3934.0,3934.0
mean,95981.080834,424756.8,27.982715,33.500508,42.209202,2.531774,0.375191,77.576002
std,126650.660419,355021.7,30.470857,32.189661,50.80032,3.502842,1.884419,49.11889
min,40618.0,64657.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
25%,44932.0,65787.0,5.0,9.0,11.0,0.0,0.0,50.0
50%,45789.0,350348.0,17.0,23.0,29.0,1.0,0.0,75.0
75%,47015.0,649099.0,41.0,47.75,64.0,4.0,0.0,100.0
max,698189.0,1185321.0,185.0,160.0,257.0,21.0,16.0,460.0


In [6]:
# Prepare for merging
bat_df.rename(columns={"player":"player_id", "mat":"match_id"}, inplace=True)
match_df.rename(columns={"opposition":"opposition_id", "ground":"venue_id"}, inplace=True)
opposition_df.rename(columns={"opp_id":"opposition_id", "opp_name":"opposition_name", "rating":"opposition_rating"}, inplace=True)
ground_df.rename(columns={"ground_id":"venue_id", "ground_name":"venue_name", "country":"venue_country"}, inplace=True)
player_df.rename(columns={"name":"player_name"}, inplace=True)

In [7]:
df = bat_df.merge(
    player_df, how="left", on="player_id"
).merge(
    match_df, how="left", on="match_id"
).merge(
    ground_df, how="left", on="venue_id"
).merge(
    opposition_df, how="left", on="opposition_id"
)

print(f"Data shape: {df.shape}")

Data shape: (3934, 26)


In [8]:
df.columns

Index(['player_id', 'match_id', 'runs', 'ball', 'M', '_4s', '_6s',
       'strike_rate', 'player_name', 'odi_debut', 'playing_role',
       'batting_style', 'bowling_style', 'fielding_position', 'odi_no',
       'opposition_id', 'venue_id', 'match_date', 'toss', 'series', 'result',
       'match_days', 'venue_name', 'venue_country', 'opposition_name',
       'opposition_rating'],
      dtype='object')

### Data cleaning
* remove duplicates (note: np.nan != np.nan)
* Remove absent hurt (balls==0)
* sort by date (reindex too)
* nan is -99 for most columns, replace with 0

In [9]:
print(f"Old shape: {df.shape}")

# Drop duplicates
df.drop_duplicates(["player_id", "match_id", "runs", "ball"], inplace=True)

# Remove absent hurt
df.drop(df[df["ball"]==0].index, inplace=True)

print(f"New shape: {df.shape}")

Old shape: (3934, 26)
New shape: (3449, 26)


In [10]:
# Sort by match date
df.sort_values("match_date", axis=0, ignore_index=True, inplace=True)

# Replace -99 with 0
df.replace(-99, 0, inplace=True)

In [12]:
df.describe()

Unnamed: 0,player_id,match_id,runs,ball,M,_4s,_6s,strike_rate,odi_no,opposition_id,venue_id,opposition_rating
count,3449.0,3449.0,3449.0,3449.0,3449.0,3449.0,3449.0,3449.0,3449.0,3449.0,3449.0,3449.0
mean,98855.23717,429703.0,28.18846,33.603653,44.907219,2.582198,0.403885,77.761305,2771.299507,6.553784,78694.56,96.781096
std,130750.395555,365300.9,30.525281,32.191617,45.710885,3.151572,1.022427,48.913158,819.93483,5.762867,84303.2,26.356283
min,40618.0,64657.0,0.0,0.0,0.0,0.0,0.0,0.0,1544.0,1.0,56293.0,0.0
25%,44932.0,66097.0,5.0,9.0,10.0,0.0,0.0,50.0,2030.0,2.0,58792.0,85.0
50%,45789.0,299009.0,17.0,23.0,29.0,1.0,0.0,75.0,2691.0,6.0,59089.0,102.0
75%,47255.0,722339.0,41.0,48.0,65.0,4.0,0.0,100.0,3522.0,8.0,59306.0,116.0
max,698189.0,1185321.0,185.0,160.0,257.0,21.0,16.0,460.0,4254.0,40.0,1126475.0,125.0


Null values in Opposition_rating has to be replaced with their correct value. I couldn't find ODI men rating for Canada and Kenya, so we'll keep them at 0. [1](https://www.icc-cricket.com/rankings/mens/team-rankings/odi) and [2](https://sports.ndtv.com/cricket/icc-rankings) gives Netherlands a rating of 44

In [13]:
opposition_df[opposition_df["opposition_rating"]==-99]

Unnamed: 0,opposition_id,opposition_name,opposition_rating
8,15,Netherlands,-99
9,17,Canada,-99
11,26,Kenya,-99


In [14]:
# Replace Netherlands rating to 44
df.loc[df["opposition_name"]=="Netherlands", 'opposition_rating'] = 44

Lastly, probably worth noting that strike rate was **rounded down** for runs/ball.

### Batting attributes
* No. of innings = count(player appearance before that match)
* Batting average = sum(runs) / No. of innings
* Strike rate = 100 * sum(runs) / sum(ball)
* fifties = count(innings where 50 <= run < 100 )
* centuries = count(innings where 100 <= run)
* zeros = count(innings where run==0)
* highest score = max(runs)


### Derived attributes 
* Consistency: entire career
* Form: last 12 months
* Opposition: opposition team
* Venue: venue

In [59]:
# The training start from 2017 because all odi_debut for players after 2016 is more than the start date(2000) of df
train = df.loc[df["match_date"] >= "2017"]

print(f"Train shape: {train.shape}")

Train shape: (458, 26)


In [60]:
train.head()

Unnamed: 0,player_id,match_id,runs,ball,M,_4s,_6s,strike_rate,player_name,odi_debut,...,venue_id,match_date,toss,series,result,match_days,venue_name,venue_country,opposition_name,opposition_rating
2991,379143,936159,34,40,56,4,0,85.0,Quinton de Kock,2013-01-19,...,59159,2017-01-28,"South Africa , elected to field first",Sri Lanka tour of South Africa,South Africa led the 5-match series 1-0,28 January 2017 (50-over match),St George's Park,South Africa,Sri Lanka,85
2992,43906,936159,57,71,110,5,1,80.28,Hashim Amla,2008-03-09,...,59159,2017-01-28,"South Africa , elected to field first",Sri Lanka tour of South Africa,South Africa led the 5-match series 1-0,28 January 2017 (50-over match),St George's Park,South Africa,Sri Lanka,85
2993,44828,936159,55,68,85,5,0,80.88,Faf du Plessis,2011-01-18,...,59159,2017-01-28,"South Africa , elected to field first",Sri Lanka tour of South Africa,South Africa led the 5-match series 1-0,28 January 2017 (50-over match),St George's Park,South Africa,Sri Lanka,85
2994,44936,936159,30,27,32,3,1,111.11,AB de Villiers,2005-02-02,...,59159,2017-01-28,"South Africa , elected to field first",Sri Lanka tour of South Africa,South Africa led the 5-match series 1-0,28 January 2017 (50-over match),St George's Park,South Africa,Sri Lanka,85
2995,44932,936161,11,19,27,0,0,57.89,Jean-Paul Duminy,2004-08-20,...,59089,2017-02-01,"Sri Lanka , elected to field first",Sri Lanka tour of South Africa,South Africa led the 5-match series 2-0,1 February 2017 - day/night match (50-over match),Kingsmead,South Africa,Sri Lanka,85


In [26]:
def get_basic_attributes(df, bins, labels):
    no_of_innings = df.shape[0]
    if no_of_innings==0:
        return 0,0,0,0,0,0,0
    
    runs_scored = df["runs"].sum()
    balls_faced = df["ball"].sum()
    batting_average = np.around(runs_scored / no_of_innings, 2)
    strike_rate = np.around((runs_scored/balls_faced) * 100, 2)
    centuries = df[df["runs"] >= 100].shape[0]
    fifties = df[(df["runs"]>=50) & (df["runs"]<100)].shape[0]
    zeros = df[df["runs"]==0].shape[0]
    highest_score = df["runs"].max()
    
    no_of_innings = pd.cut([no_of_innings], bins["no_of_innings"], right=False, labels=labels["no_of_innings"])[0]
    batting_average = pd.cut([batting_average], bins["batting_average"], right=False, labels=labels["batting_average"])[0]
    strike_rate = pd.cut([strike_rate], bins["strike_rate"], right=False, labels=labels["strike_rate"])[0]
    centuries = pd.cut([centuries], bins["centuries"], right=False, labels=labels["centuries"])[0]
    fifties = pd.cut([fifties], bins["fifties"], right=False, labels=labels["fifties"])[0]
    if bins["zeros"]:
        zeros = pd.cut([zeros], bins["zeros"], right=False, labels=labels["zeros"])[0]
    if bins["highest_score"]:
        highest_score = pd.cut([highest_score], bins["highest_score"], right=False, labels=labels["highest_score"])[0]
    
    
    return no_of_innings, batting_average, strike_rate, centuries, fifties, zeros, highest_score

In [40]:
def consistency_attribute(row):
    mask = (
        (df["player_id"] == row["player_id"])
        & (df["match_date"] < row["match_date"])
    )
    consistency_df = df[mask]
    
    bins = {
        "no_of_innings": [1,50,100,125,150,np.inf],
        "batting_average": [0,10,20,30,40,np.inf],
        "strike_rate": [0,50,60,80,100,np.inf],
        "centuries": [0,1,5,10,15,20,np.inf],
        "fifties": [0,1,10,20,30,40,np.inf],
        "zeros": [0,1,5,10,15,20,np.inf],
        "highest_score": None,
    }
    labels = {
        "no_of_innings": [1,2,3,4,5],
        "batting_average": [1,2,3,4,5],
        "strike_rate": [1,2,3,4,5],
        "centuries": [0,1,2,3,4,5],
        "fifties": [0,1,2,3,4,5],
        "zeros": [0,1,2,3,4,5],
        "highest_score": None,
    }
    
    no_of_innings, batting_average, strike_rate, centuries, fifties, zeros, highest_score = get_basic_attributes(consistency_df, bins, labels)
    consistency = (0.4262*batting_average + 0.2566*no_of_innings + 0.1510*strike_rate + 0.0787*centuries + 0.0556*fifties - 0.0328*zeros)
    
    return np.around(consistency, 4)

In [39]:
def form_attribute(row):
    mask = (
        (df["player_id"] == row["player_id"])
        & (df["match_date"] < row["match_date"])
        & (row["match_date"]-df["match_date"] <= np.timedelta64(1, "Y"))
    )
    form_df = df[mask]
    bins = {
        "no_of_innings": [1,5,10,12,15,np.inf],
        "batting_average": [0,10,20,30,40,np.inf],
        "strike_rate": [0,50,60,80,100,np.inf],
        "centuries": [0,1,2,3,4,5,np.inf],
        "fifties": [0,1,3,5,7,10,np.inf],
        "zeros": [0,1,2,3,4,5,np.inf],
        "highest_score": None,
    }
    labels = {
        "no_of_innings": [1,2,3,4,5],
        "batting_average": [1,2,3,4,5],
        "strike_rate": [1,2,3,4,5],
        "centuries": [0,1,2,3,4,5],
        "fifties": [0,1,2,3,4,5],
        "zeros": [0,1,2,3,4,5],
        "highest_score": None,
    }
    
    no_of_innings, batting_average, strike_rate, centuries, fifties, zeros, highest_score = get_basic_attributes(form_df, bins, labels)
    form = (0.4262*batting_average + 0.2566*no_of_innings + 0.1510*strike_rate + 0.0787*centuries + 0.0556*fifties - 0.0328*zeros)
    
    return np.around(form, 4)

In [37]:
def opposition_attribute(row):
    mask = (
        (df["player_id"] == row["player_id"])
        & (df["match_date"] < row["match_date"])
        & (df["opposition_id"] == row["opposition_id"])
    )
    opposition_df = df[mask]
    
    bins = {
        "no_of_innings": [1,3,5,7,10,np.inf],
        "batting_average": [0,10,20,30,40,np.inf],
        "strike_rate": [0,50,60,80,100,np.inf],
        "centuries": [0,1,2,3,np.inf],
        "fifties": [0,1,3,5,7,10,np.inf],
        "zeros": [0,1,2,3,4,5,np.inf],
        "highest_score":  None,
    }
    labels = {
        "no_of_innings": [1,2,3,4,5],
        "batting_average": [1,2,3,4,5],
        "strike_rate": [1,2,3,4,5],
        "centuries": [0,3,4,5],
        "fifties": [0,1,2,3,4,5],
        "zeros": [0,1,2,3,4,5],
        "highest_score": None,
    }
    
    no_of_innings, batting_average, strike_rate, centuries, fifties, zeros, highest_score = get_basic_attributes(opposition_df, bins, labels)
    opposition = (0.4262*batting_average + 0.2566*no_of_innings + 0.1510*strike_rate + 0.0787*centuries + 0.0556*fifties - 0.0328*zeros)
    
    return np.around(opposition, 4)

In [38]:
def venue_attribute(row):
    mask = (
        (df["player_id"] == row["player_id"])
        & (df["match_date"] < row["match_date"])
        & (df["venue_country"] == row["venue_country"])
    )
    venue_df = df[mask]
    
    bins = {
        "no_of_innings": [1,2,3,4,5,np.inf],
        "batting_average": [0,10,20,30,40,np.inf],
        "strike_rate": [0,50,60,80,100,np.inf],
        "centuries": [0,1,2,np.inf],
        "fifties": [0,1,2,np.inf],
        "zeros": None,
        "highest_score": [0,1,25,50,100,150,np.inf],
    }
    labels = {
        "no_of_innings": [1,2,3,4,5],
        "batting_average": [1,2,3,4,5],
        "strike_rate": [1,2,3,4,5],
        "centuries": [0,4,5],
        "fifties": [0,4,5],
        "zeros": None,
        "highest_score": [0,1,2,3,4,5],
    }
    
    no_of_innings, batting_average, strike_rate, centuries, fifties, zeros, highest_score = get_basic_attributes(venue_df, bins, labels)
    venue = (0.4262*batting_average + 0.2566*no_of_innings + 0.1510*strike_rate + 0.0787*centuries + 0.0556*fifties + 0.0328*highest_score)
    
    return np.around(venue,4)

In [103]:
X = pd.DataFrame()
X["consistency"] = train.apply(consistency_attribute, axis=1)
X["form"] = train.apply(form_attribute, axis=1)
X["opposition"] = train.apply(opposition_attribute, axis=1)
X["venue"] = train.apply(venue_attribute, axis=1)
print(f"X shape: {X.shape}")

X shape: (458, 4)


In [104]:
# Personally, I think strike_rate is a better target for measuring performance.
# The models perform poorly for strike_rate though
y = train.loc[:, "runs"]

#### Data cleaning

Change 0s to mean

In [108]:
X.loc[X["consistency"]==0, "consistency"] = X[X["consistency"]!=0]["consistency"].mean()
X.loc[X["form"]==0, "form"] = X[X["form"]!=0]["form"].mean()
X.loc[X["opposition"]==0, "opposition"] = X[X["opposition"]!=0]["opposition"].mean()
X.loc[X["venue"]==0, "venue"] = X[X["venue"]!=0]["venue"].mean()

In [109]:
X.describe()

Unnamed: 0,consistency,form,opposition,venue
count,458.0,458.0,458.0,458.0
mean,2.785411,2.931287,2.990929,3.507608
std,1.083639,0.997357,1.061367,1.089377
min,0.801,0.801,0.801,0.8338
25%,1.895,2.249,2.1934,2.805
50%,2.785411,2.931543,2.990929,3.507608
75%,3.5627,3.6391,3.9294,4.3945
max,4.7749,4.5488,4.7849,5.0045


### Algorithm: Regression models
The paper used classification and measured accuracy.

In [76]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [110]:
model = LinearRegression()
score = cross_val_score(model, X.values, y.values, scoring="neg_mean_squared_error", n_jobs=-1)
print(f"Score: {score}\nMean: {score.mean()}")

Score: [-1063.33486834  -985.29811344  -525.32598422 -1002.76693727
  -925.15152331]
Mean: -900.375485315573


In [111]:
model = SVR()
score = cross_val_score(model, X.values, y.values, scoring="neg_mean_squared_error", n_jobs=-1)
print(f"Score: {score}\nMean: {score.mean()}")

Score: [-1198.19676241 -1008.01238709  -504.96110279 -1046.83525919
 -1064.1771652 ]
Mean: -964.4365353338126


In [112]:
model = DecisionTreeRegressor()
score = cross_val_score(model, X.values, y.values, scoring="neg_mean_squared_error", n_jobs=-1)
print(f"Score: {score}\nMean: {score.mean()}")

Score: [-2225.53774155 -2029.47484729 -1112.97131643 -1673.45999975
 -1765.7349901 ]
Mean: -1761.4357790221748


In [113]:
model = RandomForestRegressor()
score = cross_val_score(model, X.values, y.values, scoring="neg_mean_squared_error", n_jobs=-1)
print(f"Score: {score}\nMean: {score.mean()}")

Score: [-1210.63326377 -1010.11273507  -685.37276206 -1112.60278411
 -1022.8078779 ]
Mean: -1008.3058845828302


In [114]:
model = GradientBoostingRegressor()
score = cross_val_score(model, X.values, y.values, scoring="neg_mean_squared_error", n_jobs=-1)
print(f"Score: {score}\nMean: {score.mean()}")

Score: [-1214.33010386 -1113.70930709  -801.95370587 -1081.77468005
 -1141.68916911]
Mean: -1070.6913931948054


#### There are other input attributes but it wasn't included because the basic attribute is decent enough and our dataset is small. For illustration, here it is.

In [151]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [115]:
df.columns

Index(['player_id', 'match_id', 'runs', 'ball', 'M', '_4s', '_6s',
       'strike_rate', 'player_name', 'odi_debut', 'playing_role',
       'batting_style', 'bowling_style', 'fielding_position', 'odi_no',
       'opposition_id', 'venue_id', 'match_date', 'toss', 'series', 'result',
       'match_days', 'venue_name', 'venue_country', 'opposition_name',
       'opposition_rating'],
      dtype='object')

In [146]:
# Use sample or unique method to understand column values
df["toss"].sample(10)

1907           England , elected to bat first
1963             India , elected to bat first
182       South Africa , elected to bat first
1486     Netherlands , elected to field first
1300         Australia , elected to bat first
675     South Africa , elected to field first
3423         England , elected to field first
2888    South Africa , elected to field first
1954    South Africa , elected to field first
1060       Sri Lanka , elected to field first
Name: toss, dtype: object

In [131]:
def batting_hand(bat_style):
    # unique() = ['Left-hand bat', 'Right-hand bat']
    if "Right-hand" in bat_style:
        return 1
    elif "Left-hand" in bat_style:
        return 0
    else:
        return -1
    
X["batting_hand"] = train["batting_style"].apply(batting_hand)
X["batting_hand"].value_counts()

1    276
0    182
Name: batting_hand, dtype: int64

In [130]:
def bowling_hand(bowl_style):
    # There are nan in bowl_style
    if type(bowl_style)==float:
        return -2
    elif "right-arm" in bowl_style.lower():
        return 1
    elif "left-arm" in bowl_style.lower():
        return 0
    else:
        return -1
    
X["bowling_hand"] = train["bowling_style"].apply(bowling_hand)
X["bowling_hand"].value_counts()

 1    297
-1     73
-2     66
 0     22
Name: bowling_hand, dtype: int64

In [139]:
def match_time(arg):
    # There are nan in bowl_style
    if "day/night" in arg:
        return 1
    else:
        return 0
    
X["match_time"] = train["match_days"].apply(match_time)
X["match_time"].value_counts()

1    266
0    192
Name: match_time, dtype: int64

In [142]:
def venue(country):
    if country=="South Africa":
        return 1
    else:
        return 0

X["home_venue"] = train["venue_country"].apply(venue)
X["home_venue"].value_counts()

1    234
0    224
Name: home_venue, dtype: int64

In [152]:
def toss_attribute(toss):
    if "South Africa" in toss:
        return 1
    else:
        return 0

X["toss"] = train["toss"].apply(toss_attribute)
X["toss"].value_counts()

1    265
0    193
Name: toss, dtype: int64

In [None]:
X["opposition"] = train["opposition_rating"]
X["playing_role"] = le.fit_transform(train["playing_role"]) # One hot encoding is likely better. This is just illustration

In [153]:
X.head()

Unnamed: 0,consistency,form,opposition,venue,batting_hand,bowling_hand,match_time,home venue,home_venue,playing_role,toss
2991,3.5071,4.5163,85,5.0045,0,-2,0,1,1,8,1
2992,4.2889,3.5154,85,4.8535,1,1,0,1,1,7,1
2993,3.1134,3.9185,85,3.8896,1,-1,0,1,1,5,1
2994,4.7749,3.1801,85,4.9717,1,1,0,1,1,8,1
2995,3.5151,3.4464,85,4.3945,0,1,1,1,1,2,0


In [154]:
# One score before we go
model = LinearRegression()
score = cross_val_score(model, X.values, y.values, scoring="neg_mean_squared_error", n_jobs=-1)
print(f"Score: {score}\nMean: {score.mean()}")

Score: [-1007.10688971  -926.55911234  -730.87983388  -927.50300628
  -874.200867  ]
Mean: -893.2499418437425


#### Thanks for reading (or scrolling to the bottom :))