## Cricket Player Performance Prediction Using Machine Learning

##### Importing libraries

In [1]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

##### Importing Data

In [2]:
df = pd.read_csv('Batsman_Data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Bat1,Runs,BF,SR,4s,6s,Opposition,Ground,Start Date,Match_ID,Batsman,Player_ID
0,1,DNB,-,-,-,-,-,v India,Nagpur,18 Dec 2009,ODI # 2933,Oshane Thomas,49619
1,2,DNB,-,-,-,-,-,v India,Kolkata,24 Dec 2009,ODI # 2935,Oshane Thomas,49619
2,3,DNB,-,-,-,-,-,v India,Delhi,27 Dec 2009,ODI # 2936,Oshane Thomas,49619
3,4,DNB,-,-,-,-,-,v Bangladesh,Dhaka,4 Jan 2010,ODI # 2937,Oshane Thomas,49619
4,5,DNB,-,-,-,-,-,v India,Dhaka,5 Jan 2010,ODI # 2938,Oshane Thomas,49619


##### shape

In [4]:
df.shape

(11149, 13)

##### Null Values

In [5]:
df.isnull().sum()

Unnamed: 0    0
Bat1          0
Runs          0
BF            0
SR            0
4s            0
6s            0
Opposition    0
Ground        0
Start Date    0
Match_ID      0
Batsman       0
Player_ID     0
dtype: int64

In [6]:
df.isnull().sum().sum()

0

##### Duplicate Values

In [7]:
df.duplicated().sum()

0

##### Info

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11149 entries, 0 to 11148
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  11149 non-null  int64 
 1   Bat1        11149 non-null  object
 2   Runs        11149 non-null  object
 3   BF          11149 non-null  object
 4   SR          11149 non-null  object
 5   4s          11149 non-null  object
 6   6s          11149 non-null  object
 7   Opposition  11149 non-null  object
 8   Ground      11149 non-null  object
 9   Start Date  11149 non-null  object
 10  Match_ID    11149 non-null  object
 11  Batsman     11149 non-null  object
 12  Player_ID   11149 non-null  int64 
dtypes: int64(2), object(11)
memory usage: 1.1+ MB


##### Data cleaning and Feature Engineering

In [9]:
df.drop(['Unnamed: 0'], axis = 1 , inplace = True)

In [10]:
df.drop(df[df.Bat1 == 'DNB'].index, inplace=True)

In [11]:
df.drop(df[df.Bat1 == 'TDNB'].index, inplace=True)

In [12]:
df.drop(df[df.Bat1 == 'absent'].index, inplace=True)

In [13]:
df.drop(df[df.Bat1 == 'sub'].index, inplace=True)

In [14]:
df.head()

Unnamed: 0,Bat1,Runs,BF,SR,4s,6s,Opposition,Ground,Start Date,Match_ID,Batsman,Player_ID
5,0*,0,8,0.00,0,0,v India,Dhaka,10 Jan 2010,ODI # 2941,Oshane Thomas,49619
6,0*,0,0,-,0,0,v England,The Oval,28 Jun 2011,ODI # 3165,Oshane Thomas,49619
9,1*,1,3,33.33,0,0,v England,Nottingham,6 Jul 2011,ODI # 3169,Oshane Thomas,49619
10,0*,0,2,0.00,0,0,v Australia,Pallekele,10 Aug 2011,ODI # 3175,Oshane Thomas,49619
11,0,0,2,0.00,0,0,v Pakistan,Dubai (DSC),11 Nov 2011,ODI # 3212,Oshane Thomas,49619


In [15]:
df['Bat1'] = df['Bat1'].str.replace('*', '.')
df['SR'] = df['SR'].str.replace('-', '0')
df['Runs'] = df['Runs'].str.replace('-', '0')
df['4s'] = df['4s'].str.replace('-', '0')
df['6s'] = df['6s'].str.replace('-', '0')
df.head()

  df['Bat1'] = df['Bat1'].str.replace('*', '.')


Unnamed: 0,Bat1,Runs,BF,SR,4s,6s,Opposition,Ground,Start Date,Match_ID,Batsman,Player_ID
5,0.0,0,8,0.0,0,0,v India,Dhaka,10 Jan 2010,ODI # 2941,Oshane Thomas,49619
6,0.0,0,0,0.0,0,0,v England,The Oval,28 Jun 2011,ODI # 3165,Oshane Thomas,49619
9,1.0,1,3,33.33,0,0,v England,Nottingham,6 Jul 2011,ODI # 3169,Oshane Thomas,49619
10,0.0,0,2,0.0,0,0,v Australia,Pallekele,10 Aug 2011,ODI # 3175,Oshane Thomas,49619
11,0.0,0,2,0.0,0,0,v Pakistan,Dubai (DSC),11 Nov 2011,ODI # 3212,Oshane Thomas,49619


In [16]:
df['SR']=df['SR'].astype(float)
df['Runs']=df['Runs'].astype(float)
df['4s']=df['4s'].astype(float)
df['6s']=df['6s'].astype(float)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8918 entries, 5 to 11146
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Bat1        8918 non-null   object 
 1   Runs        8918 non-null   float64
 2   BF          8918 non-null   object 
 3   SR          8918 non-null   float64
 4   4s          8918 non-null   float64
 5   6s          8918 non-null   float64
 6   Opposition  8918 non-null   object 
 7   Ground      8918 non-null   object 
 8   Start Date  8918 non-null   object 
 9   Match_ID    8918 non-null   object 
 10  Batsman     8918 non-null   object 
 11  Player_ID   8918 non-null   int64  
dtypes: float64(4), int64(1), object(7)
memory usage: 905.7+ KB


##### summary

In [18]:
df.describe()

Unnamed: 0,Runs,SR,4s,6s,Player_ID
count,8918.0,8918.0,8918.0,8918.0,8918.0
mean,28.709352,77.978627,2.601144,0.539807,211086.2
std,31.826135,50.343366,3.232154,1.180729,184867.9
min,0.0,0.0,0.0,0.0,5334.0
25%,5.0,50.0,0.0,0.0,44828.0
50%,17.0,75.945,1.0,0.0,227760.0
75%,42.0,100.0,4.0,1.0,318339.0
max,264.0,600.0,33.0,16.0,1158100.0


In [19]:
Total_runs_by_batsman = df.groupby(df['Batsman'])['Runs'].sum().sort_values(ascending = False)
Total_runs_by_batsman

Batsman
Virat Kohli          10843.0
MS Dhoni             10500.0
Chris Gayle          10151.0
Ross Taylor           8026.0
Rohit Sharma          8010.0
                      ...   
Anrich Nortje            8.0
Jason Behrendorff        7.0
Tabraiz Shamsi           0.0
Mohammad Hasnain         0.0
Nicholas Pooran          0.0
Name: Runs, Length: 148, dtype: float64

In [20]:
df.groupby(by = ['Batsman','Ground'])['Runs'].sum().sort_values(ascending = False)

Batsman           Ground      
Tamim Iqbal       Dhaka           2619.0
Shakib Al Hasan   Dhaka           2472.0
Mushfiqur Rahim   Dhaka           2351.0
Mahmudullah       Dhaka           1616.0
Shoaib Malik      Lahore          1030.0
                                   ...  
Kemar Roach       Chattogram         0.0
                  Chennai            0.0
                  Harare             0.0
Andre Russell     Johannesburg       0.0
Yuzvendra Chahal  Wellington         0.0
Name: Runs, Length: 3668, dtype: float64

In [21]:
df.groupby(by = ['Batsman','Opposition'])['Runs'].sum().sort_values(ascending = False)

Batsman         Opposition   
MS Dhoni        v Sri Lanka      2383.0
Virat Kohli     v Sri Lanka      2186.0
Rohit Sharma    v Australia      1980.0
Virat Kohli     v West Indies    1840.0
Eoin Morgan     v Australia      1815.0
                                  ...  
Shaheen Afridi  v New Zealand       0.0
Hamid Hassan    v Sri Lanka         0.0
                v Netherlands       0.0
                v England           0.0
Kusal Mendis    v Afghanistan       0.0
Name: Runs, Length: 1328, dtype: float64

In [22]:
df.columns

Index(['Bat1', 'Runs', 'BF', 'SR', '4s', '6s', 'Opposition', 'Ground',
       'Start Date', 'Match_ID', 'Batsman', 'Player_ID'],
      dtype='object')

In [23]:
for col in df.columns:
    print(col + '\n===========')
    print(df[col].value_counts())
    print('\n============================================')

Bat1
0       655
1       338
2       260
4       229
5       200
       ... 
180.      1
145.      1
183.      1
109.      1
164       1
Name: Bat1, Length: 319, dtype: int64

Runs
0.0      800
1.0      438
2.0      325
4.0      278
5.0      244
        ... 
168.0      1
159.0      1
215.0      1
176.0      1
164.0      1
Name: Runs, Length: 179, dtype: int64

BF
1      336
3      277
5      271
2      263
4      259
      ... 
158      1
173      1
148      1
159      1
155      1
Name: BF, Length: 162, dtype: int64

SR
0.00      800
100.00    403
50.00     286
66.66     181
33.33     167
         ... 
105.97      1
75.51       1
137.93      1
42.00       1
49.45       1
Name: SR, Length: 2031, dtype: int64

4s
0.0     2901
1.0     1630
2.0     1084
3.0      835
4.0      594
5.0      496
6.0      342
7.0      279
8.0      216
9.0      148
10.0      91
11.0      89
12.0      64
13.0      43
16.0      26
15.0      24
14.0      19
17.0      13
18.0       7
19.0       5
22.0       3
24.0 

In [24]:
df.head()

Unnamed: 0,Bat1,Runs,BF,SR,4s,6s,Opposition,Ground,Start Date,Match_ID,Batsman,Player_ID
5,0.0,0.0,8,0.0,0.0,0.0,v India,Dhaka,10 Jan 2010,ODI # 2941,Oshane Thomas,49619
6,0.0,0.0,0,0.0,0.0,0.0,v England,The Oval,28 Jun 2011,ODI # 3165,Oshane Thomas,49619
9,1.0,1.0,3,33.33,0.0,0.0,v England,Nottingham,6 Jul 2011,ODI # 3169,Oshane Thomas,49619
10,0.0,0.0,2,0.0,0.0,0.0,v Australia,Pallekele,10 Aug 2011,ODI # 3175,Oshane Thomas,49619
11,0.0,0.0,2,0.0,0.0,0.0,v Pakistan,Dubai (DSC),11 Nov 2011,ODI # 3212,Oshane Thomas,49619


In [25]:
bd_induvidual = df.query('Batsman == "Virat Kohli "')
bd_induvidual 

Unnamed: 0,Bat1,Runs,BF,SR,4s,6s,Opposition,Ground,Start Date,Match_ID,Batsman,Player_ID
6676,12,12.0,22,54.54,1.0,0.0,v Sri Lanka,Dambulla,18 Aug 2008,ODI # 2742,Virat Kohli,253802
6677,37,37.0,67,55.22,6.0,0.0,v Sri Lanka,Dambulla,20 Aug 2008,ODI # 2745,Virat Kohli,253802
6678,25,25.0,38,65.78,4.0,0.0,v Sri Lanka,Colombo (RPS),24 Aug 2008,ODI # 2750,Virat Kohli,253802
6679,54,54.0,66,81.81,7.0,0.0,v Sri Lanka,Colombo (RPS),27 Aug 2008,ODI # 2755,Virat Kohli,253802
6680,31,31.0,46,67.39,3.0,1.0,v Sri Lanka,Colombo (RPS),29 Aug 2008,ODI # 2756,Virat Kohli,253802
...,...,...,...,...,...,...,...,...,...,...,...,...
6898,44,44.0,45,97.77,6.0,1.0,v Australia,Hyderabad (Deccan),2 Mar 2019,ODI # 4102,Virat Kohli,253802
6899,116,116.0,120,96.66,10.0,0.0,v Australia,Nagpur,5 Mar 2019,ODI # 4106,Virat Kohli,253802
6900,123,123.0,95,129.47,16.0,1.0,v Australia,Ranchi,8 Mar 2019,ODI # 4109,Virat Kohli,253802
6901,7,7.0,6,116.66,1.0,0.0,v Australia,Mohali,10 Mar 2019,ODI # 4111,Virat Kohli,253802


In [29]:
df.drop(columns = ['Match_ID', 'Player_ID', 'Start Date'], axis = 1, inplace =True) 

In [30]:
df.head()

Unnamed: 0,Bat1,Runs,BF,SR,4s,6s,Opposition,Ground,Batsman
5,0.0,0.0,8,0.0,0.0,0.0,v India,Dhaka,Oshane Thomas
6,0.0,0.0,0,0.0,0.0,0.0,v England,The Oval,Oshane Thomas
9,1.0,1.0,3,33.33,0.0,0.0,v England,Nottingham,Oshane Thomas
10,0.0,0.0,2,0.0,0.0,0.0,v Australia,Pallekele,Oshane Thomas
11,0.0,0.0,2,0.0,0.0,0.0,v Pakistan,Dubai (DSC),Oshane Thomas


In [31]:
df = pd.get_dummies(data = df, columns = ['Opposition', 'Ground', 'Batsman'], drop_first = True)

In [32]:
df.head()

Unnamed: 0,Bat1,Runs,BF,SR,4s,6s,Opposition_v Africa XI,Opposition_v Asia XI,Opposition_v Australia,Opposition_v Bangladesh,...,Batsman_Tamim Iqbal,Batsman_Thisara Perera,Batsman_Tim Southee,Batsman_Tom Curran,Batsman_Tom Latham,Batsman_Trent Boult,Batsman_Usman Khawaja,Batsman_Vijay Shankar,Batsman_Virat Kohli,Batsman_Yuzvendra Chahal
5,0.0,0.0,8,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1.0,1.0,3,33.33,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0.0,0.0,2,0.0,0.0,0.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
11,0.0,0.0,2,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
df.shape

(8918, 306)

In [34]:
df.columns

Index(['Bat1', 'Runs', 'BF', 'SR', '4s', '6s', 'Opposition_v Africa XI',
       'Opposition_v Asia XI', 'Opposition_v Australia',
       'Opposition_v Bangladesh',
       ...
       'Batsman_Tamim Iqbal', 'Batsman_Thisara Perera', 'Batsman_Tim Southee',
       'Batsman_Tom Curran', 'Batsman_Tom Latham ', 'Batsman_Trent Boult',
       'Batsman_Usman Khawaja', 'Batsman_Vijay Shankar',
       'Batsman_Virat Kohli ', 'Batsman_Yuzvendra Chahal'],
      dtype='object', length=306)

In [35]:
df.head()

Unnamed: 0,Bat1,Runs,BF,SR,4s,6s,Opposition_v Africa XI,Opposition_v Asia XI,Opposition_v Australia,Opposition_v Bangladesh,...,Batsman_Tamim Iqbal,Batsman_Thisara Perera,Batsman_Tim Southee,Batsman_Tom Curran,Batsman_Tom Latham,Batsman_Trent Boult,Batsman_Usman Khawaja,Batsman_Vijay Shankar,Batsman_Virat Kohli,Batsman_Yuzvendra Chahal
5,0.0,0.0,8,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1.0,1.0,3,33.33,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0.0,0.0,2,0.0,0.0,0.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
11,0.0,0.0,2,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
y = df['Runs']

In [37]:
x = df.drop(['Runs'], axis=1)

In [38]:
y

5         0.0
6         0.0
9         1.0
10        0.0
11        0.0
         ... 
11142     0.0
11143    15.0
11144    46.0
11145     3.0
11146     1.0
Name: Runs, Length: 8918, dtype: float64

In [39]:
x

Unnamed: 0,Bat1,BF,SR,4s,6s,Opposition_v Africa XI,Opposition_v Asia XI,Opposition_v Australia,Opposition_v Bangladesh,Opposition_v Bermuda,...,Batsman_Tamim Iqbal,Batsman_Thisara Perera,Batsman_Tim Southee,Batsman_Tom Curran,Batsman_Tom Latham,Batsman_Trent Boult,Batsman_Usman Khawaja,Batsman_Vijay Shankar,Batsman_Virat Kohli,Batsman_Yuzvendra Chahal
5,0.,8,0.00,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0.,0,0.00,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1.,3,33.33,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0.,2,0.00,0.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,2,0.00,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11142,0.,2,0.00,0.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
11143,15,46,32.60,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11144,46,61,75.40,6.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11145,3,6,50.00,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
from sklearn.model_selection import train_test_split

In [41]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=42)

In [42]:
print('x_train shape :', x_train.shape)
print('x_test shape :', x_test.shape)
print('y_train shape :', y_train.shape)
print('y_test shape :', y_test.shape)

x_train shape : (7134, 305)
x_test shape : (1784, 305)
y_train shape : (7134,)
y_test shape : (1784,)


In [43]:
#from sklearn.preprocessing import StandardScaler

sd = StandardScaler()
x_train = sd.fit_transform(x_train)
x_test = sd.transform(x_test)

<IPython.core.display.Javascript object>

In [44]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [45]:
models={
    'LR':LinearRegression(),
    'KNNR':KNeighborsRegressor(),
    'SVR':SVR(),
    'DT':DecisionTreeRegressor(),
    'RF':RandomForestRegressor() 
}

In [46]:
for name, model in models.items():
    print(f'Using model: {name}')
    model.fit(x_train, y_train)
    print(f'Training score :{model.score(x_train,y_train)}')
    print(f'Test Score :{model.score(x_test,y_test)}')
    y_pred= model.predict(x_test)
    print(f'RMSE: {np.sqrt(mean_squared_error(y_test,y_pred))}')
    print('======================================================')

Using model: LR
Training score :1.0
Test Score :0.9999999796392584


<IPython.core.display.Javascript object>

RMSE: 0.004688894843710456
Using model: KNNR
Training score :0.5898188800288545
Test Score :0.3761508850328289


<IPython.core.display.Javascript object>

RMSE: 25.95454849239235
Using model: SVR
Training score :0.7578189922120923
Test Score :0.7427943702862388


<IPython.core.display.Javascript object>

RMSE: 16.66533053325233
Using model: DT
Training score :1.0
Test Score :0.999125822057778


<IPython.core.display.Javascript object>

RMSE: 0.9715689313377652
Using model: RF
Training score :0.999907170511065
Test Score :0.9999098105841215


<IPython.core.display.Javascript object>

RMSE: 0.3120696139424478


In [47]:
model = LinearRegression()
model = model.fit(x_train,y_train)

In [48]:
y_pred = model.predict(x_test)

In [49]:
y_pred

array([-3.55271368e-15,  1.40000000e+01,  1.01000000e+02, ...,
        7.00000000e+00,  2.90000000e+01,  1.60000000e+01])

In [50]:
y_test.values

array([  0.,  14., 101., ...,   7.,  29.,  16.])