## XGB Classifier 

## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [4]:
dataset = pd.read_csv('Q2_pitches_train.csv')
X_train = dataset.iloc[:, :-1].values
y_train = dataset.iloc[:, -1].values
dataset1 = pd.read_csv('Q2_pitches_test.csv')
X_test = dataset.iloc[:, :13].values


Check for null values

In [5]:
print(dataset.isnull().sum())

inning              0
is_bottom           0
balls               0
strikes             0
outs_before         0
is_lhp              0
is_lhb              0
bat_score_before    0
field_score         0
basecode_before     0
batterid            0
pitcherid           0
cid                 0
pitch_type          0
dtype: int64


In [6]:
print(dataset1.isnull().sum())

inning                   0
is_bottom                0
balls                    0
strikes                  0
outs_before              0
is_lhp                   0
is_lhb                   0
bat_score_before         0
field_score              0
basecode_before          0
batterid                 0
pitcherid                0
cid                      0
FF                  160306
FT                  160306
CB                  160306
SL                  160306
CH                  160306
dtype: int64


Converting the pitch type category in numerical format

In [7]:
from sklearn.preprocessing import LabelEncoder
Ly = LabelEncoder()
y_train = Ly.fit_transform(y_train)
y_train

array([2, 2, 3, ..., 4, 1, 2])

In [8]:
Ly.classes_

array(['CB', 'CH', 'FF', 'FT', 'SL'], dtype=object)

In [9]:
print(X_train[:10])

[[   3    0    1    0    1    0    1    0    1    0  347 1304 2014]
 [   1    0    2    2    2    1    0    0    0    0  269 1661 2052]
 [   2    1    0    0    0    0    0    0    0    0   43 1048 2029]
 [   7    0    0    0    1    0    0    5    1    1   98 1521 2049]
 [   7    1    0    0    2    1    1    6    1    1  460 1100 2050]
 [   5    0    2    2    0    1    0    0    5    0  134 1182 2018]
 [   1    0    0    0    2    1    0    2    0    7   83 1291 2000]
 [   8    0    0    2    0    1    1    4    5    3  544 1418 2060]
 [   8    1    2    1    1    0    1   10    0    1  341 1049 2081]
 [   8    0    0    0    2    0    0    2    1    4  420 1765 2086]]


In [10]:
print(y_train[:10])

[2 2 3 2 4 2 1 2 2 3]


In [11]:
print(X_test[:10])

[[   3    0    1    0    1    0    1    0    1    0  347 1304 2014]
 [   1    0    2    2    2    1    0    0    0    0  269 1661 2052]
 [   2    1    0    0    0    0    0    0    0    0   43 1048 2029]
 [   7    0    0    0    1    0    0    5    1    1   98 1521 2049]
 [   7    1    0    0    2    1    1    6    1    1  460 1100 2050]
 [   5    0    2    2    0    1    0    0    5    0  134 1182 2018]
 [   1    0    0    0    2    1    0    2    0    7   83 1291 2000]
 [   8    0    0    2    0    1    1    4    5    3  544 1418 2060]
 [   8    1    2    1    1    0    1   10    0    1  341 1049 2081]
 [   8    0    0    0    2    0    0    2    1    4  420 1765 2086]]


## Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
print(X_train[:5])

[[-0.75049123 -0.97972577  0.11428095 -1.08009385  0.03356515 -0.61913404
   1.19239002 -0.89949497 -0.52464162 -0.60959635  0.39476806 -0.06237865
  -0.87998006]
 [-1.49360104 -0.97972577  1.14238622  1.33594257  1.25542449  1.61515914
  -0.83865177 -0.89949497 -0.88314777 -0.60959635 -0.02039384  1.75170987
   0.57320229]
 [-1.12204614  1.02069378 -0.91382432 -1.08009385 -1.18829419 -0.61913404
  -0.83865177 -0.89949497 -0.88314777 -0.60959635 -1.22329884 -1.36323765
  -0.30635545]
 [ 0.73572839 -0.97972577 -0.91382432 -1.08009385  0.03356515 -0.61913404
  -0.83865177  0.94355431 -0.52464162 -0.02024018 -0.93055647  1.04030261
   0.45847736]
 [ 0.73572839  1.02069378 -0.91382432 -1.08009385  1.25542449  1.61515914
   1.19239002  1.31216417 -0.52464162 -0.02024018  0.99622056 -1.09900067
   0.496719  ]]


In [15]:
print(X_test[:5])

[[-0.75049123 -0.97972577  0.11428095 -1.08009385  0.03356515 -0.61913404
   1.19239002 -0.89949497 -0.52464162 -0.60959635  0.39476806 -0.06237865
  -0.87998006]
 [-1.49360104 -0.97972577  1.14238622  1.33594257  1.25542449  1.61515914
  -0.83865177 -0.89949497 -0.88314777 -0.60959635 -0.02039384  1.75170987
   0.57320229]
 [-1.12204614  1.02069378 -0.91382432 -1.08009385 -1.18829419 -0.61913404
  -0.83865177 -0.89949497 -0.88314777 -0.60959635 -1.22329884 -1.36323765
  -0.30635545]
 [ 0.73572839 -0.97972577 -0.91382432 -1.08009385  0.03356515 -0.61913404
  -0.83865177  0.94355431 -0.52464162 -0.02024018 -0.93055647  1.04030261
   0.45847736]
 [ 0.73572839  1.02069378 -0.91382432 -1.08009385  1.25542449  1.61515914
   1.19239002  1.31216417 -0.52464162 -0.02024018  0.99622056 -1.09900067
   0.496719  ]]


## Training the model on the Training set

In [16]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=0)
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## Predicting the Test Set probability

In [18]:
y_pred_proba = xgb_model.predict_proba(X_test)
print(y_pred_proba)


[[0.06884909 0.18187404 0.4465857  0.17926237 0.1234288 ]
 [0.14553118 0.22469804 0.38349476 0.11260774 0.13366829]
 [0.13810155 0.05254466 0.44641352 0.24208355 0.12085678]
 ...
 [0.06952212 0.10292505 0.38328165 0.19658728 0.24768391]
 [0.13324586 0.22410698 0.39315787 0.11604362 0.1334457 ]
 [0.15693551 0.13637745 0.34763953 0.25277746 0.10627006]]


## Creating the test set file with predicted probability

In [19]:
X_test = sc.inverse_transform(X_test)

In [23]:
X = pd.DataFrame(X_test, columns = ['inning', 'is_bottom', 'balls', 'strikes', 'outs_before', 'is_lhp', 'is_lhb', 'bat_score_before', 'field_score', 'basecode_before', 'batterid', 'pitcherid','cid'])
print(X.head())

   inning  is_bottom  balls  ...  batterid  pitcherid     cid
0     3.0        0.0    1.0  ...     347.0     1304.0  2014.0
1     1.0        0.0    2.0  ...     269.0     1661.0  2052.0
2     2.0        1.0    0.0  ...      43.0     1048.0  2029.0
3     7.0        0.0    0.0  ...      98.0     1521.0  2049.0
4     7.0        1.0    0.0  ...     460.0     1100.0  2050.0

[5 rows x 13 columns]


In [24]:
Y = pd.DataFrame(y_pred_proba, columns = ['CB', 'CH', 'FF', 'FT', 'SL'])
print(Y.head())

         CB        CH        FF        FT        SL
0  0.068849  0.181874  0.446586  0.179262  0.123429
1  0.145531  0.224698  0.383495  0.112608  0.133668
2  0.138102  0.052545  0.446414  0.242084  0.120857
3  0.097153  0.049362  0.365213  0.218246  0.270027
4  0.106114  0.044145  0.365560  0.211235  0.272947


In [25]:
result = pd.concat([X, Y], axis=1)
result.head()

Unnamed: 0,inning,is_bottom,balls,strikes,outs_before,is_lhp,is_lhb,bat_score_before,field_score,basecode_before,batterid,pitcherid,cid,CB,CH,FF,FT,SL
0,3.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,347.0,1304.0,2014.0,0.068849,0.181874,0.446586,0.179262,0.123429
1,1.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,269.0,1661.0,2052.0,0.145531,0.224698,0.383495,0.112608,0.133668
2,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.0,1048.0,2029.0,0.138102,0.052545,0.446414,0.242084,0.120857
3,7.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,1.0,1.0,98.0,1521.0,2049.0,0.097153,0.049362,0.365213,0.218246,0.270027
4,7.0,1.0,0.0,0.0,2.0,1.0,1.0,6.0,1.0,1.0,460.0,1100.0,2050.0,0.106114,0.044145,0.36556,0.211235,0.272947


In [30]:
from google.colab import files
result.to_csv('Q2_pitches_test_predict.csv')
files.download('Q2_pitches_test_predict.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>