# Model Deplyoment

## 0. Setup

In [1]:
# Start by importing the relevant libraries.
import pandas as pd
import pickle
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

## 1. 1st Innings

In [2]:
# load the final DataFrame from dataset exported under feature_extraction
final_df = pickle.load(open('dataset_level3_first_innings.pkl', 'rb'))

In [3]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
17575,England,Australia,Southampton,117,33,5,8.068966,28.0,179
19251,England,Zimbabwe,Cape Town,51,89,8,9.870968,49.0,188
14747,Australia,England,Southampton,109,25,5,6.884211,40.0,157
25818,South Africa,India,Nottingham,44,91,9,9.103448,44.0,130
54916,Sri Lanka,West Indies,Pallekele,136,28,8,8.869565,40.0,215
...,...,...,...,...,...,...,...,...,...
23629,India,Bangladesh,Nottingham,67,68,9,7.730769,35.0,180
57548,Australia,Pakistan,Chandigarh,106,41,7,8.050633,42.0,193
24728,Sri Lanka,Pakistan,London,73,78,10,10.428571,47.0,150
10773,New Zealand,Sri Lanka,Auckland,128,21,4,7.757576,52.0,179


In [4]:
final_df['city'].unique()

array(['Southampton', 'Cape Town', 'Nottingham', 'Pallekele', 'Melbourne',
       'Wellington', 'Johannesburg', 'Harare', 'Mumbai', 'Kolkata',
       'Hamilton', 'Durban', 'Mount Maunganui', 'Barbados', 'Bangalore',
       'Mirpur', 'Dubai', 'London', 'Trinidad', 'Christchurch', 'Colombo',
       'Auckland', 'Abu Dhabi', 'Hambantota', 'Lauderhill', 'Manchester',
       'Nagpur', 'St Kitts', 'Sharjah', 'Lahore', 'Chandigarh',
       'Centurion', 'Sydney', 'St Lucia', 'Dhaka', 'Guyana', 'Chittagong',
       'Adelaide', 'Delhi', 'Cardiff', 'Greater Noida'], dtype=object)

In [5]:
# Split the dataset into training and testing data.
# Extract the input.
X = final_df.drop(columns = ['runs_x'])
# Extract the output.
y = final_df['runs_x']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [5]:
# Now we import the relevant classes to deploy our model.

# Column Transformer will enable us to apply different preprocessing
# transformations to diff subsets of the code. This is useful for us since our 
# data is a mix of numerical and categorical data.
from sklearn.compose import ColumnTransformer

# OneHotEncoder converts categorical features into a binary vector format
# which is more suitable to be inputted to a model.
from sklearn.preprocessing import OneHotEncoder

# Pipeline enables the streamlining workflows by creating a sequence of 
# transformation and estimator steps.
from sklearn.pipeline import Pipeline

# Standard Scaler normalizes the numerical data to be centred around zero.
from sklearn.preprocessing import StandardScaler

# RandomForestRegressor is an ensemble learning method. It operates using
# multiple decision trees and outputting the average prediction of the trees.
# It is called an ensemble method because of this reason - it combines models.
from sklearn.ensemble import RandomForestRegressor

# XGBRegressor is a gradient boosting algorithm. Gradient boosting is an
# ensemble method of building models sequentially - each model corrects the previous one.
from xgboost import XGBRegressor

# r2_score provides a measure of how well the predictions approximate the real
# data points. mean absolute error captures the average magnitude of the errors
# in the predictions, regardless of the direction.
from sklearn.metrics import r2_score,mean_absolute_error

In [7]:
# The next step is to create a column transformer for the columns with categorical values.
# We will be subjecting all three columns to the same transformation - OneHotEncoder.
# The other columns are left untouched.
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output = False,drop = 'first'),['batting_team', 'bowling_team', 'city'])
], remainder = 'passthrough')

In [8]:
# We define the pipeline
# First step is to apply the transformer defined above to hot encode the categorical values.
# Second step is to normalize the feature values - ensures all features are on a similar scale.
# Third Step is to train a XGBoost regression model.
# XGB Regressor by its design of working with multiple decision trees, captures complex relationships
# between the features very well. Being a regressor, it is suitable for a continous target.
pipe = Pipeline(steps = [
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', XGBRegressor(n_estimator = 1000, learning_rate = 0.2, max_depth = 12, random_state = 1))
])

In [10]:
# We train the pipeline define above using our X and y train dataset.
# X captures the input, and y has the target variable.
pipe.fit(X_train, y_train)
# Now we use the trained model to make predictions on the test dataset.
y_pred = pipe.predict(X_test)
# Print the Accuracy (R2 Score) of the predictions.
print(f"The R2 Score for the model is: {r2_score(y_test, y_pred): .4f}")
# Also print the absolute mean error.
print(f"The mean absolute error is: {mean_absolute_error(y_test,y_pred): .4f}")

Parameters: { "n_estimator" } are not used.



The R2 Score for the model is:  0.9849
The mean absolute error is:  2.1836


In [11]:
# Pickle dump the model to be used elsewhere in the future, if need be.
pickle.dump(pipe, open('pipe_first_innings.pkl', 'wb'))

## 2. 2nd Innings

In [3]:
# load the final DataFrame from dataset exported under feature_extraction
final_df2 = pickle.load(open('dataset_level3_second_innings.pkl', 'rb'))

In [4]:
final_df2

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,target_score,winner
19950,Bangladesh,Afghanistan,Dhaka,116,14,3,6.566038,28.0,164.0,0
32088,Ireland,Sri Lanka,London,127,5,4,6.626087,38.0,144.0,0
6232,New Zealand,India,Delhi,35,81,8,5.384615,28.0,202.0,0
49128,India,New Zealand,Chennai,92,53,8,8.238806,41.0,167.0,0
31408,Ireland,New Zealand,Nottingham,109,29,2,7.186813,42.0,198.0,0
...,...,...,...,...,...,...,...,...,...,...
56964,New Zealand,Sri Lanka,Chittagong,33,61,4,3.355932,14.0,119.0,0
47756,Bangladesh,Pakistan,Mirpur,15,87,7,2.727273,14.0,135.0,0
53620,South Africa,Pakistan,Dubai,45,79,9,6.585366,33.0,99.0,1
21114,Pakistan,Sri Lanka,Lahore,82,52,5,7.235294,37.0,182.0,0


In [5]:
# Split the dataset into training and testing data.
# Extract the input.
X = final_df2.drop(columns = ['winner'])
# Extract the output.
y = final_df2['winner']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [6]:
# Import the necessary classes and libraries to deploy the model and test it.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [7]:
# The next step is to create a column transformer for the columns with categorical values.
# We will be subjecting all three columns to the same transformation - OneHotEncoder.
# The other columns are left untouched.
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output = False,drop = 'first'),['batting_team', 'bowling_team', 'city'])
], remainder = 'passthrough')

In [8]:
# We define the pipeline
# First step is to apply the transformer defined above to hot encode the categorical values.
# Second step is to normalize the feature values - ensures all features are on a similar scale.
# Third Step is to train a model.
pipe = Pipeline(steps = [
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', RandomForestClassifier())
])

In [9]:
# We train the pipeline define above using our X and y train dataset.
# X captures the input, and y has the target variable.
pipe.fit(X_train, y_train)
# Now we use the trained model to make predictions on the test dataset.
y_prob_pred = pipe.predict_proba(X_test)
y_pred = pipe.predict(X_test)
# Print the Accuracy of the predictions.
print(f"The Accuracy Score for the model is: {accuracy_score(y_test, y_pred): .4f}")
print(f"The Precision Score for the model is: {precision_score(y_test, y_pred): .4f}")

The Accuracy Score for the model is:  0.9996
The Precision Score for the model is:  0.9998


A(0.8090) and P(0.7982) for Log Reg (default)
A(0.9636) and P(0.9549) for XGBClassifier (logloss)
A(0.9977) and P(0.9963) for Random Forest Classifier (default)

In [10]:
y_prob_pred

array([[1.  , 0.  ],
       [0.02, 0.98],
       [1.  , 0.  ],
       ...,
       [0.04, 0.96],
       [1.  , 0.  ],
       [0.99, 0.01]])

In [11]:
y_pred

array([0, 1, 0, ..., 1, 0, 0])

In [12]:
# Pickle dump the pipeline.
pickle.dump(pipe, open('pipe_second_innings.pkl', 'wb'))