In [1]:
# Start by importing the relevant libraries.
import pandas as pd
import pickle
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# load the final DataFrame from dataset exported under feature_extraction
final_df = pickle.load(open('dataset_level3_first_innings.pkl', 'rb'))

In [12]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
28702,India,England,Manchester,95,55,9,8.769231,50.0,165
45497,England,Pakistan,Manchester,77,60,7,7.700000,37.0,135
32964,South Africa,Australia,Colombo,144,2,5,7.322034,50.0,146
32286,West Indies,England,Pallekele,91,64,10,9.750000,53.0,179
43620,South Africa,Bangladesh,Mirpur,59,80,10,8.850000,41.0,169
...,...,...,...,...,...,...,...,...,...
20318,South Africa,New Zealand,London,50,73,9,6.382979,28.0,128
35186,New Zealand,England,London,77,72,9,9.625000,43.0,201
3104,Pakistan,West Indies,Trinidad,132,6,3,6.947368,35.0,137
17736,New Zealand,Pakistan,Cape Town,84,42,8,6.461538,33.0,143


In [5]:
# Split the dataset into training and testing data.
# Extract the input.
X = final_df.drop(columns = ['runs_x'])
# Extract the output.
y = final_df['runs_x']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [13]:
# Now we import the relevant classes to deploy our model.

# Column Transformer will enable us to apply different preprocessing
# transformations to diff subsets of the code. This is useful for us since our 
# data is a mix of numerical and categorical data.
from sklearn.compose import ColumnTransformer

# OneHotEncoder converts categorical features into a binary vector format
# which is more suitable to be inputted to a model.
from sklearn.preprocessing import OneHotEncoder

# Pipeline enables the streamlining workflows by creating a sequence of 
# transformation and estimator steps.
from sklearn.pipeline import Pipeline

# Standard Scaler normalizes the numerical data to be centred around zero.
from sklearn.preprocessing import StandardScaler

# RandomForestRegressor is an ensemble learning method. It operates using
# multiple decision trees and outputting the average prediction of the trees.
# It is called an ensemble method because of this reason - it combines models.
from sklearn.ensemble import RandomForestRegressor

# XGBRegressor is a gradient boosting algorithm. Gradient boosting is an
# ensemble method of building models sequentially - each model corrects the previous one.
from xgboost import XGBRegressor

# r2_score provides a measure of how well the predictions approximate the real
# data points. mean absolute error captures the average magnitude of the errors
# in the predictions, regardless of the direction.
from sklearn.metrics import r2_score,mean_absolute_error

In [16]:
# The next step is to create a column transformer for the columns with categorical values.
# We will be subjecting all three columns to the same transformation - OneHotEncoder.
# The other columns are left untouched.
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output = False,drop = 'first'),['batting_team', 'bowling_team', 'city'])
], remainder = 'passthrough')

In [17]:
# We define the pipeline
# First step is to apply the transformer defined above to hot encode the categorical values.
# Second step is to normalize the feature values - ensures all features are on a similar scale.
# Third Step is to train a XGBoost regression model.
# XGB Regressor by its design of working with multiple decision trees, captures complex relationships
# between the features very well. Being a regressor, it is suitable for a continous target.
pipe = Pipeline(steps = [
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', XGBRegressor(n_estimator = 1000, learning_rate = 0.2, max_depth = 12, random_state = 1))
])

In [20]:
# We train the pipeline define above using our X and y train dataset.
# X captures the input, and y has the target variable.
pipe.fit(X_train, y_train)
# Now we use the trained model to make predictions on the test dataset.
y_pred = pipe.predict(X_test)
# Print the Accuracy (R2 Score) of the predictions.
print(f"The R2 Score for the model is: {r2_score(y_test, y_pred): .4f}")
# Also print the absolute mean error.
print(f"The mean absolute error is: {mean_absolute_error(y_test,y_pred): .4f}")

Parameters: { "n_estimator" } are not used.



The R2 Score for the model is:  0.9852
The mean absolute error is:  1.9611


In [27]:
# Pickle dump the model to be used elsewhere in the future, if need be.
pickle.dump(pipe, open('pipe_first_innings.pkl', 'wb'))