In [1]:
# Start by importing the relevant libraries.
import pandas as pd
import pickle
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# load the final DataFrame from dataset exported under feature_extraction
final_df = pickle.load(open('dataset_level3_first_innings.pkl', 'rb'))

In [3]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
23931,Pakistan,Bangladesh,St Lucia,138,30,10,9.200000,58.0,172
31055,England,Pakistan,Abu Dhabi,72,48,7,6.000000,24.0,129
27291,Sri Lanka,India,Chandigarh,188,7,3,9.982301,36.0,206
15188,West Indies,England,London,57,90,9,11.400000,57.0,208
23077,Bangladesh,West Indies,St Kitts,85,32,3,5.795455,23.0,118
...,...,...,...,...,...,...,...,...,...
40544,India,Australia,Mirpur,112,24,6,7.000000,46.0,159
44123,Pakistan,Sri Lanka,Colombo,59,71,8,7.224490,45.0,175
9683,South Africa,England,Centurion,60,90,10,12.000000,60.0,222
31226,West Indies,Bangladesh,Mirpur,129,4,3,6.672414,33.0,132


In [4]:
# Split the dataset into training and testing data.
# Extract the input.
X = final_df.drop(columns = ['runs_x'])
# Extract the output.
y = final_df['runs_x']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [5]:
# Now we import the relevant classes to deploy our model.

# Column Transformer will enable us to apply different preprocessing
# transformations to diff subsets of the code. This is useful for us since our 
# data is a mix of numerical and categorical data.
from sklearn.compose import ColumnTransformer

# OneHotEncoder converts categorical features into a binary vector format
# which is more suitable to be inputted to a model.
from sklearn.preprocessing import OneHotEncoder

# Pipeline enables the streamlining workflows by creating a sequence of 
# transformation and estimator steps.
from sklearn.pipeline import Pipeline

# Standard Scaler normalizes the numerical data to be centred around zero.
from sklearn.preprocessing import StandardScaler

# RandomForestRegressor is an ensemble learning method. It operates using
# multiple decision trees and outputting the average prediction of the trees.
# It is called an ensemble method because of this reason - it combines models.
from sklearn.ensemble import RandomForestRegressor

# XGBRegressor is a gradient boosting algorithm. Gradient boosting is an
# ensemble method of building models sequentially - each model corrects the previous one.
from xgboost import XGBRegressor

# r2_score provides a measure of how well the predictions approximate the real
# data points. mean absolute error captures the average magnitude of the errors
# in the predictions, regardless of the direction.
from sklearn.metrics import r2_score,mean_absolute_error

In [6]:
# The next step is to create a column transformer for the columns with categorical values.
# We will be subjecting all three columns to the same transformation - OneHotEncoder.
# The other columns are left untouched.
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output = False,drop = 'first'),['batting_team', 'bowling_team', 'city'])
], remainder = 'passthrough')

In [7]:
# We define the pipeline
# First step is to apply the transformer defined above to hot encode the categorical values.
# Second step is to normalize the feature values - ensures all features are on a similar scale.
# Third Step is to train a XGBoost regression model.
# XGB Regressor by its design of working with multiple decision trees, captures complex relationships
# between the features very well. Being a regressor, it is suitable for a continous target.
pipe = Pipeline(steps = [
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', XGBRegressor(n_estimator = 1000, learning_rate = 0.2, max_depth = 12, random_state = 1))
])

In [8]:
# We train the pipeline define above using our X and y train dataset.
# X captures the input, and y has the target variable.
pipe.fit(X_train, y_train)
# Now we use the trained model to make predictions on the test dataset.
y_pred = pipe.predict(X_test)
# Print the Accuracy (R2 Score) of the predictions.
print(f"The R2 Score for the model is: {r2_score(y_test, y_pred): .4f}")
# Also print the absolute mean error.
print(f"The mean absolute error is: {mean_absolute_error(y_test,y_pred): .4f}")

Parameters: { "n_estimator" } are not used.



The R2 Score for the model is:  0.9869
The mean absolute error is:  1.8987


In [9]:
# Pickle dump the model to be used elsewhere in the future, if need be.
pickle.dump(pipe, open('pipe_first_innings.pkl', 'wb'))