In [1]:
# Start by importing the relevant libraries.
import pandas as pd
import pickle
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# load the final DataFrame from dataset exported under feature_extraction
final_df = pickle.load(open('dataset_level3_first_innings.pkl', 'rb'))

In [3]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
22713,Australia,England,Manchester,86,35,7,6.070588,30.0,145
57953,England,Sri Lanka,Delhi,161,5,7,8.400000,64.0,171
18502,South Africa,India,Johannesburg,31,86,8,5.470588,29.0,126
36915,New Zealand,Zimbabwe,Harare,94,65,10,10.254545,52.0,187
45449,South Africa,Australia,Centurion,90,26,4,5.744681,22.0,128
...,...,...,...,...,...,...,...,...,...
44562,South Africa,Sri Lanka,Hambantota,134,6,4,7.052632,43.0,145
36899,New Zealand,Zimbabwe,Harare,55,81,10,8.461538,37.0,187
8896,South Africa,Pakistan,Johannesburg,153,9,7,8.270270,53.0,188
58659,England,West Indies,Kolkata,108,40,6,8.100000,60.0,155


In [13]:
final_df['city'].unique()

array(['Manchester', 'Delhi', 'Johannesburg', 'Harare', 'Centurion',
       'Barbados', 'Mirpur', 'Wellington', 'Adelaide', 'Dhaka', 'London',
       'Southampton', 'Pallekele', 'Lauderhill', 'Kolkata',
       'Christchurch', 'Bangalore', 'Chittagong', 'St Kitts', 'Colombo',
       'Cardiff', 'Hambantota', 'Melbourne', 'St Lucia', 'Lahore',
       'Cape Town', 'Nagpur', 'Sydney', 'Mount Maunganui', 'Abu Dhabi',
       'Dubai', 'Sharjah', 'Trinidad', 'Auckland', 'Mumbai', 'Durban',
       'Guyana', 'Chandigarh', 'Greater Noida', 'Nottingham', 'Hamilton'],
      dtype=object)

In [4]:
# Split the dataset into training and testing data.
# Extract the input.
X = final_df.drop(columns = ['runs_x'])
# Extract the output.
y = final_df['runs_x']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [5]:
# Now we import the relevant classes to deploy our model.

# Column Transformer will enable us to apply different preprocessing
# transformations to diff subsets of the code. This is useful for us since our 
# data is a mix of numerical and categorical data.
from sklearn.compose import ColumnTransformer

# OneHotEncoder converts categorical features into a binary vector format
# which is more suitable to be inputted to a model.
from sklearn.preprocessing import OneHotEncoder

# Pipeline enables the streamlining workflows by creating a sequence of 
# transformation and estimator steps.
from sklearn.pipeline import Pipeline

# Standard Scaler normalizes the numerical data to be centred around zero.
from sklearn.preprocessing import StandardScaler

# RandomForestRegressor is an ensemble learning method. It operates using
# multiple decision trees and outputting the average prediction of the trees.
# It is called an ensemble method because of this reason - it combines models.
from sklearn.ensemble import RandomForestRegressor

# XGBRegressor is a gradient boosting algorithm. Gradient boosting is an
# ensemble method of building models sequentially - each model corrects the previous one.
from xgboost import XGBRegressor

# r2_score provides a measure of how well the predictions approximate the real
# data points. mean absolute error captures the average magnitude of the errors
# in the predictions, regardless of the direction.
from sklearn.metrics import r2_score,mean_absolute_error

In [7]:
# The next step is to create a column transformer for the columns with categorical values.
# We will be subjecting all three columns to the same transformation - OneHotEncoder.
# The other columns are left untouched.
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output = False,drop = 'first'),['batting_team', 'bowling_team', 'city'])
], remainder = 'passthrough')

In [8]:
# We define the pipeline
# First step is to apply the transformer defined above to hot encode the categorical values.
# Second step is to normalize the feature values - ensures all features are on a similar scale.
# Third Step is to train a XGBoost regression model.
# XGB Regressor by its design of working with multiple decision trees, captures complex relationships
# between the features very well. Being a regressor, it is suitable for a continous target.
pipe = Pipeline(steps = [
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', XGBRegressor(n_estimator = 1000, learning_rate = 0.2, max_depth = 12, random_state = 1))
])

In [10]:
# We train the pipeline define above using our X and y train dataset.
# X captures the input, and y has the target variable.
pipe.fit(X_train, y_train)
# Now we use the trained model to make predictions on the test dataset.
y_pred = pipe.predict(X_test)
# Print the Accuracy (R2 Score) of the predictions.
print(f"The R2 Score for the model is: {r2_score(y_test, y_pred): .4f}")
# Also print the absolute mean error.
print(f"The mean absolute error is: {mean_absolute_error(y_test,y_pred): .4f}")

Parameters: { "n_estimator" } are not used.



The R2 Score for the model is:  0.9849
The mean absolute error is:  2.1836


In [11]:
# Pickle dump the model to be used elsewhere in the future, if need be.
pickle.dump(pipe, open('pipe_first_innings.pkl', 'wb'))