<a href="https://colab.research.google.com/github/amalsalilan/IPL_Infographics_Data_Analytics_-_Data_Visualization_Infosys_Internship_Oct2024/blob/Nagira/Final_notebook_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xg
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the Data
data = pd.read_csv('/content/sample_data/updated_all_season_details.csv', low_memory=False)

In [7]:
# Calculate Cumulative Score After Each Ball
data['cumulative_runs'] = data.groupby(['season', 'match_id', 'current_innings'])['runs'].cumsum()


In [11]:
# Display a sample of data to check cumulative calculation
print(data[['season', 'match_id', 'home_team', 'away_team', 'current_innings', 'over', 'ball', 'runs', 'cumulative_runs']].head())

   season  match_id home_team away_team current_innings  over  ball  runs  \
0    2023   1359475        GT       CSK             CSK     1     1     0   
1    2023   1359475        GT       CSK             CSK     1     2     1   
2    2023   1359475        GT       CSK             CSK     1     3     0   
3    2023   1359475        GT       CSK             CSK     1     4     1   
4    2023   1359475        GT       CSK             CSK     1     5     0   

   cumulative_runs  
0                0  
1                1  
2                1  
3                2  
4                2  


In [12]:
# Select the input features (including cumulative_runs) and the target variable (runs)
features = data[['season', 'match_id', 'home_team', 'away_team', 'current_innings', 'over', 'ball', 'batsman1_runs', 'batsman1_balls']]
target = data['cumulative_runs']

In [13]:
object_cols = features.select_dtypes(include=['object']).columns.tolist()

In [14]:
# Convert categorical columns to numerical values using one-hot encoding
features = pd.get_dummies(features, columns=object_cols, drop_first=True)

In [15]:
features

Unnamed: 0,season,match_id,over,ball,batsman1_runs,batsman1_balls,home_team_DC,home_team_GL,home_team_GT,home_team_KKR,...,current_innings_KKR,current_innings_Kochi,current_innings_LSG,current_innings_MI,current_innings_PBKS,current_innings_PWI,current_innings_RCB,current_innings_RPS,current_innings_RR,current_innings_SRH
0,2023,1359475,1,1,0,1,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,2023,1359475,1,2,0,2,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,2023,1359475,1,3,0,1,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,2023,1359475,1,4,1,2,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,2023,1359475,1,5,0,3,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242545,2008,336040,20,2,7,7,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
242546,2008,336040,20,3,8,8,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
242547,2008,336040,20,4,9,9,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
242548,2008,336040,20,4,6,5,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [16]:
# Step 4: Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Model 1: Linear Regression

In [17]:
lr_model = LinearRegression()

In [18]:
lr_model.fit(X_train, y_train)

In [19]:
lr_predictions = lr_model.predict(X_test)

# Model 2: Random Forest Regressor

In [20]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [21]:
rf_model.fit(X_train, y_train)

In [22]:
rf_predictions = rf_model.predict(X_test)

# Model 3: Ridge Regression

In [23]:
rdg_model = Ridge(solver ='auto', random_state = 42)

In [24]:
rdg_model.fit(X_train, y_train)

In [25]:
# predicting the y_test
rdg_predictions = rdg_model.predict(X_test)

# Model 4: XGBoost Regression

In [30]:
xgb_model= XGBRegressor(n_estimators= 100, max_depth= 4, min_samples_split= 5, learning_rate= 0.01, random_state= 42)

In [31]:
xgb_model.fit(X_train, y_train)

In [32]:
xgb_predictions = xgb_model.predict(X_test)

# Evaluation metrics

In [29]:
# Linear Regression Metrics
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)
print(f"Linear Regression\t- MSE: {lr_mse:.4f}, R2 Score: {lr_r2:.4f}")

# Random Forest Metrics
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
print(f"Random Forest Regressor - MSE: {rf_mse:.4f}, R2 Score: {rf_r2:.4f}")

#Ridge Regression
rdg_mse = mean_squared_error(y_test, rdg_predictions)
rdg_r2 = r2_score(y_test, rdg_predictions)
print(f"Ridge Regression\t- MSE: {rdg_mse:.4f}, R2 Score: {rdg_r2:.4f}")

#XGB regression
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
print(f"XGBoost Regression\t- MSE: {xgb_mse:.4f}, R2 Score: {xgb_r2:.4f}")

Linear Regression	- MSE: 347.1593, R2 Score: 0.8612
Random Forest Regressor - MSE: 138.1822, R2 Score: 0.9447
Ridge Regression	- MSE: 347.1591, R2 Score: 0.8612
XGBoost Regression	- MSE: 702.1949, R2 Score: 0.7192


# Make a new prediction

In [33]:
new_data = pd.DataFrame({
    'season': [2017],
    'match_id': [1082601],
    'home_team': ['KKR'],
    'away_team': ['PBKS'],
    'current_innings': ['KKR'],
    'over': [15],
    'ball': [3],
    'batsman1_runs': [64],
    'batsman1_balls': [41],
})

# Convert categorical columns to match the model’s encoding
new_data = pd.get_dummies(new_data, columns=['home_team', 'away_team'], drop_first=True)

# Align new data columns with training features
new_data = new_data.reindex(columns=X_train.columns, fill_value=0)

# Make predictions
predicted_runs_lr = lr_model.predict(new_data)
predicted_runs_rf = rf_model.predict(new_data)
predicted_runs_rdg = rdg_model.predict(new_data)
predicted_runs = xgb_model.predict(new_data)
print("Predicted Runs for the new data Linear Regression:", round(predicted_runs[0]))
print("Predicted Runs for the new data Random Forest:", round(predicted_runs_rf[0]))
print("Predicted Runs for the new data Ridge Regression:", round(predicted_runs_rdg[0]))
print("Predicted Runs for the new data XGBoost Regression:", round(predicted_runs[0]))

Predicted Runs for the new data Linear Regression: 107
Predicted Runs for the new data Random Forest: 125
Predicted Runs for the new data Ridge Regression: 128
Predicted Runs for the new data XGBoost Regression: 107


# Parameter Hypertuning using RandomSearchCV

In [41]:
param_distributions_ridge = {
    'alpha': [0.01, 0.1, 1],  # Regularization strength
    'solver': ['svd', 'lsqr', 'sag'],  # Solver algorithm
    'fit_intercept': [True, False],  # Whether to calculate the intercept
    'tol': [1e-3, 1e-4],  # Tolerance for stopping criteria
}

In [42]:
random_search = RandomizedSearchCV(
    estimator=rdg_model,
    param_distributions=param_distributions_ridge,
    n_iter=5,  # Number of parameter settings that are sampled.
    scoring='neg_mean_squared_error', # used neg_mean_squared_error for regression
    cv=3,  # Number of folds in cross-validation
    random_state=42,  # For reproducibility
    n_jobs=-1  # Use all available cores for parallel processing
)

In [44]:
random_search.fit(X_train, y_train)

In [48]:
# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'tol': 0.0001, 'solver': 'svd', 'fit_intercept': True, 'alpha': 0.1}


In [49]:
# Get the best estimator
best_rdg_model = random_search.best_estimator_

In [50]:
# Make predictions using the best estimator
rdg_predictions = best_rdg_model.predict(X_test)

In [53]:
# Evaluate the model
rdg_mse = mean_squared_error(y_test, rdg_predictions) # Assuming y_test is your test target
rdg_r2 = r2_score(y_test, rdg_predictions)
print(f"Random Forest Regressor - MSE: {rdg_mse:.4f}, R2 Score: {rdg_r2:.4f}")

Random Forest Regressor - MSE: 347.1593, R2 Score: 0.8612


In [55]:
rdg_new_predictions=best_rdg_model.predict(new_data)
print("Predicted Runs for the new data Random Forest:", round(rdg_new_predictions[0]))

Predicted Runs for the new data Random Forest: 128
