In [217]:
import warnings
# Silences every warning category for the rest of the session, so pandas
# chained-assignment warnings and library deprecation notices are hidden too.
warnings.filterwarnings('ignore')

In [218]:
import category_encoders as ce
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from fuzzywuzzy import process
from lightgbm import LGBMRanker
from scipy.stats import chi2_contingency
from scipy.stats import kendalltau
from scipy.stats import spearmanr
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [219]:
# Load the raw workbook; the relative path is resolved against the kernel's
# current working directory.
data = pd.read_excel('RCBKKR.xlsx')

In [260]:
# Work on a copy so `data` keeps the frame exactly as it was loaded.
df = data.copy()

# (rows, columns) of the working frame.
df.shape



In [221]:
# Missing-value audit: absolute count per column and its share of all rows.
null_values = df.isna().sum()
null_percentage = null_values / len(df) * 100

# One row per column of df, with the count and the percentage side by side.
null_summary = null_values.to_frame(name='Null Values').assign(Percentage=null_percentage)

print(null_summary)



In [222]:
# Column dtypes and non-null counts before any cleaning.
df.info()



In [223]:
# Pairwise correlations between the int64/float64 columns, drawn as an
# annotated heatmap.
int_float_columns = df.select_dtypes(include=['int64', 'float64'])
correlation_matrix = int_float_columns.corr()

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()



In [224]:
# Chi-square test of independence between each categorical feature and the
# target, using the crosstab of the feature against 'Total Points'.
categorical_columns = ['team', 'batter', 'bowler', 'city', 'venue', 'Batter_role', 'retained', 'stage']

# feature name -> p-value (second element of chi2_contingency's result)
chi2_results = {
    col: chi2_contingency(pd.crosstab(df[col], df['Total Points']))[1]
    for col in categorical_columns
}

# Tabulate the p-values, one row per feature, for easier reading.
chi2_results_df = pd.DataFrame(
    {'Feature': list(chi2_results), 'p-value': list(chi2_results.values())}
)

print(chi2_results_df)



# Drop & Scale Columns

In [261]:
# Identifier, date and bookkeeping columns that are not used as model features.
columns_to_drop = [
    'matches',
    'match_id',
    'event_name',
    'match_type',
    'match_number',
    'matches_played_till_date',
    'retained',
    'matches_played_till_date_2',
    'match_date',
    'batter_id',
    'Year',
    'known_as',
    'season_info',
    'match_number_info',
]

# drop() raises KeyError if a listed column is missing, so re-running this cell
# on the already-trimmed frame fails.
df = df.drop(columns=columns_to_drop)

## Scaling is not needed for Random Forest or XGBoost; skipping it also avoids having to reverse-scale the predictions afterwards

In [262]:
# Columns that remain after the drop above.
df.columns



In [263]:
# Alphabetical list of the distinct team names, to spot spelling variants.
distinct_teams = df['team'].drop_duplicates().tolist()
sorted_unique_teams = sorted(distinct_teams)
print(sorted_unique_teams)



In [264]:
# Define the replacements: old/alternate team name -> name used from here on.
# NOTE(review): 'Gujarat Lions' -> 'Gujarat Titans' and 'Deccan Chargers' ->
# 'Sunrisers Hyderabad' merge what may be distinct franchises — confirm this is
# intended and not just a spelling normalisation.
replacements = {
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Delhi Daredevils': 'Delhi Capitals',
    'Gujarat Lions': 'Gujarat Titans',
    'Kings XI Punjab': 'Punjab Kings',
    'Rising Pune Supergiant': 'Rising Pune Supergiants',
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru'
}

# Replace the team names in the 'team' column; names not in the dict are kept.
df['team'] = df['team'].replace(replacements)

# Season/Year conversion

In [265]:
# Distinct season values present in the data.
df['season'].unique()



In [None]:
# Express each season as "years before 2025" (season 2024 -> 1, 2023 -> 2, ...).
REFERENCE_YEAR = 2025
df['Year_Since'] = REFERENCE_YEAR - df['season']

df['Year_Since'].unique()



In [267]:
# Sanity check: one row per distinct (Year_Since, season) combination.
unique_pairs = df[['Year_Since', 'season']].drop_duplicates()
print(unique_pairs)



In [268]:
# 'season' is now represented by 'Year_Since', so the original column is removed.
df = df.drop(columns=['season'])

# Prepare Validation Set (2024)

In [269]:
# Number of rows per Year_Since value.
df['Year_Since'].value_counts()



In [270]:
# Hold out the 2024 season (Year_Since == 1) as the validation set and keep
# every other season for training.
# .copy() makes each result an independent frame: both get columns assigned
# later (encoded features, model predictions), and writing into a filtered
# slice is what pandas flags with SettingWithCopyWarning.
is_2024 = df['Year_Since'] == 1

df_2024 = df[is_2024].copy()

df = df[~is_2024].copy()

In [271]:
# (rows, columns) of the 2024 hold-out frame.
df_2024.shape



In [272]:
# (rows, columns) of the remaining training frame.
df.shape



# Removing old data

In [273]:
# Year_Since values still present in the training frame.
df['Year_Since'].unique()



In [274]:
# Remove records of seasons from 2008 till 2014 (both inclusive).
# Year_Since = 2025 - season, so 2014 -> 11 and 2008 -> 17. range() excludes its
# stop value, hence the "+ 1" so that 17 (season 2008) is covered as well.
old_year_since = range(2025 - 2014, 2025 - 2008 + 1)
# .copy() so later column assignments on df do not write into a filtered slice.
df = df[~df['Year_Since'].isin(old_year_since)].copy()

In [275]:
# Dtypes and non-null counts of the filtered training frame.
df.info()



# Target Encoding

In [282]:
# List of categorical columns to encode
# ('retained' is absent here because it was removed with the dropped columns).
categorical_columns = ['batter', 'bowler', 'team', 'city', 'venue', 'Batter_role', 'stage']

In [283]:
# Target-encode the categorical features and build a lookup that turns the
# encoded numbers back into the original labels.
#
# NOTE(review): the encoder is fitted on every row of `df`, i.e. before the
# random train/test split, so rows that later land in the test split contribute
# to the per-category target means. Fit on the training split only if an
# unbiased test score is required.
# NOTE: not idempotent — running the cell a second time would try to encode the
# already-encoded numeric columns.

# Detach both frames from whatever slice they came from before writing columns.
df = df.copy()
df_2024 = df_2024.copy()

# Initialize the target encoder
target_encoder = ce.TargetEncoder(cols=categorical_columns)

# Store the original labels before they are replaced; they are the values of
# the reverse mapping below.
original_categories = {col: df[col].copy() for col in categorical_columns}

# Fit the encoder on the training data and transform it
df_encoded = target_encoder.fit_transform(df[categorical_columns], df['Total Points'])

# Transform the validation data using the same (already fitted) encoder
df_2024_encoded = target_encoder.transform(df_2024[categorical_columns])

# Reverse mapping, one dict per column: encoded value -> original label.
# It is built row by row from the label and the encoded value of that same row,
# so every label that occurs in `df` gets an entry. If two labels share the
# same encoded value the later one wins. Encoded values that never occur in
# `df` (for example labels seen only in 2024) have no entry and will decode
# to NaN.
coding_map = {
    col: dict(zip(df_encoded[col].to_numpy(), original_categories[col].to_numpy()))
    for col in categorical_columns
}

# Replace the original categorical columns with the encoded values
df[categorical_columns] = df_encoded
df_2024[categorical_columns] = df_2024_encoded

print("Encoding completed successfully!")




# Random Forest

In [242]:
# Features / target for the training seasons and for the 2024 hold-out.
X, y = df.drop(columns=['Total Points']), df['Total Points']
X_2024, y_2024 = df_2024.drop(columns=['Total Points']), df_2024['Total Points']

# Random 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [243]:
# Create the Random Forest model: 100 trees, fixed seed for repeatable fits.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training split. fit() returns the estimator, so as the
# last expression of the cell it is also what gets displayed.
rf.fit(X_train, y_train)



In [244]:
# Make test predictions & store train predictions for meta model

# In-sample predictions: made on the same rows the forest was fitted on.
rf_train_preds = rf.predict(X_train)
# Predictions on the held-out 20% split.
rf_test_preds = rf.predict(X_test)

In [245]:
# Random Forest error metrics on the test split.
mae = mean_absolute_error(y_test, rf_test_preds)
mse = mean_squared_error(y_test, rf_test_preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, rf_test_preds)

# Adjusted R-squared penalises R-squared by the feature count p relative to
# the number of evaluated rows n.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

for label, value in [
    ('Mean Absolute Error (MAE)', mae),
    ('Mean Squared Error (MSE)', mse),
    ('Root Mean Squared Error (RMSE)', rmse),
    ('R-squared', r2),
    ('Adjusted R-squared', adjusted_r2),
]:
    print(f'{label}: {value}')



In [246]:
# Score the Random Forest on the 2024 hold-out season.
rf_2024_preds = rf.predict(X_2024)

mae_2024 = mean_absolute_error(y_2024, rf_2024_preds)
mse_2024 = mean_squared_error(y_2024, rf_2024_preds)
rmse_2024 = np.sqrt(mse_2024)
r2_2024 = r2_score(y_2024, rf_2024_preds)

# Adjusted R-squared with n = hold-out rows and p = number of features.
n_2024, p_2024 = X_2024.shape
adjusted_r2_2024 = 1 - (1 - r2_2024) * (n_2024 - 1) / (n_2024 - p_2024 - 1)

for label, value in [
    ('Mean Absolute Error (MAE)', mae_2024),
    ('Mean Squared Error (MSE)', mse_2024),
    ('Root Mean Squared Error (RMSE)', rmse_2024),
    ('R-squared', r2_2024),
    ('Adjusted R-squared', adjusted_r2_2024),
]:
    print(f'{label} for 2024: {value}')



In [247]:
# Attach the Random Forest predictions to the hold-out frame. This adds a
# non-feature column to df_2024, which the 2024 ranking cells drop again before
# predicting.
df_2024['rf_2024_preds'] = rf_2024_preds

In [284]:
# Map the encoded numbers in df_2024 back to labels via coding_map
# (encoded value -> label); values without an entry become NaN.
# NOTE(review): this overwrites df_2024 in place. Cells further down copy
# df_2024 to build the 2024 ranking features and pass them to ranker.predict,
# so in top-to-bottom order they receive the decoded columns instead of the
# numeric encodings — confirm the intended execution order or decode into a
# separate frame.
for col in categorical_columns:
    df_2024[col] = df_2024[col].map(coding_map[col])

print("Reverse encoding completed successfully!")



In [285]:
# First rows of the hold-out frame after decoding, including rf_2024_preds.
df_2024.head()



## XGBoost Model

In [159]:
# Create the XGBoost model.
# The class is referenced directly rather than as `xgb.XGBRegressor` because
# this cell rebinds the name `xgb` from the xgboost module alias to the model
# instance; going through `xgb.` raises AttributeError as soon as the cell is
# run a second time. Later cells keep using `xgb` as the fitted model.
xgb = XGBRegressor(n_estimators=100, random_state=42)

# Train the model on the same training split as the Random Forest.
xgb.fit(X_train, y_train)



In [160]:
# Make test predictions & store train predictions for meta model

# `xgb` is the fitted XGBRegressor here, not the xgboost module.
# In-sample predictions on the rows the model was fitted on.
xgb_train_preds = xgb.predict(X_train)
# Predictions on the held-out 20% split.
xgb_test_preds = xgb.predict(X_test)

In [161]:
# XGBoost error metrics on the test split.
mae = mean_absolute_error(y_test, xgb_test_preds)
mse = mean_squared_error(y_test, xgb_test_preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, xgb_test_preds)

# Adjusted R-squared with n = test rows and p = number of features.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

for label, value in [
    ('Mean Absolute Error (MAE)', mae),
    ('Mean Squared Error (MSE)', mse),
    ('Root Mean Squared Error (RMSE)', rmse),
    ('R-squared', r2),
    ('Adjusted R-squared', adjusted_r2),
]:
    print(f'{label}: {value}')



# Ranking Model

In [162]:
# Ranking target: position of each row inside its (Year_Since, bowler) group,
# where 1 is the highest 'Total Points' and ties keep their row order
# (method='first'). The target column itself is removed from the frame.
group_rank = (
    df.groupby(['Year_Since', 'bowler'])['Total Points']
    .rank(ascending=False, method='first')
    .astype(int)
)
df_rank = df.drop(columns=['Total Points']).assign(rank=group_rank)

In [163]:
# Distinct rank labels produced above.
df_rank['rank'].unique()



In [164]:
# Features and rank label for the ranking model.
X_rank, y_rank = df_rank.drop(columns=['rank']), df_rank['rank']

# Same 80/20 random split settings (seed 42) as the regression models.
X_train_rank, X_test_rank, y_train_rank, y_test_rank = train_test_split(
    X_rank,
    y_rank,
    test_size=0.2,
    random_state=42,
)

In [175]:
# LambdaRank model (LightGBM) fitted on the `rank` label.
#
# NOTE(review): LightGBM's lambdarank treats a larger label as MORE relevant,
# while `rank` is 1 for the highest 'Total Points' — confirm the label
# direction is intended.
# NOTE(review): lambdarank also requires integer labels that fit inside
# `label_gain` (31 entries by default). A clip was tried here at some point:
#     y_train_rank = y_train_rank.clip(0, 30)
# — verify the largest rank stays within that range.
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    n_estimators=500
)

# Nominal number of rows per query group ("match").
# NOTE(review): X_train_rank comes out of a shuffling train_test_split, so
# consecutive blocks of 242 rows are presumably not real matches — confirm, or
# sort by the true query key and derive the sizes from it.
ROWS_PER_GROUP = 242

# Number of full groups in the training data, plus the rows left over.
n_train_rows = len(X_train_rank)
num_matches, leftover = divmod(n_train_rows, ROWS_PER_GROUP)

if num_matches == 0:
    # Fewer rows than one nominal group: use a single group holding all rows
    # (indexing the last element of an empty size array would raise IndexError).
    group_sizes = np.array([n_train_rows], dtype=int)
else:
    group_sizes = np.full(num_matches, ROWS_PER_GROUP, dtype=int)
    # The last group absorbs the leftover rows so the sizes sum to the row count.
    group_sizes[-1] += leftover

print("New Sum of group_sizes:", group_sizes.sum())
print("Expected dataset size:", len(X_train_rank))

# Fit the model; `group` lists the size of each consecutive query group.
ranker.fit(X_train_rank, y_train_rank, group=group_sizes)







In [176]:
# Make test predictions & store train predictions for meta model

# Ranker scores on the rows it was fitted on.
lgb_train_preds = ranker.predict(X_train_rank)
# Ranker scores on the held-out ranking split.
lgb_test_preds = ranker.predict(X_test_rank)

In [177]:
# Agreement between the true rank labels and the ranker scores on the test split.
# Index 0 of each scipy result is the correlation statistic.
tau = kendalltau(y_test_rank, lgb_test_preds)[0]
print("Kendall’s Tau:", tau)

rho = spearmanr(y_test_rank, lgb_test_preds)[0]
print("Spearman’s Rank Correlation:", rho)

# ndcg_score takes 2-D input (one row per query); wrapping each sequence in a
# list scores the whole test split as a single query.
ndcg = ndcg_score([y_test_rank], [lgb_test_preds])
print("NDCG Score:", ndcg)



In [252]:
# Same ranking target as for training, computed on the 2024 hold-out:
# position within (Year_Since, bowler), 1 = highest 'Total Points'.
group_rank_2024 = (
    df_2024.groupby(['Year_Since', 'bowler'])['Total Points']
    .rank(ascending=False, method='first')
    .astype(int)
)
df_rank_2024 = df_2024.drop(columns=['Total Points']).assign(rank=group_rank_2024)

# Split into ranking features and label.
X_rank_2024, y_rank_2024 = df_rank_2024.drop(columns=['rank']), df_rank_2024['rank']

In [255]:
# 'rf_2024_preds' was added to df_2024 only, so it is removed here to match the
# feature columns built from df for the ranker.
X_rank_2024 = X_rank_2024.drop(columns=['rf_2024_preds'])

In [256]:
# Make predictions on the 2024 data using the LightGBM model
# (the output is a ranking score per row, not a rank position).
lgb_2024_preds = ranker.predict(X_rank_2024)

In [257]:
# Agreement between the true rank labels and the ranker scores on the 2024 hold-out.
# Index 0 of each scipy result is the correlation statistic.
tau = kendalltau(y_rank_2024, lgb_2024_preds)[0]
print("Kendall’s Tau:", tau)

rho = spearmanr(y_rank_2024, lgb_2024_preds)[0]
print("Spearman’s Rank Correlation:", rho)

# Wrapping each sequence in a list scores the whole hold-out as a single query.
ndcg = ndcg_score([y_rank_2024], [lgb_2024_preds])
print("NDCG Score:", ndcg)



# Metamodel

In [168]:
# Meta-features: one column per base model (Random Forest, XGBoost, LightGBM ranker).
# NOTE(review): the ranker predictions come from X_train_rank / X_test_rank; they
# line up row-for-row with the regression splits only as long as both
# train_test_split calls use the same seed and the same number of rows — confirm.
meta_columns = ['rf_pred', 'xgb_pred', 'lgb_pred']

meta_train = np.column_stack([rf_train_preds, xgb_train_preds, lgb_train_preds])
meta_test = np.column_stack([rf_test_preds, xgb_test_preds, lgb_test_preds])

meta_train_df = pd.DataFrame(meta_train, columns=meta_columns)
meta_test_df = pd.DataFrame(meta_test, columns=meta_columns)

In [169]:
# First rows of the stacked base-model predictions.
meta_train_df.head()



In [170]:
# Create the XGBoost meta-model that combines the three base-model predictions.
meta_model = XGBRegressor(n_estimators=100, random_state=42)

# Train the model
# NOTE(review): meta_train_df holds in-sample predictions (each base model
# predicted the rows it was fitted on), so the meta-model learns from optimistic
# inputs — consider out-of-fold predictions instead.
meta_model.fit(meta_train_df, y_train)



In [171]:
# Stacked predictions for the training rows and for the test split.
final_train_preds = meta_model.predict(meta_train_df)
final_test_preds = meta_model.predict(meta_test_df)

In [172]:
# Error metrics of the stacked (meta) model on the test split.

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, final_test_preds)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, final_test_preds)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared
r2 = r2_score(y_test, final_test_preds)

# Calculate Adjusted R-squared.
# p must be the number of predictors of the model being scored: the meta-model
# sees the columns of meta_test_df (the base-model predictions), not the
# original feature matrix X_test.
n, p = meta_test_df.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared: {r2}')
print(f'Adjusted R-squared: {adjusted_r2}')

