In [None]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lightgbm as lgb
import category_encoders as ce
from lightgbm import LGBMRanker
from scipy.stats import kendalltau
from sklearn.metrics import ndcg_score
from scipy.stats import spearmanr


In [2]:
df = pd.read_excel('RCBKKR.xlsx')

In [3]:
df['Rank'] = df.groupby(['season', 'bowler'])['Total Points'].rank(method='min', ascending=False)

In [4]:
df['Rank'].describe()

count    51047.000000
mean        25.215801
std         28.097198
min          1.000000
25%          4.000000
50%         11.000000
75%         45.000000
max        198.000000
Name: Rank, dtype: float64

In [5]:
df['Rank'] = df['Rank'].astype(int)

In [6]:
df.head()

Unnamed: 0,matches,match_id,event_name,season,match_type,match_number,match_date,team,batter_id,batter,...,Year,known_as,Batter_role,retained,base_price,sold_price,season_info,match_number_info,stage,Rank
0,1,335982,Indian Premier League,2008,T20,1.0,2008-04-18,Royal Challengers Bangalore,af7dadf7,B Akhil,...,,,-1,-1,5326200,5907240,2007/08,1.0,Points Match,58
1,1,335982,Indian Premier League,2008,T20,1.0,2008-04-18,Kolkata Knight Riders,66b30f71,AB Dinda,...,,,-1,-1,4505200,15599800,2007/08,1.0,Points Match,3
2,1,335982,Indian Premier League,2008,T20,1.0,2008-04-18,Kolkata Knight Riders,66b30f71,AB Dinda,...,,,-1,-1,5038600,15599800,2007/08,1.0,Points Match,71
3,1,335982,Indian Premier League,2008,T20,1.0,2008-04-18,Royal Challengers Bangalore,f3cb53a1,MV Boucher,...,2008.0,MV Boucher,4,-1,8702000,19579500,2007/08,1.0,Points Match,58
4,1,335982,Indian Premier League,2008,T20,1.0,2008-04-18,Royal Challengers Bangalore,d0513f63,CL White,...,2008.0,CL White,1,-1,4351000,21755000,2007/08,1.0,Points Match,58


In [7]:
df.shape

(51047, 46)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51047 entries, 0 to 51046
Data columns (total 46 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   matches                                  51047 non-null  int64         
 1   match_id                                 51047 non-null  int64         
 2   event_name                               51047 non-null  object        
 3   season                                   51047 non-null  int64         
 4   match_type                               51047 non-null  object        
 5   match_number                             48173 non-null  float64       
 6   match_date                               51047 non-null  datetime64[ns]
 7   team                                     51047 non-null  object        
 8   batter_id                                51047 non-null  object        
 9   batter                                 

In [9]:
# Calculate the number of null values in each column
null_values = df.isnull().sum()

# Calculate the percentage of null values in each column
null_percentage = (null_values / len(df)) * 100

# Combine the null values and their percentages into a DataFrame
null_summary = pd.DataFrame({'Null Values': null_values, 'Percentage': null_percentage})

print(null_summary)

                                         Null Values  Percentage
matches                                            0    0.000000
match_id                                           0    0.000000
event_name                                         0    0.000000
season                                             0    0.000000
match_type                                         0    0.000000
match_number                                    2874    5.630106
match_date                                         0    0.000000
team                                               0    0.000000
batter_id                                          0    0.000000
batter                                             0    0.000000
Batter Batting IR                                  0    0.000000
Batter Bowling IR                                  0    0.000000
bowler                                             0    0.000000
Bowler Batting IR                                  0    0.000000
Bowler Bowling IR        

In [10]:
df['Year_Since'] = 2025 - df['season']

df['Year_Since'].unique()

array([17, 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1],
      dtype=int64)

In [11]:
unique_pairs = df[['Year_Since', 'season']].drop_duplicates()
print(unique_pairs)

       Year_Since  season
0              17    2008
1580           16    2009
3254           15    2010
4570           14    2011
6716           13    2012
9110           12    2013
12108          11    2014
13965          10    2015
16365           9    2016
18739           8    2017
20923           7    2018
24191           6    2019
27409           5    2020
30531           4    2021
33989           3    2022
39015           2    2023
44635           1    2024


In [12]:
columns_to_drop = ['matches', 'season', 'match_id', 'event_name', 'match_type', 'match_number', 'match_date', 'batter_id', 'Year', 'known_as', 'season_info',
                   'match_number_info', 'Total Points']

df = df.drop(columns=columns_to_drop)

In [13]:
df_2024 = df[df['Year_Since'] == 1]

df = df[df['Year_Since'] != 1]

In [14]:
# List of categorical columns to encode
categorical_columns = ['team', 'batter', 'bowler', 'city', 'venue', 'Batter_role', 'stage']

# Label Encoding

In [16]:
# Initialize the target encoder
target_encoder = ce.TargetEncoder(cols=categorical_columns)

# Fit the encoder on the training data and transform the training data
df_encoded = target_encoder.fit_transform(df[categorical_columns], df['Rank'])

# Transform the validation data using the same encoder
df_2024_encoded = target_encoder.transform(df_2024[categorical_columns])

# Replace the original categorical columns with the encoded columns in the dataframes
df[categorical_columns] = df_encoded
df_2024[categorical_columns] = df_2024_encoded

# Create a coding map to reverse the encoding
coding_map = {}
for col in categorical_columns:
    # Fit the encoder on a single column
    single_col_encoder = ce.TargetEncoder()
    single_col_encoder.fit(df[col], df['Rank'])
    # Create the mapping for the single column
    coding_map[col] = {
        category: single_col_encoder.transform(pd.Series([category])).iloc[0, 0]  # Extract scalar value
        for category in df[col].unique()
    }

# Display the coding map for verification
print("Coding Map:")
for col, mapping in coding_map.items():
    print(f"{col}: {mapping}")

Coding Map:
team: {19.186147186147185: 19.186147186147185, 19.99869659666908: 19.99869659666908, 31.032472939217318: 31.032472939217318, 28.75386148480319: 28.75386148480319, 20.457975986277873: 20.457975986277873, 31.233720104191335: 31.233720104191335, 25.424163783160324: 25.424163783160324, 34.86161316428379: 34.86161316428379, 18.48717949073664: 18.48717949073664, 19.81121495327103: 19.81121495327103, 24.535549399815327: 24.535549399815327, 20.612716763005814: 20.612716763005814, 26.697022767075307: 26.697022767075307, 16.95886075949486: 16.95886075949486, 22.9626572327044: 22.9626572327044, 33.69094304388422: 33.69094304388422, 37.7258883248731: 37.7258883248731, 30.865211810012838: 30.865211810012838}
batter: {31.854937128382424: 31.854937128382424, 47.12495220993374: 47.12495220993374, 30.70466382570109: 30.70466382570109, 46.1309636930314: 46.1309636930314, 5.825057295645531: 5.825057295645531, 30.581169740321023: 30.581169740321023, 40.50081536644667: 40.50081536644667, 8.1108

In [18]:
# Reverse the encoding for each categorical column
for col in categorical_columns:
    reverse_map = {v: k for k, v in coding_map[col].items()}
    df_2024[col] = df_2024[col].map(reverse_map)

# Display the first few rows of the dataframe to verify the changes
print("\nReversed Validation Data:")
print(df_2024)


Reversed Validation Data:
            team     batter  Batter Batting IR  Batter Bowling IR     bowler  \
44635  34.861613  42.632812                120                120  27.308174   
44636  34.861613  42.632812                120                120  52.102896   
44637  34.861613  45.581924                120                 24  27.308174   
44638  34.861613  45.581924                120                 24  52.102896   
44639        NaN  31.948979                120                120  43.579084   
...          ...        ...                ...                ...        ...   
51042  24.535549  27.798664                120                120  51.041139   
51043  19.998697   8.676305                120                120  20.394673   
51044  24.535549  27.798664                120                120  52.144904   
51045  19.998697  33.467836                120                 49  55.365159   
51046  19.998697  13.340040                120                120   8.035367   

       Bowle

In [17]:
# Define the input and output variables
X = df.drop(columns=['Rank'])
y = df['Rank']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
y_train.describe()

count    202114.000000
mean         39.951562
std          27.361289
min           1.000000
25%          19.000000
50%          38.000000
75%          56.000000
max         209.000000
Name: Rank, dtype: float64

In [None]:
# Ensures no values exceed the expected range
y_train = y_train.clip(0, 30)

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    n_estimators=500
)


# Number of matches in training data
num_matches = len(X_train) // 242  

# Create a list where each match has 242 records
group_sizes = np.full(num_matches, 242, dtype=int)

# Adjust the last group to ensure total sum matches X_train
difference = len(X_train) - sum(group_sizes)
if difference != 0:
    group_sizes[-1] += difference  # Absorb the difference in the last group

print("New Sum of group_sizes:", sum(group_sizes))
print("Expected dataset size:", len(X_train))

# Fit the model with the corrected group parameter
ranker.fit(X_train, y_train, group=group_sizes)

New Sum of group_sizes: 202114
Expected dataset size: 202114
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4852
[LightGBM] [Info] Number of data points in the train set: 202114, number of used features: 33


In [None]:
# Make predictions
y_pred = ranker.predict(X_test)

Mean Absolute Error (MAE): 45.49554464363356
Mean Squared Error (MSE): 2728.7772698155104
Root Mean Squared Error (RMSE): 52.23769969873779
R-squared: -2.5886215487244537
Adjusted R-squared: -2.590966820753524


In [None]:
tau, _ = kendalltau(y_test, y_pred)
print("Kendall’s Tau:", tau)


rho, _ = spearmanr(y_test, y_pred)
print("Spearman’s Rank Correlation:", rho)



ndcg = ndcg_score([y_test], [y_pred])
print("NDCG Score:", ndcg)

Kendall’s Tau: 0.38486837835297566
Spearman’s Rank Correlation: 0.5536270765710256
NDCG Score: 0.9467344448418078


## Mean Reciprocal Rank (MRR)

In [63]:
def mean_reciprocal_rank(y_true, y_pred):
    """
    y_true: List of lists containing relevant item(s) for each query.
    y_pred: List of lists containing ranked predictions for each query.
    """
    reciprocal_ranks = []

    # Convert Pandas Series to lists (if applicable)
    y_true = y_true.tolist() if isinstance(y_true, pd.Series) else y_true
    y_pred = y_pred.tolist() if isinstance(y_pred, pd.Series) else y_pred

    for true_labels, pred_ranking in zip(y_true, y_pred):
        # Debugging Output
        print(f"True Labels: {true_labels}, Pred Ranking: {pred_ranking}")

        # Ensure both are lists
        if isinstance(true_labels, (int, float)):  
            true_labels = [true_labels]
        if isinstance(pred_ranking, (int, float)):  
            pred_ranking = [pred_ranking]

        # Ensure data type consistency
        true_labels = [int(label) for label in true_labels]
        pred_ranking = [int(pred) for pred in pred_ranking]

        for rank, pred in enumerate(pred_ranking, start=1):
            if pred in true_labels:  # First relevant item found
                reciprocal_ranks.append(1 / rank)
                break
        else:
            reciprocal_ranks.append(0)  # No relevant item found

    return np.mean(reciprocal_ranks)


print("MRR:", mean_reciprocal_rank(y_test, y_pred))


True Labels: 9, Pred Ranking: -7.341092610523712
True Labels: 58, Pred Ranking: 1.1790072993968788
True Labels: 3, Pred Ranking: -8.439119731487034
True Labels: 59, Pred Ranking: 3.2044586918162117
True Labels: 25, Pred Ranking: -6.163337669744326
True Labels: 132, Pred Ranking: -6.348172890387569
True Labels: 20, Pred Ranking: -10.012434051513898
True Labels: 66, Pred Ranking: -6.545230048214446
True Labels: 62, Pred Ranking: -4.381228740585481
True Labels: 2, Pred Ranking: -8.959301802288051
True Labels: 72, Pred Ranking: -7.7792671180304245
True Labels: 33, Pred Ranking: -1.4995980403770601
True Labels: 9, Pred Ranking: -3.3343833450958043
True Labels: 31, Pred Ranking: -8.359222443008607
True Labels: 11, Pred Ranking: -11.40418411764162
True Labels: 35, Pred Ranking: -5.422781735567374
True Labels: 9, Pred Ranking: -6.350629863618318
True Labels: 4, Pred Ranking: -4.90839584006249
True Labels: 29, Pred Ranking: -8.22361578741964
True Labels: 48, Pred Ranking: -6.11582583635063
True

## Precision at K (P@K) for top k rankings

In [59]:
def precision_at_k(y_true, y_pred, k):
    """
    y_true: List of sets containing relevant items for each query.
    y_pred: List of lists containing ranked predictions for each query.
    k: Number of top predictions to consider.
    """
    precisions = []
    
    # Convert Pandas Series to list
    y_true = y_true.tolist() if isinstance(y_true, pd.Series) else y_true
    y_pred = y_pred.tolist() if isinstance(y_pred, pd.Series) else y_pred
    
    # Ensure both y_true and y_pred are lists of lists
    if isinstance(y_true[0], (int, float)):  
        y_true = [[label] for label in y_true]
    if isinstance(y_pred[0], (int, float)):  
        y_pred = [[pred] for pred in y_pred]

    for true_labels, pred_ranking in zip(y_true, y_pred):
        top_k_preds = pred_ranking[:k]  # Take top K predictions
        #relevant_count = sum(1 for pred in top_k_preds if pred in true_labels)
        relevant_count = sum(1 for pred in top_k_preds if pred == true_labels[0])
        precisions.append(relevant_count / k)

    return np.mean(precisions)

# Example usage
print("P@3:", precision_at_k(y_test, y_pred, k=30))


P@3: 0.0
