<a href="https://colab.research.google.com/github/Ziyaul2404/ML-Project-to-predict-Virat-Kohli-s-Score/blob/main/OLS_VK_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Read the match dataset from the Excel workbook into a DataFrame.
df = pd.read_excel(
    io='/content/Virat_Kohli_ML_Dataset_Final_1-650.xlsx',
)

In [None]:
# Independent variables (features).
#
# The Pitch_* and Form_* columns are indicator (dummy) groups. Keeping every
# level of a group makes its columns add up to a constant, which is perfectly
# collinear with the intercept (the "dummy-variable trap"): the OLS design
# matrix becomes rank-deficient and the individual coefficients, standard
# errors and p-values are no longer uniquely determined.
# One reference level per group ('Pitch_Bad', 'Form_Bad') is therefore left
# out; the remaining Pitch_*/Form_* coefficients are read relative to it.
# NOTE(review): assumes every row has exactly one Pitch_* and one Form_* flag
# set -- confirm against the dataset.
X_cols = [
    'Avg', 'Economy',
    'Pitch_Excellent', 'Pitch_Good', 'Pitch_Medium',   # reference: Pitch_Bad
    'Form_Excellent', 'Form_Good', 'Form_Medium',      # reference: Form_Bad
    'Chase', 'Age', 'Fav_Grounds', 'Position', 'Luck',
]

# Dependent variable (target): runs scored in the match.
y_col = 'Runs'

X = df[X_cols]
y = df[y_col]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Add an explicit intercept column to both design matrices.
# has_constant='add' is deliberate: with the default ('skip') statsmodels adds
# nothing when the frame it is given already contains a constant column. The
# model would then be fitted without a 'const' term, and the train and test
# matrices could end up with different columns depending on how the split
# happened to fall.
X_train_const = sm.add_constant(X_train, has_constant='add')
X_test_const = sm.add_constant(X_test, has_constant='add')

# Fit ordinary least squares on the training rows only.
model = sm.OLS(y_train, X_train_const).fit()

# Display the statistical summary (coefficients, standard errors, fit statistics).
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Runs   R-squared:                       0.899
Model:                            OLS   Adj. R-squared:                  0.897
Method:                 Least Squares   F-statistic:                     376.5
Date:                Fri, 19 Dec 2025   Prob (F-statistic):          1.67e-243
Time:                        11:03:41   Log-Likelihood:                -2118.8
No. Observations:                 520   AIC:                             4264.
Df Residuals:                     507   BIC:                             4319.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Avg                -0.0292      0.023     

In [None]:
# Score the fitted OLS model on the held-out rows.
y_pred = model.predict(X_test_const)

# Error in the target's own units (RMSE) and share of variance explained (R-squared).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"Root Mean Squared Error: {rmse:.2f}",
    f"R-squared Score: {r2:.2f}",
    sep="\n",
)

Root Mean Squared Error: 11.38
R-squared Score: 0.92


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1. Read the workbook
df = pd.read_excel(io='/content/Virat_Kohli_ML_Dataset_Final_1-650.xlsx')

# 2. Predictor columns (X) and response (y)
X_cols = [
    'Avg', 'Economy',
    'Pitch_Excellent', 'Pitch_Good', 'Pitch_Medium', 'Pitch_Bad',
    'Form_Excellent', 'Form_Good', 'Form_Medium', 'Form_Bad',
    'Chase', 'Age', 'Fav_Grounds', 'Position', 'Luck',
]
X, y = df[X_cols], df['Runs']

# 3. 80% of rows for fitting, 20% held out for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate regressors, each fitted on the same training split and scored on
# the same held-out split so their metrics are directly comparable.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# The loop variable is `estimator`, not `model`: notebook loop variables are
# globals, and `model` already holds the fitted OLS results from an earlier
# cell. Reusing that name here would silently replace it with the last
# regressor in the dict.
for name, estimator in models.items():
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    print(f"--- {name} ---")
    print(f"RMSE: {rmse:.2f}")
    print(f"R2 Score: {r2:.2f}\n")

--- Linear Regression ---
RMSE: 11.38
R2 Score: 0.92

--- Random Forest ---
RMSE: 12.58
R2 Score: 0.90

--- Gradient Boosting ---
RMSE: 13.01
R2 Score: 0.90



In [None]:
#Prediction_Script
import pandas as pd
from sklearn.linear_model import LinearRegression

# 1. Read the data and fit the final regressor on every row (no hold-out in this cell)
df = pd.read_excel(io='/content/Virat_Kohli_ML_Dataset_Final_1-650.xlsx')

X_cols = [
    'Avg', 'Economy',
    'Pitch_Excellent', 'Pitch_Good', 'Pitch_Medium', 'Pitch_Bad',
    'Form_Excellent', 'Form_Good', 'Form_Medium', 'Form_Bad',
    'Chase', 'Age', 'Fav_Grounds', 'Position', 'Luck',
]

X, y = df[X_cols], df['Runs']

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# 2. Function to Predict Next Match Score
def _encode_level(value, field):
    """One-hot encode a pitch/form label as [Excellent, Good, Medium, Bad]."""
    import warnings

    levels = ('Excellent', 'Good', 'Medium', 'Bad')
    # Accept 'excellent', ' GOOD ', etc.: normalise to the capitalised spelling.
    key = value.strip().capitalize() if isinstance(value, str) else None
    if key not in levels:
        # Keep the original best-effort fallback, but make it visible.
        warnings.warn(
            f"Unknown {field} value {value!r}; falling back to 'Medium'. "
            f"Expected one of {list(levels)}.",
            stacklevel=3,
        )
        key = 'Medium'
    return [int(key == level) for level in levels]


def predict_score(avg, economy, pitch, form, chase, age, fav_ground, position, luck,
                  estimator=None, feature_names=None):
    """Predict the runs for a single match from its descriptors.

    Parameters
    ----------
    avg, economy, chase, age, fav_ground, position, luck
        Passed through unchanged into the correspondingly named feature columns.
    pitch, form : str
        One of 'Excellent', 'Good', 'Medium', 'Bad'. Matching ignores letter
        case and surrounding whitespace. Any other value falls back to
        'Medium' and emits a ``UserWarning``.
    estimator : object, optional
        Fitted object exposing ``predict(DataFrame)``. Defaults to the
        notebook-level ``model``.
    feature_names : list of str, optional
        Column names for the single-row input frame, in the order
        avg, economy, 4 pitch flags, 4 form flags, chase, age, fav_ground,
        position, luck. Defaults to the notebook-level ``X_cols``.

    Returns
    -------
    int
        The prediction rounded to the nearest whole run, never below 0.
    """
    if estimator is None:
        estimator = model
    if feature_names is None:
        feature_names = X_cols

    pitch_flags = _encode_level(pitch, 'pitch')
    form_flags = _encode_level(form, 'form')

    # Single-row frame with named columns, laid out in training-column order.
    row = [avg, economy] + pitch_flags + form_flags + [chase, age, fav_ground, position, luck]
    match_data_df = pd.DataFrame([row], columns=feature_names)
    prediction = estimator.predict(match_data_df)[0]

    # A linear model can extrapolate below zero; a score cannot be negative.
    return max(0, round(float(prediction)))

# --- CHANGE THESE VALUES FOR THE NEXT MATCH ---
# `pitch` and `form` must use the capitalised labels exactly as listed in
# their options below.
score = predict_score(
    avg=30,             # Current career/recent average
    economy=7,          # Match economy context
    pitch='Excellent',  # Options: 'Excellent', 'Good', 'Medium', 'Bad'
    form='Excellent',   # Options: 'Excellent', 'Good', 'Medium', 'Bad'
    chase=1,            # 1 if chasing, 0 if batting first
    age=36,             # Current age
    fav_ground=1,       # 1 if ground is favorable, else 0
    position=1,         # Batting position (1 = Top Order)
    luck=1              # 1 if luck is on his side (toss/dropped catches), else 0
)

print(f"The predicted score for the next match is: {score} runs")

The predicted score for the next match is: 56 runs
