In [None]:
pip install pandas scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings

# --- MODIFIED TO USE ONLY ONE FILE ---
# 1. Load the deliveries dataset
ball_by_ball_file_path = '/content/deliveries.csv'

# Innings limits the feature engineering is built around (120 balls, 10 wickets).
BALLS_PER_INNINGS = 120
WICKETS_PER_INNINGS = 10

# Model inputs, in the column order used for both training and prediction.
FEATURE_COLUMNS = ['runs_left', 'balls_left', 'wickets_left', 'target', 'crr', 'rrr']


def label_match_outcomes(ball_df):
    """Return one row per match with the chase target and the derived winner.

    Args:
        ball_df: Ball-by-ball deliveries with at least the columns
            ``match_id``, ``inning``, ``total_runs``, ``batting_team`` and
            ``bowling_team``.

    Returns:
        DataFrame with ``match_id``, ``inning1_score``, ``inning2_score``,
        ``batting_team`` / ``bowling_team`` (of the second innings),
        ``target`` (first-innings total + 1) and ``winner``. Matches that
        lack either innings 1 or innings 2 are dropped by the inner merges.
    """
    innings_totals = (
        ball_df.groupby(['match_id', 'inning'])['total_runs'].sum().reset_index()
    )

    def innings_score(inning, column):
        rows = innings_totals[innings_totals['inning'] == inning]
        return rows[['match_id', 'total_runs']].rename(columns={'total_runs': column})

    first_innings = innings_score(1, 'inning1_score')
    second_innings = innings_score(2, 'inning2_score')

    # One row per match: de-duplicating on match_id guarantees the merges
    # below can never multiply rows, even if a team label varies in a match.
    chasing_sides = ball_df.loc[
        ball_df['inning'] == 2, ['match_id', 'batting_team', 'bowling_team']
    ].drop_duplicates(subset='match_id')

    outcomes = first_innings.merge(second_innings, on='match_id')
    outcomes = outcomes.merge(chasing_sides, on='match_id')
    outcomes['target'] = outcomes['inning1_score'] + 1

    # The chasing side wins only by reaching the target. Level scores are
    # therefore labelled as a failed chase, which keeps the label consistent
    # with `runs_left` (still 1 at the end of a tie).
    # NOTE(review): the real result of a tied match is not derivable from the
    # two innings totals used here -- confirm this labelling is acceptable.
    outcomes['winner'] = np.where(
        outcomes['inning2_score'] >= outcomes['target'],
        outcomes['batting_team'],
        outcomes['bowling_team'],
    )
    return outcomes


def build_chase_states(ball_df, match_outcomes):
    """Return every second-innings delivery annotated with the game state.

    Args:
        ball_df: Ball-by-ball deliveries; additionally needs ``over``,
            ``ball`` and ``player_dismissed``.
        match_outcomes: Output of :func:`label_match_outcomes`.

    Returns:
        Second-innings rows of ``ball_df`` plus ``target``, ``winner``,
        ``current_score``, ``runs_left``, ``balls_left``, ``is_wicket``,
        ``wickets_left``, ``crr``, ``rrr`` and ``result`` (1 if the batting
        team is the winner, else 0).
    """
    # merge() returns a fresh frame, so the column assignments below never
    # write into a view of `ball_df`.
    chase = ball_df[ball_df['inning'] == 2].merge(
        match_outcomes[['match_id', 'target', 'winner']], on='match_id'
    )

    # Running totals rely on row order within each match.
    # NOTE(review): assumes the rows are already in delivery order -- confirm.
    chase['current_score'] = chase.groupby('match_id')['total_runs'].cumsum()
    chase['runs_left'] = chase['target'] - chase['current_score']

    # NOTE(review): this formula treats `over` as 1-based -- confirm against
    # the dataset's numbering.
    balls_bowled = (chase['over'] - 1) * 6 + chase['ball']
    chase['balls_left'] = BALLS_PER_INNINGS - balls_bowled

    chase['is_wicket'] = chase['player_dismissed'].notna().astype('int')
    wickets_fallen = chase.groupby('match_id')['is_wicket'].cumsum()
    chase['wickets_left'] = WICKETS_PER_INNINGS - wickets_fallen

    # Run rates are expressed per over (6 balls).
    chase['crr'] = (chase['current_score'] * 6) / balls_bowled
    chase['rrr'] = (chase['runs_left'] * 6) / chase['balls_left']

    chase['result'] = (chase['batting_team'] == chase['winner']).astype(int)
    return chase


def select_model_rows(inning2_df):
    """Return the model columns with unusable rows removed.

    Drops rows containing NaN or +/-inf, and rows with no balls remaining
    (``balls_left <= 0``), so the training rows cover the same domain the
    interactive calculator accepts (``balls > 0``).
    """
    rows = inning2_df[FEATURE_COLUMNS + ['result']]
    # Reassign instead of dropna(inplace=True): `rows` is a slice of the
    # caller's frame and in-place mutation triggers SettingWithCopyWarning.
    rows = rows.replace([np.inf, -np.inf], np.nan).dropna()
    rows = rows[rows['balls_left'] > 0]
    return rows.reset_index(drop=True)


try:
    # Only the load is guarded, so a FileNotFoundError raised by any later
    # step is not misreported as a missing dataset.
    ball_df = pd.read_csv(ball_by_ball_file_path)
except FileNotFoundError:
    print(f"❌ Error: File not found at the path: '{ball_by_ball_file_path}'")
    print("Please make sure you have uploaded the 'deliveries.csv' file to your Colab session.")
else:
    print("✅ deliveries.csv loaded successfully!")

    # --- ALL SUBSEQUENT CODE RUNS ONLY IF THE FILE IS LOADED ---

    # 2. Data Preprocessing & Feature Engineering
    match_outcomes = label_match_outcomes(ball_df)
    inning2_df = build_chase_states(ball_df, match_outcomes)

    # 3. Final Data Cleaning
    final_df = select_model_rows(inning2_df)

    # Shuffle with a fixed seed so the seeded split below is reproducible.
    final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # 4. Model Training
    # Define features (X) and target (y)
    X = final_df.drop(columns=['result'])
    y = final_df['result']

    # NOTE(review): this is a per-delivery random split, so deliveries from
    # the same match can land in both train and test; the reported accuracy
    # may be optimistic. Consider a split grouped by match_id.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Logistic Regression model
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("\n## Model Evaluation")
    print(f"Test Set Accuracy: {accuracy:.4f} (This is a more realistic score)")

    # 5. Making Predictions - INTERACTIVE
    def predict_win_probability(runs_left, balls_left, wickets_left, target, crr, rrr):
        """Return the trained model's probability that the chasing team wins.

        The arguments are the six model features for a single game state;
        the value returned is the class-1 probability from ``model``.
        """
        input_data = pd.DataFrame(
            [[runs_left, balls_left, wickets_left, target, crr, rrr]],
            columns=FEATURE_COLUMNS,
        )
        return model.predict_proba(input_data)[0][1]

    # Example Prediction - Interactive
    print("\n## Live Win Probability Calculator")
    try:
        # Only the parsing is guarded, so a ValueError raised by the model is
        # not misreported as a typing mistake.
        target_score = int(input("Enter the target score set by the first team: "))
        runs = int(input("Enter runs needed to win: "))
        balls = int(input("Enter balls left: "))
        wickets = int(input("Enter wickets left (1-10): "))
    except ValueError:
        print("Invalid input. Please enter whole numbers only.")
    else:
        inputs_are_valid = (
            0 < balls <= BALLS_PER_INNINGS
            and 0 < wickets <= WICKETS_PER_INNINGS
            # More runs needed than the target would imply a negative score.
            and 0 <= runs <= target_score
        )
        if inputs_are_valid:
            # Calculate CRR and RRR based on the user's input
            balls_bowled = BALLS_PER_INNINGS - balls
            # Before the first ball there is no run rate yet; use 0.
            current_run_rate = ((target_score - runs) * 6) / balls_bowled if balls_bowled > 0 else 0
            required_run_rate = (runs * 6) / balls

            probability = predict_win_probability(runs, balls, wickets, target_score, current_run_rate, required_run_rate)

            print("\n--- Prediction ---")
            print(f"Scenario: Need {runs} runs in {balls} balls with {wickets} wickets left.")
            print(f"Chasing Team's Win Probability: **{probability*100:.2f}%**")
            print(f"Bowling Team's Win Probability: **{(1-probability)*100:.2f}%**")
        else:
            print("Invalid input. Please check the numbers you entered.")