# Batting Intent Analysis - IPL Cricket Data

This notebook analyzes batting intent patterns in IPL cricket using ball-by-ball delivery data.
It examines strike rates, boundary and dot-ball percentages, and scoring across the different phases of the game.

## Analysis Overview:
1. **Data Import and Setup** - Load IPL delivery data and required libraries
2. **Phase Classification** - Categorize overs into Powerplay, Middle, and Death phases
3. **Individual Batting Analysis** - Strike rates by player and phase
4. **Team Performance Analysis** - Team-wise batting intent across phases
5. **Ball Outcome Analysis** - Boundary vs dot ball percentages
6. **Progressive Analysis** - Over-wise run progression
7. **Runs vs Wickets** - Match dynamics analysis
8. **Batting Profiles** - Comprehensive player statistics
9. **Radar Chart Visualization** - Multi-metric player comparison

In [None]:
# Import necessary libraries for data analysis and visualization
import pandas as pd          # For data manipulation and analysis
import seaborn as sns        # For statistical data visualization
import matplotlib.pyplot as plt  # For creating plots and charts
import numpy as np           # For numerical operations
from math import pi, ceil    # For mathematical operations (radar charts)

# Read the deliveries CSV into a DataFrame.
# The file name suggests ball-by-ball data for a single IPL match
# (NOTE(review): confirm the schema against the actual file).
df = pd.read_csv('ipl_match_1473461_deliveries.csv')

# All later cells add columns to this working copy so `df` stays untouched
df_copy = df.copy(deep=True)

# Quick sanity check: size, column names, and the first few rows
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
df.head()

In [None]:
# Map an over number to the strategic phase of a T20 innings:
# - Powerplay: first six overs (fielding restrictions, aggressive batting)
# - Middle Overs: the next nine overs (consolidation, steady scoring)
# - Death Overs: the remaining overs (final push, maximum aggression)

def get_phase(over):
    """
    Return the match phase that an over number belongs to.

    Values below 6 map to 'Powerplay', values from 6 up to (but not
    including) 15 map to 'Middle Overs', and everything else maps to
    'Death Overs'.

    NOTE(review): with these thresholds the powerplay covers six overs only
    when the first over is numbered 0 -- confirm that the dataset's `over`
    column is zero-based.

    Args:
        over (int): Over number in the match

    Returns:
        str: Phase name ('Powerplay', 'Middle Overs', or 'Death Overs')
    """
    # (exclusive upper bound, label) pairs, checked in ascending order
    phase_limits = ((6, 'Powerplay'), (15, 'Middle Overs'))
    for upper_bound, label in phase_limits:
        if over < upper_bound:
            return label
    # Anything at or beyond the last bound falls through to the final phase
    return 'Death Overs'

# Tag every delivery with the phase its over belongs to
df_copy['phase'] = df_copy['over'].map(get_phase)

# Show how many deliveries fall into each phase
print("Distribution of deliveries across match phases:")
print(df_copy['phase'].value_counts())

In [None]:
# Analyze individual batting intent across different match phases
# This analysis shows how aggressively each batsman plays in different phases

# Chronological order of the phases; used so bars, legend and summary read
# Powerplay -> Middle Overs -> Death Overs instead of first-appearance order
phase_order = ['Powerplay', 'Middle Overs', 'Death Overs']

# Group data by batter and phase to calculate batting statistics
# NOTE(review): 'count' tallies every delivery row for the batter; if the data
# contains wides they are counted as balls faced -- confirm against the schema.
batting_intent = (
    df_copy.groupby(['batter', 'phase'])
    .agg(
        balls_faced=('runs_batter', 'count'),  # Deliveries recorded for the batter
        total_runs=('runs_batter', 'sum')      # Runs scored off the bat
    )
    .reset_index()
)

# Calculate strike rate (runs per 100 balls) - key metric for batting intent
batting_intent['strike_rate'] = (batting_intent['total_runs'] / batting_intent['balls_faced']) * 100

# Keep only batter/phase pairs with at least 5 balls, then sort.
# Done as one non-inplace chain so we never sort a filtered slice in place.
batting_intent = (
    batting_intent[batting_intent['balls_faced'] >= 5]
    .sort_values(by=['batter', 'phase'])
)

# Prepare data for visualization
batters = batting_intent['batter'].unique()
observed_phases = batting_intent['phase'].unique()
phases = (
    [p for p in phase_order if p in set(observed_phases)]
    + [p for p in observed_phases if p not in phase_order]  # keep any unexpected label
)

# (batter, phase) -> strike rate; one dictionary lookup per bar instead of
# re-filtering the DataFrame for every batter in every phase
strike_rate_lookup = (
    batting_intent.set_index(['batter', 'phase'])['strike_rate'].to_dict()
)

# Create grouped bar chart to compare strike rates across phases
bar_width = 0.25
x = np.arange(len(batters))

plt.figure(figsize=(12, 6))

# Create bars for each phase
for idx, phase in enumerate(phases):
    # Strike rate for each batter in this phase (0 if no qualifying data)
    strike_rates = [strike_rate_lookup.get((batter, phase), 0) for batter in batters]

    plt.bar(x + idx * bar_width, strike_rates, width=bar_width,
            label=phase, edgecolor='black')

# Customize the plot (ticks are centred under each batter's group of bars)
plt.xticks(x + bar_width * (len(phases) - 1) / 2, batters, rotation=45, ha='right')
plt.ylabel("Strike Rate")
plt.xlabel("Batter")
plt.title("Batting Intent: Strike Rate Across Phases of the Game", fontsize=16)
plt.legend(title="Game Phase")
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

# Display summary statistics, listed in chronological phase order
print("\nAverage Strike Rate by Phase:")
phase_summary = (
    batting_intent.groupby('phase')['strike_rate']
    .agg(['mean', 'std'])
    .round(2)
    .reindex(phases)
)
print(phase_summary)

In [None]:
# Analyze team-wise batting intent across different match phases
# This shows which teams are more aggressive in different phases of the game

# Chronological order of the phases; groupby sorts its keys alphabetically,
# which would otherwise put 'Death Overs' first on the x-axis
phase_order = ['Powerplay', 'Middle Overs', 'Death Overs']

# Create team identifier
# NOTE(review): assumes the 'team' column holds the batting side -- confirm.
df_copy['batting_team'] = df_copy['team']

# Group data by team and phase to calculate team batting statistics
team_phase_intent = (
    df_copy.groupby(['batting_team', 'phase'])
    .agg(
        balls_faced=('runs_batter', 'count'),  # Deliveries recorded for the team
        total_runs=('runs_batter', 'sum')      # Runs scored off the bat
    )
    .reset_index()
)

# Calculate team strike rate for each phase
team_phase_intent['strike_rate'] = (
    team_phase_intent['total_runs'] / team_phase_intent['balls_faced']
) * 100

# Prepare data for visualization
teams = team_phase_intent['batting_team'].unique()
observed_phases = team_phase_intent['phase'].unique()
phases = (
    [p for p in phase_order if p in set(observed_phases)]
    + [p for p in observed_phases if p not in phase_order]  # keep any unexpected label
)

# (team, phase) -> strike rate; avoids re-filtering the frame for every bar
team_rate_lookup = (
    team_phase_intent.set_index(['batting_team', 'phase'])['strike_rate'].to_dict()
)

# Create grouped bar chart for team comparison
bar_width = 0.15
x = np.arange(len(phases))
plt.figure(figsize=(12, 6))

# Create bars for each team
for idx, team in enumerate(teams):
    # Strike rate for each phase (0 if the team has no data for it)
    strike_rates = [team_rate_lookup.get((team, phase), 0) for phase in phases]
    plt.bar(x + idx * bar_width, strike_rates, width=bar_width,
            label=team, edgecolor='black')

# Customize the plot (ticks are centred under each phase's group of bars)
plt.xticks(x + bar_width * (len(teams) - 1) / 2, phases)
plt.xlabel("Match Phase")
plt.ylabel("Strike Rate")
plt.title("Team-wise Batting Intent Across Match Phases", fontsize=16)
plt.legend(title="Batting Team")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Display team performance summary with columns in chronological phase order
print("\nTeam Performance Summary:")
team_summary = (
    team_phase_intent.pivot(index='batting_team', columns='phase', values='strike_rate')
    .reindex(columns=phases)
    .round(2)
)
print(team_summary)

In [None]:
# Ball-outcome breakdown per batter.
# Every delivery is labelled Dot (0 runs off the bat), Boundary (4 or more)
# or Run (anything else). A high boundary share points to aggressive batting;
# a high dot share points to defensive or struggling batting.

# Label each delivery by the runs the batter scored from it
df_copy['ball_outcome'] = df_copy['runs_batter'].map(
    lambda runs: 'Boundary' if runs >= 4 else ('Run' if runs != 0 else 'Dot')
)

# Share of each outcome per batter, as percentages, one column per outcome
outcome_share = df_copy.groupby('batter')['ball_outcome'].value_counts(normalize=True)
batter_outcome_stats = (outcome_share.unstack().fillna(0) * 100).reset_index()

# Deliveries recorded per batter, used as the sample-size filter below
balls_faced = df_copy.groupby('batter').size().reset_index(name='balls_faced')

# Attach the ball counts, keep batters with at least 10 balls, and put the
# highest boundary percentage first
batter_outcome_stats = (
    batter_outcome_stats.merge(balls_faced, on='batter')
    .loc[lambda stats: stats['balls_faced'] >= 10]
    .sort_values(by='Boundary', ascending=False)
)

# Series that feed the chart
batters = batter_outcome_stats['batter']
boundary_perc = batter_outcome_stats['Boundary']
dot_perc = batter_outcome_stats['Dot']

# Side-by-side bars: boundary % to the left of each tick, dot % to the right
bar_width = 0.4
x = np.arange(len(batters))

plt.figure(figsize=(12, 6))
plt.bar(x - bar_width / 2, boundary_perc, width=bar_width,
        color='green', edgecolor='black', label='Boundary%')
plt.bar(x + bar_width / 2, dot_perc, width=bar_width,
        color='red', edgecolor='black', label='Dot ball%')

# Labels, legend and grid
plt.xticks(x, batters, rotation=45, ha='right')
plt.ylabel("Percentage (%)")
plt.xlabel("Batter")
plt.title("Boundary % vs Dot Ball % per Batter", fontsize=16)
plt.legend(title="Ball Outcome")
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

# Text summary: the frame is already ordered by boundary %, so its head is
# the most aggressive group; re-sort by dot % for the most defensive group
summary_columns = ['batter', 'Boundary', 'Dot', 'balls_faced']

print("\nTop 5 Most Aggressive Batters (Highest Boundary %):")
print(batter_outcome_stats[summary_columns].head().round(2))

print("\nTop 5 Most Defensive Batters (Highest Dot Ball %):")
defensive_batters = batter_outcome_stats.sort_values(by='Dot', ascending=False)
print(defensive_batters[summary_columns].head().round(2))

In [None]:
# Over-by-over scoring of the four highest run-scorers.
# Plotting runs per over for each of them shows whether a batter starts
# slowly and accelerates or scores at a steady rate.

# Total runs off the bat per batter, highest first; keep the first four names
runs_per_batter = df_copy.groupby('batter')['runs_batter'].sum()
top_batters = runs_per_batter.sort_values(ascending=False).index[:4].tolist()

print(f"Analyzing top 4 batters: {top_batters}")

# Restrict the deliveries to those faced by the selected batters
batters_progression = df_copy[df_copy['batter'].isin(top_batters)]

# Runs each selected batter scored in each over they batted in
batters_overwise = (
    batters_progression.groupby(['batter', 'over'])['runs_batter']
    .sum()
    .reset_index(name='runs_in_over')
)

# One line per batter, drawn in run-ranking order
plt.figure(figsize=(12, 6))

for batter in top_batters:
    is_this_batter = batters_overwise['batter'] == batter
    batter_data = batters_overwise[is_this_batter]
    plt.plot(batter_data['over'], batter_data['runs_in_over'],
             marker='o', linewidth=2, label=batter)

# Labels, one tick per over in the observed range, legend and grid
plt.xlabel("Over")
plt.ylabel("Runs Scored in Over")
plt.title("Over-wise Run Progression of Top 4 Batters", fontsize=16)
first_over = batters_overwise['over'].min()
last_over = batters_overwise['over'].max()
plt.xticks(range(first_over, last_over + 1))
plt.legend(title="Top Batters")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Per-batter summary: mean and best over, and the number of overs batted in
print("\nAverage runs per over for top batters:")
avg_runs_per_over = (
    batters_overwise.groupby('batter')['runs_in_over']
    .agg(Avg_Runs_Per_Over='mean', Max_Runs_In_Over='max', Overs_Played='count')
    .round(2)
)
print(avg_runs_per_over)

In [None]:
# Analyze the relationship between runs and wickets across overs
# This reveals match dynamics: when teams score most and when wickets fall
# High runs + low wickets = good batting conditions
# Low runs + high wickets = bowling dominated phase

# Extract wicket data (deliveries where 'player_out' is filled in)
wickets_df = df_copy[df_copy['player_out'].notna()]

# Count wickets per over
wickets_by_over = wickets_df.groupby('over').size().reset_index(name='wickets')

# Sum the runs scored off the bat in each over
# NOTE(review): only 'runs_batter' is summed, so extras are presumably not
# included in 'total_runs' -- confirm whether that is intended.
runs_by_over = df_copy.groupby('over')['runs_batter'].sum().reset_index(name='total_runs')

# Merge runs and wickets data; overs without a wicket get 0.
# Only the wickets column can be missing after the left merge, and it is
# kept as an integer count.
overwise_analysis = pd.merge(runs_by_over, wickets_by_over, on='over', how='left')
overwise_analysis['wickets'] = overwise_analysis['wickets'].fillna(0).astype(int)
overwise_analysis = overwise_analysis.sort_values('over')

# Create dual-axis plot (runs as bars, wickets as line)
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot runs as bars on primary y-axis
ax1.bar(overwise_analysis['over'], overwise_analysis['total_runs'],
        color='skyblue', edgecolor='black', label='Runs Scored')
ax1.set_xlabel('Over')
ax1.set_ylabel("Runs Scored", color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Create secondary y-axis for wickets
ax2 = ax1.twinx()
ax2.plot(overwise_analysis['over'], overwise_analysis['wickets'],
         color='red', marker='o', linewidth=2, label='Wickets')
ax2.set_ylabel("Wickets", color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Show one tick per over across the range actually present in the data.
# Hard-coding 1..20 would clip the first bar when overs are numbered from 0
# and would not fit shortened innings.
if not overwise_analysis.empty:
    first_over = int(overwise_analysis['over'].min())
    last_over = int(overwise_analysis['over'].max())
    ax1.set_xticks(range(first_over, last_over + 1))
    ax1.set_xlim(first_over - 0.5, last_over + 0.5)

# Customize the plot
plt.title("Over-wise Analysis: Runs vs Wickets", fontsize=16)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
plt.tight_layout()
plt.show()

# Calculate and display correlation between runs and wickets
correlation = overwise_analysis['total_runs'].corr(overwise_analysis['wickets'])
print(f"\nCorrelation between runs and wickets: {correlation:.3f}")

# Identify high-scoring and high-wicket overs
print("\nTop 5 highest-scoring overs:")
high_scoring = overwise_analysis.nlargest(5, 'total_runs')[['over', 'total_runs', 'wickets']]
print(high_scoring)

print("\nOvers with most wickets:")
high_wickets = overwise_analysis.nlargest(5, 'wickets')[['over', 'total_runs', 'wickets']]
print(high_wickets)

In [None]:
# Create comprehensive batting profiles for all players
# This combines multiple metrics to give a complete picture of each batter's style
# Metrics include: strike rate, average, boundary %, dot ball %, dismissals

# Calculate basic batting statistics for each player
# NOTE(review): dismissals counts every delivery faced by the batter on which
# 'player_out' is set; if that column can name the non-striker (e.g. a run-out)
# the wicket is still attributed to the striker here -- confirm.
batter_stats = (
    df_copy.groupby('batter')
    .agg(
        balls_faced=('runs_batter', 'count'),           # Deliveries recorded for the batter
        total_runs=('runs_batter', 'sum'),              # Runs scored off the bat
        dismissals=('player_out', lambda x: x.notna().sum())  # Deliveries with a wicket
    )
    .reset_index()
)

# Count ball outcomes per batter (uses the 'ball_outcome' column on df_copy).
# Reindexing guarantees all three outcome columns exist even if one outcome
# never occurs in the data.
outcome_counts = (
    df_copy.groupby(['batter', 'ball_outcome']).size().unstack().fillna(0)
    .reindex(columns=['Dot', 'Run', 'Boundary'], fill_value=0)
)

# Compute the per-batter total ONCE, before adding any derived column.
# Summing the frame again after 'dot_percent' has been added would fold that
# percentage into the denominator and understate 'boundary_percent'.
balls_per_batter = outcome_counts.sum(axis=1)
outcome_counts['dot_percent'] = (outcome_counts['Dot'] / balls_per_batter) * 100
outcome_counts['boundary_percent'] = (outcome_counts['Boundary'] / balls_per_batter) * 100
outcome_counts = outcome_counts[['dot_percent', 'boundary_percent']].reset_index()

# Merge all statistics into comprehensive batting profiles
batter_profiles = pd.merge(batter_stats, outcome_counts, on='batter')

# Calculate key performance metrics
batter_profiles['strike_rate'] = (batter_profiles['total_runs'] / batter_profiles['balls_faced']) * 100

# Calculate batting average (runs per dismissal); batters who were never
# dismissed get infinity, matching the usual "not out" convention used here
batter_profiles['average'] = (
    batter_profiles['total_runs'] / batter_profiles['dismissals']
).where(batter_profiles['dismissals'] > 0, float('inf'))

# Filter for players with meaningful sample size
batter_profiles = batter_profiles[batter_profiles['balls_faced'] >= 10]

# Display comprehensive statistics
print("Comprehensive Batting Profiles (Top 10 by Strike Rate):")
display_cols = ['batter', 'balls_faced', 'total_runs', 'strike_rate', 'average',
                'boundary_percent', 'dot_percent', 'dismissals']
top_profiles = batter_profiles.sort_values('strike_rate', ascending=False).head(10)
print(top_profiles[display_cols].round(2))

# Calculate summary statistics for the dataset
# (infinite averages propagate into the mean/std of the 'average' column)
print("\nDataset Summary Statistics:")
summary_stats = batter_profiles[['strike_rate', 'average', 'boundary_percent', 'dot_percent']].describe()
print(summary_stats.round(2))

In [None]:
# Create radar charts to visualize multi-dimensional batting profiles
# Radar charts compare players across multiple metrics simultaneously
# Each spoke represents a different batting metric, creating a unique 'fingerprint' for each player

# Select metrics for radar chart visualization
metrics = ['strike_rate', 'dot_percent', 'boundary_percent']

# Get top 4 batters by strike rate for detailed comparison
top_batters_radar = batter_profiles.sort_values(by='strike_rate', ascending=False).head(4).copy()

# Scale each metric so that the best value among ALL profiled batters is 100.
# This gives every spoke the same 0-100 range on the chart.
normalized_profiles = top_batters_radar[['batter'] + metrics].copy()
for metric in metrics:
    max_val = batter_profiles[metric].max()
    if max_val > 0:
        normalized_profiles[metric] = (normalized_profiles[metric] / max_val) * 100
    else:
        # A zero (or missing) maximum would produce NaN/inf; draw the spoke at 0
        normalized_profiles[metric] = 0.0

# Set up subplot configuration for multiple radar charts
num_batters = len(normalized_profiles)
rows = ceil(num_batters / 2)
cols = 2 if num_batters > 1 else 1

# Calculate angles for radar chart spokes (equally spaced around circle)
angles = [n / float(len(metrics)) * 2 * pi for n in range(len(metrics))]
angles += angles[:1]  # Repeat the first angle so the outline closes

if num_batters:
    # Create subplot grid with polar projections
    fig, axes = plt.subplots(rows, cols, subplot_kw=dict(polar=True), figsize=(12, 5 * rows))
    axes = np.array(axes).reshape(-1)  # Flatten axes array for easy iteration

    # Create individual radar chart for each top batter
    for ax, (_, row) in zip(axes, normalized_profiles.iterrows()):
        # Get normalized values for this batter
        values = row[metrics].tolist()
        values += values[:1]  # Repeat the first value so the outline closes

        # Configure the radar chart
        ax.set_theta_offset(pi / 2)      # Start from top
        ax.set_theta_direction(-1)       # Clockwise direction

        # Set spoke labels and positions
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(metrics)

        # Configure radial axis (concentric circles)
        ax.set_rlabel_position(30)
        ax.set_yticks([20, 40, 60, 80, 100])
        ax.set_yticklabels(["20", "40", "60", "80", "100"], color="grey", size=8)
        ax.set_ylim(0, 100)

        # Plot the radar chart for this batter
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=row['batter'])
        ax.fill(angles, values, alpha=0.25)  # Shade the enclosed area
        ax.set_title(row['batter'], size=13, y=1.1)

    # The grid always has rows * cols cells; hide any cell left without a
    # batter (e.g. three batters in a 2x2 grid) instead of showing empty axes
    for unused_ax in axes[num_batters:]:
        unused_ax.set_visible(False)

    # Add overall title and display
    plt.suptitle("Batter Profiles: Radar Chart of Batting Metrics", size=16, y=1.02)
    plt.tight_layout()
    plt.show()
else:
    # plt.subplots cannot build a grid with zero rows, so skip plotting
    print("No batters available for the radar chart.")

# Print explanation of radar chart interpretation
print("\nRadar Chart Interpretation:")
print("- Larger area = More aggressive/effective batting style")
print("- Strike Rate spoke: Higher = more runs per ball")
print("- Boundary % spoke: Higher = more aggressive shot selection")
print("- Dot % spoke: Lower is better (fewer balls without scoring)")

print("\nTop 4 Batters - Raw Values:")
print(top_batters_radar[['batter'] + metrics + ['balls_faced', 'total_runs']].round(2))