In [None]:
from dataprep import prepare_data

# Download mode intended for prepare_data: "FULL" or "INCREMENTAL".
DATA_DOWNLOAD_MODE = "FULL" # FULL or INCREMENTAL
# In INCREMENTAL mode the historical data is assumed to already exist; only the
# last 30 days of data are fetched and merged into it.


print("Start data prep")
# The download step is currently disabled; uncomment the next line to refresh the data.
# prepare_data(DATA_DOWNLOAD_MODE)

In [None]:
import pandas as pd
# Input locations are named here so they can be changed in one place.
# NOTE(review): the parquet location is an absolute, machine-specific path;
# point it at the local dataset before running this notebook elsewhere.
DELIVERY_PARQUET_DIR = '/home/abhay/work/pytasy/processed_output/delivery_parquet/'
PEOPLE_CSV_PATH = 'downloads/people.csv'

# full_df: the processed parquet dataset (presumably one row per delivery — confirm).
full_df = pd.read_parquet(DELIVERY_PARQUET_DIR)
# people_df: player register; its 'unique_name' column is used for name searches.
people_df = pd.read_csv(PEOPLE_CSV_PATH)
# Preview the first rows only instead of rendering the whole frame.
full_df.head()

In [None]:
# List the distinct match_type values present in the data.
full_df['match_type'].unique()

In [None]:
from country_lookup import venue_to_country, test_hosting_cities
import numpy as np

# Define the lookup function
def lookup_country(city, venue):
    """Resolve a country from a city name, falling back to the venue name.

    Args:
        city: City name, or a missing value (None or NaN).
        venue: Venue name; only consulted when the city is missing.

    Returns:
        The country found in ``test_hosting_cities`` (by city) or
        ``venue_to_country`` (by venue), or None when there is no entry.
    """
    # A missing city can arrive as None or as NaN depending on how the column
    # was loaded; both cases fall back to the venue table.
    if city is None or pd.isna(city):
        return venue_to_country.get(venue, None)
    return test_hosting_cities.get(city, None)  # Returns None if city not found


# otypes is pinned to object: without it np.vectorize infers the output dtype
# from the first row, so a leading string result would turn every later None
# into the text 'None', and an empty input would raise.
lookup_country_vec = np.vectorize(lookup_country, otypes=[object])
# Apply the vectorized lookup to every row's city / venue pair.
full_df['country'] = lookup_country_vec(full_df['city'], full_df['venue_name'])

# Calendar year taken from the 'dt' column, stored as a string.
full_df['year'] = pd.to_datetime(full_df["dt"]).dt.year.astype('str')

In [None]:

# Analysis parameters used by the cells below.
player_id = 'ba607b88'
match_type='test'
home_country='India'
# Search the people register by name (case-insensitive) to find a player's id.
# na=False treats rows with a missing name as non-matches; without it a NaN in
# the mask makes the boolean indexing fail.
people_df[people_df['unique_name'].str.contains('kohli', case=False, na=False)]


In [None]:
# Rows of the selected match type (compared case-insensitively).
match_type_stats = full_df[full_df['match_type'].str.lower() == match_type]
# Rows where the player is the batter. The mask is built from match_type_stats
# itself (not full_df) so it lines up with the frame being filtered, and the
# result is copied because a later cell adds a column to it.
batter_stats = match_type_stats[match_type_stats['batter_id'] == player_id].copy()
# Rows where the player is the one recorded in wicket_player_id.
batter_dismissals = match_type_stats[match_type_stats['wicket_player_id'] == player_id]

In [None]:
# Tag each of the player's rows with a "<match_id>#<innings_number>" key.
# assign() builds a new frame rather than writing into a filtered slice, so the
# cell can be re-run safely and raises no chained-assignment warning.
batter_stats = batter_stats.assign(
    innings_id=batter_stats['match_id'] + "#" + batter_stats['innings_number']
)
# Innings in which the player has at least one row as batter.
innings_ids = set(batter_stats['innings_id'])
batter_stats.head()

In [None]:
# Rows from the player's innings where somebody else is the batter.
team_stats = match_type_stats.loc[
    match_type_stats['batter_id'].ne(player_id)
    & (match_type_stats['match_id'] + "#" + match_type_stats['innings_number']).isin(innings_ids)
]
# Wicket rows from the same innings where the dismissed player is not the player.
team_dismissals = match_type_stats.loc[
    match_type_stats['wicket_player_id'].ne(player_id)
    & match_type_stats['is_wicket'].eq(1)
    & (match_type_stats['match_id'] + "#" + match_type_stats['innings_number']).isin(innings_ids)
]

In [None]:
def get_delivery_stats(player_id, match_type, full_df):
    """Split one match type's rows into player and rest-of-team frames.

    Args:
        player_id: Identifier matched against 'batter_id' and 'wicket_player_id'.
        match_type: Lower-case match type compared with ``match_type.str.lower()``.
        full_df: Frame with at least the columns 'match_type', 'batter_id',
            'wicket_player_id', 'is_wicket', 'match_id' and 'innings_number'.
            It is not modified.

    Returns:
        Tuple ``(batter_stats, batter_dismissals, team_stats, team_dismissals)``:
        - batter_stats: rows where the player is the batter, plus 'innings_id'.
        - batter_dismissals: rows where the player is the wicket_player_id.
        - team_stats: rows from the player's innings with a different batter,
          plus 'innings_id'.
        - team_dismissals: wicket rows (is_wicket == 1) from the player's
          innings where wicket_player_id is not the player.
    """
    format_deliveries = full_df[full_df['match_type'].str.lower() == match_type]

    # "<match_id>#<innings_number>" identifies an innings; computed once and reused.
    innings_key = format_deliveries['match_id'] + "#" + format_deliveries['innings_number']
    # assign() returns a new frame, so no column is ever written into a slice.
    keyed_deliveries = format_deliveries.assign(innings_id=innings_key)

    # Every mask is derived from the filtered frame itself so that it shares the
    # index of the frame it is applied to.
    faced_by_player = format_deliveries['batter_id'] == player_id
    batter_stats = keyed_deliveries[faced_by_player]
    batter_dismissals = format_deliveries[format_deliveries['wicket_player_id'] == player_id]

    innings_ids = set(batter_stats['innings_id'])
    in_player_innings = innings_key.isin(innings_ids)

    team_stats = keyed_deliveries[(format_deliveries['batter_id'] != player_id) & in_player_innings]
    team_dismissals = format_deliveries[
        (format_deliveries['wicket_player_id'] != player_id)
        & (format_deliveries['is_wicket'] == 1)
        & in_player_innings
    ]
    return (batter_stats, batter_dismissals, team_stats, team_dismissals)

def get_aggregate_stats(stats,dismissals, grouping_keys):
    """Aggregate batting rows and dismissal rows per group.

    Args:
        stats: Frame with 'batter_runs', 'venue_name', 'match_id',
            'innings_id' and the grouping columns.
        dismissals: Frame with 'batter_name' and the grouping columns.
        grouping_keys: List of column names to group by.

    Returns:
        One row per group found in ``stats`` with the grouping columns plus
        'runs', 'deliveries', 'matches', 'innings', 'dismissals' (NaN when the
        group has no dismissal rows), 'average' and 'strike_rate'.
    """
    batting_totals = stats.groupby(grouping_keys).agg(
        runs=pd.NamedAgg(column='batter_runs', aggfunc='sum'),
        # Counts non-null venue_name values per group.
        deliveries=pd.NamedAgg(column='venue_name', aggfunc='count'),
        matches=pd.NamedAgg(column='match_id', aggfunc='nunique'),
        innings=pd.NamedAgg(column='innings_id', aggfunc='nunique'),
    )
    dismissal_counts = dismissals.groupby(grouping_keys).agg(
        dismissals=pd.NamedAgg(column='batter_name', aggfunc='count'),
    )

    # Left merge: every group from `stats` is kept even without dismissals.
    summary = pd.merge(batting_totals, dismissal_counts, how='left', on=grouping_keys)
    summary = summary.reset_index()

    # A zero or missing denominator is treated as 1 to avoid dividing by zero.
    dismissal_divisor = summary['dismissals'].replace(0, 1).fillna(1)
    delivery_divisor = summary['deliveries'].replace(0, 1).fillna(1)
    summary['average'] = summary['runs'] / dismissal_divisor
    summary['strike_rate'] = 100 * summary['runs'] / delivery_divisor
    return summary

def get_impact_stats(batter_stats,batter_dismissals,team_stats, team_dismissals, grouping_keys, expected_batters=7.5):
    """Compare the player's aggregates with the rest of the team, per group.

    Args:
        batter_stats: The player's rows (input to get_aggregate_stats).
        batter_dismissals: The player's dismissal rows.
        team_stats: Rest-of-team rows.
        team_dismissals: Rest-of-team dismissal rows.
        grouping_keys: List of column names to group by.
        expected_batters: Divisor applied to the combined (player + team)
            total to get the per-batter expectation that the player's share is
            measured against. Defaults to 7.5, the value previously hard-coded.

    Returns:
        One row per group of the player's aggregates, with '_player' / '_team'
        suffixed columns plus 'global_key', 'deliveries_factor',
        'runs_factor' and 'strike_rate_factor'. Team columns are NaN for groups
        with no team rows (left merge).
    """
    batter_agg_stats = get_aggregate_stats(batter_stats, batter_dismissals, grouping_keys)
    team_agg_stats = get_aggregate_stats(team_stats, team_dismissals, grouping_keys)

    # reset_index() is kept as before; it adds an 'index' column holding the row number.
    impact_stats = pd.merge(
        batter_agg_stats,
        team_agg_stats,
        on=grouping_keys,
        how='left',
        suffixes=('_player', '_team'),
    ).reset_index()

    # Label for plotting: key values joined with '_'. Casting to str first keeps
    # the join from raising when a key column holds non-string values.
    impact_stats['global_key'] = impact_stats[grouping_keys].astype(str).agg('_'.join, axis=1)

    # factor = player's total / (combined total / expected_batters)
    combined_deliveries = impact_stats['deliveries_player'] + impact_stats['deliveries_team']
    combined_runs = impact_stats['runs_player'] + impact_stats['runs_team']
    impact_stats['deliveries_factor'] = impact_stats['deliveries_player'] / (combined_deliveries / expected_batters)
    impact_stats['runs_factor'] = impact_stats['runs_player'] / (combined_runs / expected_batters)
    impact_stats['strike_rate_factor'] = impact_stats['strike_rate_player'] / impact_stats['strike_rate_team']
    return impact_stats

grouping_keys = ["season", "country", "bowler_team"]

(batter_stats, batter_dismissals,team_stats, team_dismissals) = get_delivery_stats(player_id, match_type,full_df)

# One impact table per grouping; each variable name spells out its grouping keys.
season_loc_oppteam_impact_stats = get_impact_stats(batter_stats,batter_dismissals,team_stats, team_dismissals, grouping_keys)
season_oppteam_impact_stats = get_impact_stats(batter_stats,batter_dismissals,team_stats, team_dismissals, ["season", "bowler_team"])
season_loc_impact_stats = get_impact_stats(batter_stats,batter_dismissals,team_stats, team_dismissals, ["season", "country"])
# Previously grouped by "country" as well, which did not match the variable name
# or its season_oppteam counterpart.
year_oppteam_impact_stats = get_impact_stats(batter_stats,batter_dismissals,team_stats, team_dismissals, ["year", "bowler_team"])
year_loc_impact_stats = get_impact_stats(batter_stats,batter_dismissals,team_stats, team_dismissals, ["year", "country"])
year_impact_stats = get_impact_stats(batter_stats,batter_dismissals,team_stats, team_dismissals, ["year"])


In [None]:
import matplotlib.pyplot as plt
def plot_comparison_df(df,stat):
    """Draw a grouped bar chart of the player's vs. the team's value of ``stat``.

    Args:
        df: Impact frame with 'global_key', '<stat>_player' and '<stat>_team'.
        stat: Column prefix to compare, e.g. 'average'.
    """
    by_key = df.set_index('global_key')
    compared_columns = [stat+'_player', stat+'_team']

    plt.figure(figsize=(8,8))
    axes = plt.gca()

    # One bar group per global_key: player bar next to team bar.
    by_key[compared_columns].plot(kind='bar', ax=axes)

    axes.set_title(f'Compare team vs player performance for {stat}')
    axes.set_xlabel('Series')
    axes.set_ylabel(stat)
    axes.grid(True)
    axes.legend(title='Series')
    plt.tight_layout()  # keep the x tick labels from being clipped
    plt.show()

def plot_factor_df(df,stat, threshold=0.9):
    """Draw a bar chart of '<stat>_factor' with a horizontal reference line.

    Args:
        df: Impact frame with 'global_key' and '<stat>_factor' columns.
        stat: Factor prefix to plot, e.g. 'runs' or 'deliveries'.
        threshold: Height of the dashed reference line. Defaults to 0.9, the
            value previously hard-coded (labelled "90% efficiency").
    """
    df_plot = df.set_index('global_key')

    plt.figure(figsize=(8,8))
    ax = plt.gca()

    # One bar per global_key.
    df_plot[[stat+'_factor']].plot(kind='bar', ax=ax)
    # Dashed line marking the threshold the bars are judged against.
    ax.axhline(y=threshold, color='r', linestyle='--', linewidth=2, label=f'{threshold:.0%} efficiency')

    plt.title(f'{stat} ratio compared to top 7 batter expectation')
    plt.xlabel('Series')
    plt.ylabel(stat+'_factor')
    plt.grid(True)
    plt.legend(title='Series')
    plt.tight_layout()  # keep the x tick labels from being clipped
    plt.show()

In [None]:
# Subsets of the season/country table; each carries a display name in .attrs
# that generate_summary_text reads.
home_impact = season_loc_impact_stats.loc[season_loc_impact_stats['country'].eq(home_country)]
home_impact.attrs['name'] = 'Home'

tour_impact = season_loc_impact_stats.loc[season_loc_impact_stats['country'].ne(home_country)]
tour_impact.attrs['name'] = 'Away'

sena_impact = season_loc_impact_stats.loc[
    season_loc_impact_stats['country'].isin(['South Africa','England','New Zealand','Australia'])
]
sena_impact.attrs['name'] = 'SENA'

subcontinent_impact = season_loc_impact_stats.loc[
    season_loc_impact_stats['country'].isin(['India','Pakistan','Bangladesh','Sri Lanka'])
]
subcontinent_impact.attrs['name'] = 'Subcontinent'



In [None]:
# Player vs. team average for the home-country rows.
plot_comparison_df(home_impact, 'average') 

In [None]:
# Player vs. team average for rows outside the home country.
plot_comparison_df(tour_impact, 'average') 

In [None]:
# Player vs. team average in South Africa, England, New Zealand and Australia.
plot_comparison_df(sena_impact, 'average') 

In [None]:
# Player vs. team average in India, Pakistan, Bangladesh and Sri Lanka.
plot_comparison_df(subcontinent_impact, 'average') 

In [None]:
# Player vs. team average grouped by calendar year.
plot_comparison_df(year_impact_stats, 'average')

In [None]:
# Runs factor for the home-country rows.
plot_factor_df(home_impact, 'runs')

In [None]:
# Deliveries factor for the home-country rows.
plot_factor_df(home_impact, 'deliveries')

In [None]:
# Runs factor for rows outside the home country.
plot_factor_df(tour_impact, 'runs')

In [None]:
# Deliveries factor for rows outside the home country.
plot_factor_df(tour_impact, 'deliveries')

In [None]:
# Deliveries factor for the SENA rows.
plot_factor_df(sena_impact, 'deliveries')

In [None]:
# Runs factor for the SENA rows.
plot_factor_df(sena_impact, 'runs')

In [None]:
# Runs factor for the subcontinent rows.
plot_factor_df(subcontinent_impact, 'runs')

In [None]:
# Runs factor grouped by calendar year.
plot_factor_df(year_impact_stats, 'runs')

In [None]:
# Deliveries factor for the SENA rows.
# NOTE(review): this is the same call as an earlier cell and looks redundant.
plot_factor_df(sena_impact, 'deliveries')

In [None]:
# Deliveries factor for the subcontinent rows.
plot_factor_df(subcontinent_impact, 'deliveries')

In [None]:
# Deliveries factor grouped by calendar year.
plot_factor_df(year_impact_stats, 'deliveries')

In [None]:
def generate_summary_text(impact_df):
    """Build a plain-text summary of an impact table.

    The summary has two sections with the same six sentences: one over every
    row of ``impact_df`` and one over its last five rows.

    Args:
        impact_df: Frame with the columns 'average_player', 'average_team',
            'runs_factor', 'deliveries_factor', 'strike_rate_factor',
            'runs_player', 'runs_team', 'dismissals_player', 'dismissals_team',
            'deliveries_player' and 'deliveries_team'. ``impact_df.attrs['name']``
            is used as the conditions label ("Overall" when absent).

    Returns:
        The formatted multi-line summary string.
    """
    def _count(frame, mask):
        # Number of rows satisfying the condition.
        return frame[mask].shape[0]

    def _rate(frame, numerator, denominator, per=1):
        # Total of `numerator` per `per` units of `denominator`.
        # NOTE(review): fillna(1) counts a row with a missing denominator as 1
        # before summing; kept as in the original — confirm this is intended.
        return per * frame[numerator].sum() / frame[denominator].fillna(1).sum()

    def _section(frame):
        # The six sentences for one slice of the table.
        total = frame.shape[0]
        better_average = _count(frame, frame['average_player'] > frame['average_team'])
        enough_runs = _count(frame, frame['runs_factor'] > 0.9)
        enough_deliveries = _count(frame, frame['deliveries_factor'] > 0.9)
        enough_strike_rate = _count(frame, frame['strike_rate_factor'] > 0.9)
        player_average = _rate(frame, 'runs_player', 'dismissals_player')
        team_average = _rate(frame, 'runs_team', 'dismissals_team')
        player_strike_rate = _rate(frame, 'runs_player', 'deliveries_player', per=100)
        # The team strike rate is scaled by 100 like the player's; the overall
        # section used to omit that factor.
        team_strike_rate = _rate(frame, 'runs_team', 'deliveries_team', per=100)
        sentences = [
            f"He scored at an average better than rest of the team {better_average} out of {total} seasons.",
            f"He scored atleast 90% or more runs expected from an individual top 7 batter {enough_runs} out of {total} seasons.",
            f"He played atleast 90% or more deliveries expected from an individual top 7 batter {enough_deliveries} out of {total} seasons.",
            f"His strike rate was atleast 0.9X team strike rate {enough_strike_rate} out of {total} seasons.",
            f"His average was {player_average} compared to team {team_average}.",
            f"His strike_rate was {player_strike_rate} compared to team {team_strike_rate}.",
        ]
        return "\n".join(indent + sentence for sentence in sentences)

    indent = " " * 12
    recent_df = impact_df.tail(5)
    conditions = impact_df.attrs.get('name', "Overall")
    return (
        f"\n        In {conditions} conditions:\n"
        f"{_section(impact_df)}\n"
        "\n"
        f"{indent}And if you look at the last 5 series\n"
        "\n"
        f"{_section(recent_df)}\n"
        "    "
    )

In [None]:
# Print the text summary for each subset; the label comes from each frame's
# attrs['name'] ("Overall" for year_impact_stats, which sets none).
print(generate_summary_text(home_impact))
print(generate_summary_text(tour_impact))
print(generate_summary_text(sena_impact))
print(generate_summary_text(subcontinent_impact))
print(generate_summary_text(year_impact_stats))