First, we import the modules needed to conduct this analysis.

In [None]:
import dtale #We will use this to explore various charts from the dataframe in very few lines of code

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
from collections import defaultdict
import json

import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
from plotly_calplot import calplot

Each cricket match is stored as a JSON file containing ball-by-ball data, so I have written functions that compute various batting and bowling statistics and visualise the data.
The matches are grouped into directories by format, and each directory contains a README.txt file that lists the matches together with their dates, match ids and the competing teams.

In [None]:
def rangeofdates(start_date, end_date):
    """
    Helper generator that walks through every date from start_date to end_date, both ends included

    Args:
        start_date (date): First date of the range
        end_date (date): Last date of the range

    Yields:
        Each consecutive day of the range. Nothing is yielded when end_date is earlier than start_date
    """
    total_days = int((end_date - start_date).days) + 1
    day_offset = 0
    while day_offset < total_days:
        yield start_date + timedelta(day_offset)
        day_offset += 1

#Module-level constants shared by the functions defined below

main_playing_nations = [
    'Australia', 'England', 'South Africa', 'West Indies', 'New Zealand',
    'India', 'Pakistan', 'Sri Lanka', 'Bangladesh', 'Afghanistan',
]

#Hex colour used for each nation when plotting
colours = {
    'Australia': '#ffff00',
    'England': '#ff0066',
    'South Africa': '#669900',
    'West Indies': '#800000',
    'New Zealand': '#000000',
    'India': '#ff9900',
    'Pakistan': '#008000',
    'Sri Lanka': '#000099',
    'Bangladesh': '#ff0000',
    'Afghanistan': '#0099ff',
}

years = list(range(2014, 2024)) #The analysis covers the years 2014 to 2023 inclusive

#Locations of the data files
india_matches_path = 'india_mens_matches/README.txt'
india_players = 'indian_world_cup_players.txt'
t20_path = 'T20_matches/README.txt'
odi_path = 'ODI_matches/README.txt'

#(first day, last day) of every stretch of IPL play from 2014 to 2023.
#The 2021 season was split in two because of India's COVID wave, so it has two stretches;
#they are listed in the same order in which their dates are concatenated below.
_ipl_date_spans = {
    2014: [(date(2014, 4, 16), date(2014, 6, 1))],
    2015: [(date(2015, 4, 8), date(2015, 5, 24))],
    2016: [(date(2016, 4, 9), date(2016, 5, 29))],
    2017: [(date(2017, 4, 2), date(2017, 5, 21))],
    2018: [(date(2018, 4, 7), date(2018, 5, 27))],
    2019: [(date(2019, 3, 23), date(2019, 5, 12))],
    2020: [(date(2020, 9, 19), date(2020, 11, 10))],
    2021: [(date(2021, 9, 19), date(2021, 10, 15)), (date(2021, 4, 9), date(2021, 5, 2))],
    2022: [(date(2022, 3, 26), date(2022, 5, 29))],
    2023: [(date(2023, 3, 31), date(2023, 5, 30))],
}

#Every IPL day of each season formatted as 'YYYY-MM-DD'
ipl_tournaments = {
    season: [day.strftime("%Y-%m-%d")
             for first_day, last_day in spans
             for day in rangeofdates(first_day, last_day)]
    for season, spans in _ipl_date_spans.items()
}

def generate_yearly_schedule(year):
    """
    Draws a calendar plot of the Indian Cricket schedule for one year. The plot combines the international matches listed in the
    India matches README with the IPL days of that season.

    Args:
        year (int): Year to plot. It must be one of the keys of ipl_tournaments

    Returns:
        None (the calendar plot is displayed in the output)
    """

    #Numeric codes that drive the colour scale of the calendar
    limited_overs_code, ipl_code, other_code = 1, 2, 3

    def code_for(match_format):
        """
        Helper that turns a match format into its colour code

        Args:
           match_format (str): The match format

        Returns:
           1 for 'ODI' or 'T20', 2 for 'IPL' and 3 for any other format
        """
        if match_format == 'IPL':
            return ipl_code
        return limited_overs_code if match_format in ('ODI', 'T20') else other_code

    #International matches of Team India: only the date and the format are needed
    matches = pd.read_csv(india_matches_path, names=['Date', 'Format', 'ID', 'Teams']).drop(columns=['Teams', 'ID'])

    #One row per IPL day of the season
    ipl_df = pd.DataFrame([(ipl_day, 'IPL') for ipl_day in ipl_tournaments[year]], columns=['Date', 'Format'])

    #Keeping the international matches that fall inside the year
    first_day, last_day = f'{year}-01-01', f'{year}-12-31'
    inside_year = (matches['Date'] >= first_day) & (matches['Date'] <= last_day)
    matches_year = matches[inside_year]

    #Merging both sources into one chronological schedule
    year_schedule = pd.concat([matches_year, ipl_df]).sort_values(by='Date')

    #calplot needs a numeric column to colour the days, so each format is translated to its code
    year_schedule['Value'] = [code_for(match_format) for match_format in year_schedule['Format']]

    fig = calplot(
        year_schedule,
        x="Date",
        y="Value",
        dark_theme=True,
        colorscale=px.colors.diverging.Tropic,
        title=f"Indian Cricket Schedule in {year}",
        month_lines_color='green',
        month_lines_width=2,
        text="Format",
    )

    #Resizing the figure so that a single year fits on one row
    fig.update_layout(width=900, height=190)

    fig.show()

######################THE FOLLOWING FUNCTIONS FOCUS ON THE VENUE IN QUESTION WHERE THE MATCHES WERE PLAYED######################

def get_innings_scores(match):
    """
    Computes the total of each innings of a match by adding up the total runs of every delivery

    Args:
       match: The match we are interested in (the loaded JSON content of the match)

    Returns:
       List with the first innings and second innings scores, or [None, None] when the match does not have exactly two innings
    """
    all_innings = match['innings']

    #Only matches where both teams played their innings are scored
    if len(all_innings) != 2:
        return [None, None]

    #Every delivery of every over contributes its total runs to the innings score
    return [
        sum(delivery['runs']['total'] for over in inning['overs'] for delivery in over['deliveries'])
        for inning in all_innings
    ]


def get_venue_information(venue, format_path, date, match_id=None):
    """
    This function will generate data about the matches played at a specific venue in a specific format strictly before the date. It makes
    sense to view this information as it stood before the knockout game occurred. Only matches that have a winner and exactly two
    innings are taken into account.

    Args:
       venue (str): The venue of interest (compared case-insensitively)
       format_path (str): The path of the README file listing the matches of a specific format
       date (str): The date of the knockout game. It is compared as a string with the first date of each match
       match_id (str): Not used by this function. It is accepted (None by default) so that a (venue, format_path, date, match_id)
       tuple can be unpacked straight into the call

    Returns:
       first_innings_scores (list): List containing the first innings scores of matches
       second_innings_scores (list): List containing the second innings scores of matches
       An additional list containing the venue, the format (the text of format_path before its first underscore), the win percentages
       batting first and second and the toss to game win ratio. The three statistics are nan when no match qualified.

    """
    first_innings_scores = []
    second_innings_scores = []
    #The next three lists hold one Boolean per qualifying match
    win_batting_first = []
    win_batting_second = []
    tosses_game_win = []

    def average(flags, scale, digits):
        #Mean of the Booleans; nan for an empty list, without NumPy's empty-mean warning
        if not flags:
            return float('nan')
        return round(np.mean(flags) * scale, digits)

    match_directory = format_path.split('/')[0]
    venue_key = venue.lower()

    with open(format_path) as path_file:
        matches = path_file.readlines()

    for match in matches:
        if not match.strip():
            continue #A blank line carries no match id

        #The match id is the second comma-separated field of the README line
        with open('{format_path}/{id}.json'.format(format_path=match_directory,id=match.split(',')[1].strip())) as match_file:
            game = json.load(match_file)

        info = game['info']
        winner = info['outcome'].get('winner')

        #We ignore games that had no result and games where both innings were not played
        if winner is None or len(game['innings']) != 2:
            continue

        #We are interested in games at the venue BEFORE the date of the knockout game. A game without a venue never matches
        if (info.get('venue') or '').lower() != venue_key or info['dates'][0] >= date:
            continue

        #The scores are computed once and shared between both lists
        first_score, second_score = get_innings_scores(game)
        first_innings_scores.append(first_score)
        second_innings_scores.append(second_score)
        win_batting_first.append(winner == game['innings'][0]['team'])
        win_batting_second.append(winner == game['innings'][1]['team'])
        tosses_game_win.append(winner == info['toss'].get('winner'))

    #The two innings figures are multiplied by 100 to get percentages; the toss figure stays a ratio between 0 and 1
    summary = [venue, format_path.split('_')[0],
               average(win_batting_first, 100, 1),
               average(win_batting_second, 100, 1),
               average(tosses_game_win, 1, 2)]

    return first_innings_scores, second_innings_scores, summary

def create_venue_dataframe(matches):
    """
    This function will generate the dataframe about the matches played at a specific venue in a specific format before the date. It uses the
    data created from the 'get_venue_information' function. It will also show us India's toss result and innings order to make comparisons.

    Args:
       matches (list): The list of matches whose venues we want to get information for. Each element is a tuple whose last three
       items are the README path of the format, the date of the knockout game and the match id. The whole tuple is unpacked into
       'get_venue_information'

    Returns:
       match_df (DataFrame): The dataframe containing the information about the venues, indexed by Date, Venue and Format

    Raises:
       ValueError: If India did not bat in one of the matches
    """

    data = []
    for match in matches:
        format_path, knockout_date, match_id = match[-3], match[-2], match[-1]
        with open('{format_path}/{id}.json'.format(format_path=format_path.split('/')[0],id=match_id)) as match_file:
            game = json.load(match_file)

        #We augment the venue information with whether India won the toss and when they batted during the game
        india_toss_win = 'India' == game['info']['toss'].get('winner')
        batting_order = [inning['team'] for inning in game['innings']]
        india_innings_order = batting_order.index('India') + 1

        #The last element returned by get_venue_information holds the venue, the format and the win statistics
        venue_summary = get_venue_information(*match)[-1]
        data.append([knockout_date] + venue_summary + [india_toss_win, india_innings_order])

    #Creating the dataframe
    match_df = pd.DataFrame(data, columns=['Date', 'Venue', 'Format',
                                            '% Wins Batting First','% Wins Batting Second', 'Tosses to Game Win Ratio','India Toss Win','India Innings Order'])
    match_df = match_df.set_index(['Date', 'Venue', 'Format'])

    #A caption set through match_df.style only lives on the Styler object it returns, not on the dataframe, so it is
    #not set here. It can be applied where the table is displayed:
    #match_df.style.set_caption("Win Percentages at Knockout Venue before date of game based on Toss and Innings Order")

    return match_df

def create_venue_innings_scatter(venue, format_path, date, match_id):
    """
    This function will generate a scatter plot showing the combinations of innings scores at a venue before the knockout game and the points
    are color coded depending on whether the game was won by defending or chasing. The scores of the knockout game itself are added as
    a separate maroon point.

    Args:
       venue (str): The venue of interest 
       format_path (str): The path of the README file listing the matches of a specific format
       date (str): The date of the knockout game 
       match_id (str): A match has a JSON file assigned to it with an ID

    Returns:
       None (it will just display a scatterplot in the output)
    
    """

    #The venue history is gathered once; every call reads all the match files of the format
    first_innings_scores, second_innings_scores, _ = get_venue_information(venue, format_path, date)

    with open('{format_path}/{id}.json'.format(format_path=format_path.split('/')[0],id=match_id)) as match_file: 
        game = json.load(match_file)
    match_innings = get_innings_scores(game)

    #We will color code the points according to whether the game was won by defending or chasing 
    innings_win = ['Chased' if first_score < second_score else 'Defended'
                   for first_score, second_score in zip(first_innings_scores, second_innings_scores)]

    fig = px.scatter(x=first_innings_scores,y=second_innings_scores, color=innings_win)

    #This highlights the scores in the knockout game 
    fig.add_scatter(marker_color="maroon",x=[match_innings[0]],y=[match_innings[1]],name=' vs '.join(game['info']['teams']))
    fig.update_traces(marker_size=10, hovertemplate="<br>".join(["First Innings: %{x}", "Second Innings: %{y}"]))

    fig.update_layout(xaxis_title="First Innings", yaxis_title="Second Innings",
                      title="Scores at {venue} in {format}s before {date}".format(venue=venue,format=format_path.split('_')[0],date=date))

    fig.show()

######################THE FOLLOWING FUNCTIONS FOCUS ON THE PERFORMANCE OF INDIVIDUAL PLAYERS IN PARTICULAR MATCHES######################

#For a certain player and match, we store runs, strike rate, dot ball percentage and whether they were out in a list 
def player_innings(match, player): 
    """
    This function will generate a list describing the performance of a batter in a game. Only the first innings batted by India in the
    match is examined.

    Args:
       match: The match of interest (the loaded JSON content of the match)
       player (str): Player whose performance we want to view with the bat

    Returns:
       A list containing the runs, strike rate, dot ball percentage and boolean indicating whether batter was out.
       None is returned when India did not bat or the player did not face a single ball.
    
    """
    inning_team_order = [inning['team'] for inning in match['innings']]
    if 'India' not in inning_team_order:
        return None #There is no need to execute the rest of the function if India did not play

    #Retirements are recorded with the wickets but do not count as dismissals
    not_dismissals = ('retired hurt', 'retired not out')

    out = False
    runs = 0
    balls = 0
    dots = 0
    india_innings = match['innings'][inning_team_order.index('India')]
    for over in india_innings['overs']:
        for delivery in over['deliveries']:
            #A wide is not a ball faced by the batter, so it is left out of the strike rate and dot ball figures
            if delivery['batter'] == player and 'wides' not in delivery.get('extras', {}):
                runs += delivery['runs']['batter']
                balls += 1
                dots += delivery['runs']['total'] == 0

            #Every wicket of the delivery is checked because the player can be dismissed as the non-striker (run out)
            for wicket in delivery.get('wickets') or []:
                if wicket['player_out'] == player and wicket.get('kind') not in not_dismissals:
                    out = True

    if balls == 0:
        return None #This means they did not face a single ball
    return [runs, (runs / balls) * 100, (dots / balls) * 100, out]


def player_bowling_performance(match,player):
    """
    This function will generate a list describing the performance of a bowler in a game. Only the first innings batted by the
    opposition in the match is examined. The match passed in is not modified.

    Args:
       match: The match of interest (the loaded JSON content of the match)
       player (str): Player whose performance we want to view with the ball

    Returns:
       A list containing the wickets, economy rate and dot ball percentage.
       None is returned when the opposition did not bat or the player did not bowl a legal ball.

    Raises:
       ValueError: If India is not one of the teams of the match
    
    """ 
    teams = match['info']['teams']
    if 'India' not in teams:
        raise ValueError('India is not one of the teams of this match')

    #The opposition is looked up without removing India from the list, so the caller's match stays intact
    opposition_team = [team for team in teams if team != 'India'][0]

    inning_team_order = [inning['team'] for inning in match['innings']]
    if opposition_team not in inning_team_order:
        return None #There is no need to execute the rest of the function if India did not bowl

    #Dismissals credited to the bowler. Run Out does not belong to bowler
    bowler_wicket_kinds = ('caught', 'caught and bowled', 'lbw', 'bowled', 'hit wicket', 'stumped')
    #Extras that are not charged to the bowler
    uncharged_extras = ('byes', 'legbyes', 'penalty')

    runs = 0
    balls = 0
    dots = 0
    wickets = 0
    opposition_innings = match['innings'][inning_team_order.index(opposition_team)]
    for over in opposition_innings['overs']:
        for delivery in over['deliveries']:
            if delivery['bowler'] != player:
                continue
            extras = delivery.get('extras', {})
            total = delivery['runs']['total']

            runs += total - sum(extras.get(kind, 0) for kind in uncharged_extras)
            #Wides and no balls have to be bowled again, so they do not count towards the overs bowled
            if 'wides' not in extras and 'noballs' not in extras:
                balls += 1
            dots += total == 0
            wickets += sum(wicket['kind'] in bowler_wicket_kinds for wicket in delivery.get('wickets') or [])

    if balls == 0:
        return None #This means they did not bowl
    overs = balls/6
    return [wickets, runs/overs, (dots/balls)*100]

def yearly_wickets_and_average_economy(player, year, format,captain):
    """
    Summarises the bowling of a player over one year, restricted to one format and to the matches played under one captain.

    Args:
       player (str): Player whose yearly performance we want to view with the ball
       year (int): The year
       format (str): Type of game i.e ODI, T20, Test (compared case-insensitively)
       captain (str): The captain the player was playing under. It could be the captain himself to assess whether captaincy affected their
       individual performance

    Returns:
       A list containing the total wickets, the average economy (rounded to 0 decimals) and the average dot ball percentage
       (rounded to 1 decimal). The two averages are nan when the player has no bowling spell in the selected matches.
    
    """
    with open(india_matches_path) as path_file:
        matches = path_file.readlines()

    year_start = '{}-01-01'.format(year)
    year_end = '{}-12-31'.format(year)

    bowling_spells = []
    for match in matches:
        fields = match.split(',') #Each line holds the date, the format and the match id in its first three fields
        if fields[1].lower() != format.lower():
            continue
        if not (year_start <= fields[0] <= year_end):
            continue
        with open('india_mens_matches/{id}.json'.format(id=fields[2])) as match_file:
            game = json.load(match_file)
        if game['info'].get('captain') == captain:
            bowling_spells.append(player_bowling_performance(game, player))

    #Matches where the player did not bowl are stored as None and left out of the figures
    spells_bowled = [bowling_spell for bowling_spell in bowling_spells if bowling_spell is not None]

    total_wickets = np.sum([bowling_spell[0] for bowling_spell in spells_bowled])
    average_economy = np.mean([bowling_spell[1] for bowling_spell in spells_bowled])
    average_dot_percentage = np.mean([bowling_spell[2] for bowling_spell in spells_bowled])

    return [round(total_wickets,0), round(average_economy,0), round(average_dot_percentage,1)]


def yearly_average_and_strike_rate(player, year, format,captain):
    """
    Summarises the batting of a player over one year, restricted to one format and to the matches played under one captain.

    Args:
       player (str): Player whose yearly performance we want to view with the bat
       year (int): The year
       format (str): Type of game i.e ODI, T20, Test (compared case-insensitively)
       captain (str): The captain the player was playing under. It could be the captain himself to assess whether captaincy affected their
       individual performance

    Returns:
       A list containing the batting average (total runs / number of outs, with the number of outs taken as 1 when the player was
       never out), the average strike rate and the average dot ball percentage, all rounded to 2 decimals
    
    """
    with open(india_matches_path) as path_file:
        matches = path_file.readlines()

    year_start = '{}-01-01'.format(year)
    year_end = '{}-12-31'.format(year)

    innings = []
    for match in matches:
        fields = match.split(',') #Each line holds the date, the format and the match id in its first three fields
        if fields[1].lower() != format.lower():
            continue
        if not (year_start <= fields[0] <= year_end):
            continue
        with open('india_mens_matches/{id}.json'.format(id=fields[2])) as match_file:
            game = json.load(match_file)
        if game['info'].get('captain') == captain:
            innings.append(player_innings(game, player))

    #Matches where the player did not face a ball are stored as None and left out of the figures
    innings_batted = [inning for inning in innings if inning is not None]

    #The last item of an innings tells whether the player was out; a player who was never out is divided by 1
    outs = sum([inning[-1] for inning in innings_batted]) or 1

    batting_average = sum([inning[0] for inning in innings_batted])/outs
    average_strike_rate = np.mean([inning[1] for inning in innings_batted])
    average_dot_percentage = np.mean([inning[2] for inning in innings_batted])

    return [round(batting_average,2), round(average_strike_rate,2), round(average_dot_percentage,2)]

def number_of_matches_per_year(player, year, format, captain):
    """
    Counts the matches of a format that a player was part of during a year under a particular captain
    
    Args:
       player (str): Player of interest 
       year (int): The year
       format (str): Type of game i.e ODI, T20, Test (compared case-insensitively)
       captain (str): The captain the player was playing under

    Returns:
       count (int): Number of matches played 
    
    """
    with open(india_matches_path) as path_file:
        matches = path_file.readlines()

    year_start = '{}-01-01'.format(year)
    year_end = '{}-12-31'.format(year)

    count = 0
    for match in matches:
        fields = match.split(',') #Each line holds the date, the format and the match id in its first three fields
        if fields[1].lower() != format.lower():
            continue
        if not (year_start <= fields[0] <= year_end):
            continue
        with open('india_mens_matches/{id}.json'.format(id=fields[2])) as match_file:
            game = json.load(match_file)
        #The match counts when the player is listed in the Indian squad and the captain matches
        in_squad_under_captain = player in game['info']['players'].get('India') and game['info'].get('captain') == captain
        if in_squad_under_captain:
            count += 1
    return count

def get_win_percentage(player, year, format,captain):
    """
    This function will return the win percentages of a player in a year under a particular captain in a specific format. It can assist us 
    in finding a strong combination for the squad. The matches played and the matches won are counted in a single pass over the
    match files.
    
    Args:
       player (str): Player of interest 
       year (int): The year
       format (str): Type of game i.e ODI, T20, Test (compared case-insensitively)
       captain (str): The captain the player was playing under

    Returns:
        Float: The win percentage rounded to 2 decimals, or 0 when the player has no match in the selection
    
    """
    wins = 0
    total_matches = 0
    with open(india_matches_path) as path_file:
        matches = path_file.readlines()

    year_start = '{}-01-01'.format(year)
    year_end = '{}-12-31'.format(year)

    for match in matches:
        fields = match.split(',') #Each line holds the date, the format and the match id in its first three fields
        if fields[1].lower() != format.lower() or not (year_start <= fields[0] <= year_end):
            continue
        with open('india_mens_matches/{id}.json'.format(id=fields[2])) as match_file:
            game = json.load(match_file)

        #A match counts when the player is listed in the Indian squad and the captain matches
        if player in game['info']['players'].get('India', []) and game['info'].get('captain') == captain:
            total_matches += 1
            if game['info']['outcome'].get('winner') == 'India':
                wins += 1

    if total_matches == 0:
        return 0
    return round((wins/total_matches)*100,2)

def get_world_cup_player_info(years,format,captain):
    """
    This function will return the dataframes that give summaries of the players' batting and bowling performances respectively in 
    specific years under a particular captain. The players are read from the Indian players file, where the first field of a line is
    the player and the last field is the role. Batters only appear in the batting dataframe, all rounders appear in both dataframes
    and every other role only appears in the bowling dataframe.
    
    
    Args:
       years (list): The years we are interested in
       format (str): Type of game i.e ODI, T20, Test
       captain (str): The captain the player was playing under

    Returns:
        batting_df (DataFrame): This dataframe describes the batting performance of Indian players
        bowling_df (DataFrame): This dataframe describes the bowling performance of Indian players
    
    """
    batting = []
    bowling = []
    with open(india_players) as path_file:
        players = path_file.readlines()
    
    #For each player in question we need to add information to the appropriate dataframes according to their role (batter, all rounder, bowler)
    for player_row in players:
        player_fields = player_row.split(',')
        player = player_fields[0]
        role = player_fields[-1].strip()
        for year in years:
            yearly_matches = number_of_matches_per_year(player,year,format,captain)
            if yearly_matches == 0:
                continue #Nothing to report, so the win percentage is not computed for this year

            win_percentage = get_win_percentage(player,year,format,captain)
            summary = [player,year,format.upper(),captain,yearly_matches,win_percentage] #Columns shared by both dataframes

            if role in ('batter', 'all_rounder'):
                batting.append(summary+yearly_average_and_strike_rate(player,year,format,captain))
            if role != 'batter':
                bowling.append(summary+yearly_wickets_and_average_economy(player,year,format,captain))

    
    #Creating the batting dataframe
    batting_df = pd.DataFrame(batting, columns=['Player', 'Year', 'Format', 'Captain', 'Matches', 'Win %','Average',
                                           'Average Strike Rate',
                                           'Dot Ball Percentage'])
    batting_df = batting_df.set_index(['Player', 'Year', 'Format','Captain'])

    batting_df.dropna(axis=0, inplace=True) #Removing any nan values meaning that there is no data for the player in the year

    
    #Creating the bowling dataframe
    #NOTE(review): unlike the batting dataframe, 'Captain' stays a regular column here. This is kept as it was so that the
    #shape of the returned dataframe does not change - confirm whether it should join the index
    bowling_df = pd.DataFrame(bowling, columns=['Player', 'Year', 'Format','Captain','Matches','Win %', 'Wickets',
                                           'Average Economy Rate',
                                           'Dot Ball Percentage'])
    bowling_df = bowling_df.set_index(['Player', 'Year', 'Format'])

    bowling_df.dropna(axis=0, inplace=True)

    return batting_df, bowling_df

######################THE FOLLOWING FUNCTIONS FOCUS ON THE STORY OF INDIVIDUAL MATCHES THAT ARE OF INTEREST######################

def get_match_story(game):
    """
    The purpose is to create a dataframe per innings that describes what happened at each over in the innings including the wickets, who
    got dismissed, the batsmen who scored in each over including extras and who bowled the over.

    Args:
      game: The game that we are interested in (the loaded JSON content of the match)

    Returns:
      inning_team_order (list): Returns the teams ordered by when they played their innings. This will be used in the next function when 
      plotting the line chart.
      dfs (list): One dataframe per innings with over by over information. The columns are 'Over' (position of the over in the
      innings, starting at 0), 'Runs', 'Wickets' (stored as a negative count), 'Bowler', 'Players_Out' (list of names, or the text
      'None' when nobody was dismissed) and 'Scores_By_Players' (list of 'name:runs' texts)
    
    """
    
    dfs = []
    inning_team_order = [inning['team'] for inning in game['innings']]
    for inning in game['innings']: #Iterating through each innings 
        overs,runs,wickets,bowlers,players_out,over_breakdown = [],[],[],[],[],[]
        for over_position, over in enumerate(inning['overs']):
            #For each over we track the runs scored by each batter and the extras, starting from 0
            scores_by_players = defaultdict(int)
            over_score = 0
            players_out_in_over = []
            for delivery in over['deliveries']:
                batter_runs = delivery['runs']['batter']
                extra_runs = delivery['runs']['extras']
                if batter_runs > 0:
                    scores_by_players[delivery['batter']] += batter_runs
                    #A delivery can give runs to the batter and extras at the same time (a no ball that is hit, for instance)
                    if extra_runs > 0:
                        scores_by_players['extras'] += extra_runs
                else:
                    scores_by_players['extras'] += extra_runs
                over_score += delivery['runs']['total']
                #Every dismissal of the delivery is recorded
                players_out_in_over.extend(wicket['player_out'] for wicket in delivery.get('wickets') or [])

            overs.append(over_position)
            bowlers.append(over['deliveries'][0]['bowler']) #The over is credited to the bowler of its first delivery
            players_out.append(players_out_in_over if players_out_in_over else 'None')
            over_breakdown.append(['{}:{}\n'.format(name,scored) for name,scored in scores_by_players.items()])
            runs.append(over_score)
            wickets.append(-len(players_out_in_over))

        #Creating the over by over dataframe 
        df = pd.DataFrame({'Over': overs, 'Runs': runs,
                           'Wickets': wickets,
                           'Bowler': bowlers,
                           'Players_Out': players_out,
                           'Scores_By_Players': over_breakdown})
        dfs.append(df)
    return inning_team_order, dfs

def plot_match_story_graphs(match):
    """
    The purpose is to create figures that illustrate the story of the match specifically the innings of each team. The first innings
    is drawn in the top chart and the second innings in the bottom chart. The markers of the overs where a wicket fell take the colour
    of the bowling team.

    Args:
      match: The id of the game that we are interested in. Its JSON file is read from the 'india_mens_matches' directory

    Returns:
      None (instead an interactive figure is shown illustrating the inning stories of both teams)
    
    """
    
    with open('india_mens_matches/{id}.json'.format(id=match)) as match_json:
        india_match = json.load(match_json)

    inning_order, dfs = get_match_story(india_match)

    fig = make_subplots(rows=2,cols=1,shared_yaxes=True)
    
    team_1 = inning_order[0]
    team_2 = inning_order[1]

    #Teams that are not part of the colours dictionary are drawn in a neutral grey instead of stopping the plot
    fallback_colour = '#7f7f7f'
    team_colours = {team: colours.get(team, fallback_colour) for team in (team_1, team_2)}

    #When you hover over each over, you can see the bowler, the players dismissed and the scores made by each batter in the over including extras
    hovertemplate = '<b>Bowler</b><br>%{customdata[0]}<br>' + '<br>'+'<b>Players Out</b><br>%{customdata[1]}<br>' + '<br>' + '<b>Scores by Player</b><br>%{customdata[2]}<br>' 
    
    #The second innings is added first (bottom chart) and then the first innings (top chart)
    for row, batting_team, bowling_team in ((2, team_2, team_1), (1, team_1, team_2)):
        innings_df = dfs[row-1]

        #Wickets are stored as negative counts, so a negative value marks an over where a wicket fell
        innings_df['colors'] = [team_colours[bowling_team] if wicket_count < 0 else team_colours[batting_team]
                                for wicket_count in innings_df['Wickets']]

        fig.add_trace(go.Scatter(x=innings_df['Over']+1,
                                    y=innings_df['Runs'],
                                    name=batting_team,
                                    mode='markers+lines',
                                    marker_color=innings_df['colors'],
                                    line=dict(color=team_colours[batting_team]),
                                    customdata=np.transpose(np.array([innings_df['Bowler'],innings_df['Players_Out'],innings_df['Scores_By_Players']])),
                                    hovertemplate=hovertemplate),row=row,col=1)
                                    
    fig.update_traces(marker={'size': 10})

    fig.update_layout(height=600,xaxis={'title':'Over'},yaxis={'title':'Runs'},title_text='{team_1} vs {team_2} Innings'.format(team_1=team_1,team_2=team_2))

    fig.show()

Online articles have been highlighting India's recent heavy schedule, so we would like to assess the calendar to understand the team's workload.

In [None]:
#For each year of interest, we can view the Indian Cricket schedule as one calendar plot per year
#Every year listed here needs an entry in ipl_tournaments, which generate_yearly_schedule looks up
schedule_years = [2021,2022,2023]
for year in schedule_years:
    generate_yearly_schedule(year)

Next we can get information about the venues where India got knocked out and see if that had an effect on the outcome. We can look at the win percentages batting first and second, the effect of the toss and compare that to what happened to India on the day of the knockout match.

In [None]:
"""
These are the matches where India lost the game and thus the tournament
Each element in the list contains the venue, the format, the knockout game date and the match id corresponding to these games 
"""

#Tuple layout: (venue, README path of the match format, date of the knockout game, match id)
knockout_matches = [('Shere Bangla National Stadium',t20_path, '2014-04-06','682965'),
                    ('Sydney Cricket Ground',odi_path, '2015-03-26','656493'),
                     ('Wankhede Stadium',t20_path,'2016-03-31','951371'),
                     ('Kennington Oval',odi_path,'2017-06-18','1022375'),
                     ('Old Trafford',odi_path,'2019-07-09','1144528'),
                     ('Dubai International Cricket Stadium',t20_path,'2021-10-31','1273739'),
                     ('Adelaide Oval',t20_path,'2022-11-10','1298178'),
                     ('Narendra Modi Stadium, Ahmedabad',odi_path,'2023-11-19','1384439')]

#One row per knockout game with the venue statistics before the game, India's toss result and India's innings order
match_df = create_venue_dataframe(knockout_matches)

#Leaving the dataframe as the last expression of the cell displays it in the output
match_df

In [None]:
"""
Here we can take a look at the scatter plot of the innings scores at a venue where a knockout game was played using the 'knockout_matches' 
list from above.
"""
#Taking as an example the 4th element in the list (index 3), corresponding to the Champions Trophy 2017 final between India and Pakistan
#The tuple is unpacked into the venue, format_path, date and match_id arguments
create_venue_innings_scatter(*knockout_matches[3])

In [None]:
"""
Here we can take a look at the match story of a knockout game and understand periods that costed India the match
For example here we have taken the 4th match in the knockout_matches list. (The -1 means we are just extracting the match_id)
"""

#The last item of a knockout_matches tuple is the match id, which is what plot_match_story_graphs expects
plot_match_story_graphs(knockout_matches[3][-1])

We can use the following dataframes to view the statistics of the players under Rohit Sharma's captaincy utilising the dtale module. This will help us view potential players to make the squad for this year's T20 World Cup and generally assess who performs well under him.

In [None]:
#We can look at the performance of players under Rohit's captaincy in 2022 and 2024 (He did not captain any T20 matches in 2023)
#The call returns two dataframes: the batting summary first and the bowling summary second
recent_years = [2022,2024]
batting_df_rohit,bowling_df_rohit = get_world_cup_player_info(recent_years,'t20','RG Sharma') 

Other captains we can look at are 'MS Dhoni' and 'V Kohli', and another format we can view is 'ODI'.

In [None]:
#Using dtale to view the batting dataframe built above for Rohit's captaincy and explore visuals
dtale.show(batting_df_rohit)

In [None]:
#Using dtale to view the bowling dataframe built above for Rohit's captaincy and explore visuals
dtale.show(bowling_df_rohit)