In [568]:
import pandas as pd
import numpy as np
import json
import os
import glob
import sys
import time
import numexpr
import bottleneck

In [569]:
# data = json.load(open("/Users/aahaansingh/Desktop/webscraping/CricBase/ipl_json/335982.json"))
# data

In [570]:
def match_key(data) :
    season = data["info"]["season"]
    match_number = None
    if "match_number" in data["info"]["event"] :
        match_number = str(data["info"]["event"]["match_number"])
    elif "stage" in data["info"]["event"] :
        match_number = data["info"]["event"]["stage"]
    return season, match_number

In [571]:
def match_data() :
    match_data_array = []
    match_attrs = ["season", "match_number", "city", "start_date", "winner", "batting_first", "chasing", "eliminator"]
    for filename in glob.glob(os.path.join("ipl_json", '*.json')):
        data = json.load(open(filename))
        city = None
        start_date = None
        winner = None
        eliminator = None
        season, match_number = match_key(data)
        if "city" in data["info"] :
            city = data["info"]["city"]
        if "dates" in data["info"] :
            if len(data["info"]["dates"]) > 0 :
                start_date = data["info"]["dates"][0]
        if "winner" in data["info"]["outcome"] :
            winner = data["info"]["outcome"]["winner"]
        elif "result" in data["info"]["outcome"] :
            winner = data["info"]["outcome"]["result"]
        if "eliminator" in data["info"]["outcome"] :
            eliminator = data["info"]["outcome"]["eliminator"]

        match_data_list = [season, match_number, city,
                   start_date, winner, data["info"]["teams"][0],
                   data["info"]["teams"][1], eliminator]
        match_data_array.append(match_data_list)
    match_df = pd.DataFrame(match_data_array, columns=match_attrs)
    return match_df
match_df = match_data()
print(match_df)
        

       season match_number       city  start_date  \
0        2024           23     Mohali  2024-04-09   
1        2023           33    Kolkata  2023-04-23   
2        2009           37  Kimberley  2009-05-09   
3        2021           46    Sharjah  2021-10-02   
4        2015  Qualifier 1     Mumbai  2015-05-19   
...       ...          ...        ...         ...   
1090  2009/10           17     Mumbai  2010-03-22   
1091  2020/21           25       None  2020-10-10   
1092     2019           21     Jaipur  2019-04-07   
1093     2013           16    Chennai  2013-04-13   
1094     2018           36  Hyderabad  2018-05-05   

                           winner                batting_first  \
0             Sunrisers Hyderabad          Sunrisers Hyderabad   
1             Chennai Super Kings          Chennai Super Kings   
2             Chennai Super Kings          Chennai Super Kings   
3                  Delhi Capitals               Mumbai Indians   
4                  Mumbai Indians

In [572]:
def player_data() :
    player_df = pd.read_csv("people.csv")
    player_match_array = []
    player_match_attrs = ["name", "player_id", "season", "match_number", "team", "runs_scored", "fours", "sixes", 
                          "out", "balls_faced", "position", "wickets", "runs_conceded", "balls_delivered",
                          "fours_conceded", "sixes_conceded", "wides", "no_balls"]
    for filename in glob.glob(os.path.join("ipl_json", '*.json')):
        data = json.load(open(filename))
        season, match_number = match_key(data)
        for player in data["info"]["registry"]["people"] :
            team = None
            for team_name in data["info"]["players"] :
                if player in data["info"]["players"][team_name] :
                    team = team_name
            if not team == None : # is an official
                player_match = [player, data["info"]["registry"]["people"][player], season, match_number,
                            team, None, None, None, None, None, None, None, None, None, None, None, None, None]
                player_match_array.append(player_match)
    player_match_df = pd.DataFrame(player_match_array, columns=player_match_attrs)
    return player_df, player_match_df
player_df, player_match_df = player_data()
print(player_match_df)
        

                  name player_id season match_number                 team  \
0           AK Markram  6a26221c   2024           23  Sunrisers Hyderabad   
1          Abdul Samad  8e514b4c   2024           23  Sunrisers Hyderabad   
2      Abhishek Sharma  f29185a1   2024           23  Sunrisers Hyderabad   
3       Arshdeep Singh  244048f6   2024           23         Punjab Kings   
4      Ashutosh Sharma  84d9c311   2024           23         Punjab Kings   
...                ...       ...    ...          ...                  ...   
24362  Shakib Al Hasan  7dc35884   2018           36  Sunrisers Hyderabad   
24363         TA Boult  a818c1be   2018           36     Delhi Daredevils   
24364        V Shankar  0994d0ae   2018           36     Delhi Daredevils   
24365          WP Saha  fe11caa6   2018           36  Sunrisers Hyderabad   
24366        YK Pathan  3c6ffae8   2018           36  Sunrisers Hyderabad   

      runs_scored fours sixes   out balls_faced position wickets  \
0      

In [573]:
def delivery_data() :
    # leaving out the "legal number" field from the delivery array because it's too much work for now;
    # if I eventually figure out a simple way to figure it out then I will implement it
    delivery_array = []
    delivery_features = ["season", "match_number", "team_batting", "over", "number", "batter",
                         "bowler", "non_striker", "extras", "runs", "total_runs", "wickets", "match_id"]
    wicket_array = []
    wicket_features = ["season", "match_number", "team_batting", "over", "number", "player_out", "type"]
    extra_array = []
    extra_features = ["season", "match_number", "team_batting", "over", "number", "byes", "legbyes", "noballs", "penalty", "wides"]
    fielder_wicket_array = []
    fielder_wicket_features = ["season", "match_number", "team_batting", "over", "number", "id"] # not name, which isn't necessarily unique
    for filename in glob.glob(os.path.join("ipl_json", '*.json')):
        data = json.load(open(filename))
        season, match_number = match_key(data)
        registry = data["info"]["registry"]["people"]
        for innings_data in data["innings"] :
            team_batting = innings_data["team"]
            for over_data in innings_data["overs"] :
                over = over_data["over"]
                for number, delivery_data in enumerate(over_data["deliveries"]) :
                    batter_name = delivery_data["batter"]
                    batter_id = registry[batter_name]
                    bowler_name = delivery_data["bowler"]
                    bowler_id = registry[bowler_name]
                    non_striker_name = delivery_data["non_striker"]
                    non_striker_id = registry[non_striker_name]
                    runs = delivery_data["runs"]["batter"]
                    total_runs = delivery_data["runs"]["total"]
                    wickets = 0
                    extras = delivery_data["runs"]["extras"]

                    if "wickets" in delivery_data :
                        wickets = len(delivery_data["wickets"])
                        for wicket_data in delivery_data["wickets"] :
                            wicket_type = wicket_data["kind"]
                            player_out_name = wicket_data["player_out"]
                            player_out_id = registry[player_out_name]
                            wicket_list = [season, match_number, team_batting, over, number, player_out_id, wicket_type]
                            wicket_array.append(wicket_list)
                            if "fielders" in wicket_data:
                                for fielder_data in wicket_data["fielders"] :
                                    fielder_name = fielder_data["name"]
                                    fielder_id = registry[fielder_name]
                                    fielder_wicket_list = [season, match_number, team_batting, over, number, fielder_id]
                                    fielder_wicket_array.append(fielder_wicket_list)
                    
                    if "extras" in delivery_data :
                        extras_data = delivery_data["extras"]
                        byes, legbyes, noballs, penalty, wides = 0, 0, 0, 0, 0
                        if "byes" in extras_data :
                            byes = extras_data["byes"]
                        if "legbyes" in extras_data :
                            legbyes = extras_data["legbyes"]
                        if "noballs" in extras_data :
                            noballs = extras_data["noballs"]
                        if "penalty" in extras_data :
                            penalty = extras_data["penalty"]
                        if "wides" in extras_data :
                            wides = extras_data["wides"]
                        extra_list = [season, match_number, team_batting, over, number, byes, 
                                       legbyes, noballs, penalty, wides]
                        extra_array.append(extra_list)
                    # In this databases, overs + deliveries are zero indexed
                    delivery_list = [season, match_number, team_batting, over, number, batter_id, 
                                     bowler_id, non_striker_id, extras, runs, total_runs, wickets, str(season) + " " + str(match_number)]
                    delivery_array.append(delivery_list)
    delivery_df = pd.DataFrame(delivery_array, columns=delivery_features)
    wicket_df = pd.DataFrame(wicket_array, columns=wicket_features)
    extra_df = pd.DataFrame(extra_array, columns=extra_features)
    fielder_wicket_df = pd.DataFrame(fielder_wicket_array, columns=fielder_wicket_features)
    return delivery_df, wicket_df, extra_df, fielder_wicket_df

delivery_df, wicket_df, extra_df, fielder_wicket_df = delivery_data()
print(extra_df)

      season match_number         team_batting  over  number  byes  legbyes  \
0       2024           23  Sunrisers Hyderabad     3       5     0        0   
1       2024           23  Sunrisers Hyderabad     4       2     0        0   
2       2024           23  Sunrisers Hyderabad     6       4     0        1   
3       2024           23  Sunrisers Hyderabad    13       5     0        0   
4       2024           23  Sunrisers Hyderabad    17       2     0        0   
...      ...          ...                  ...   ...     ...   ...      ...   
14120   2018           36  Sunrisers Hyderabad     2       5     0        0   
14121   2018           36  Sunrisers Hyderabad     5       0     0        0   
14122   2018           36  Sunrisers Hyderabad    10       5     0        0   
14123   2018           36  Sunrisers Hyderabad    11       2     0        0   
14124   2018           36  Sunrisers Hyderabad    18       3     0        0   

       noballs  penalty  wides  
0            0    

In [574]:
%%time
start = time.time()
player_row = player_match_df.iloc[4] # B. Kumar is 5, A. Sharma is 4, J. Unadkat is 9
deliveries_faced_df = delivery_df.loc[(delivery_df["batter"] == player_row["player_id"])
                                       & (delivery_df["season"] == player_row["season"])
                                       & (delivery_df["match_number"] == player_row["match_number"])
                                       ]
deliveries_bowled_df = delivery_df.loc[(delivery_df["bowler"] == player_row["player_id"])
                                       & (delivery_df["season"] == player_row["season"])
                                       & (delivery_df["match_number"] == player_row["match_number"])
                                       ]
end = time.time()
print(end-start)
player_row["runs_scored"] = deliveries_faced_df["runs"].sum()
player_row["runs_conceded"] = deliveries_bowled_df["runs"].sum() # TODO add wides
extras_against = pd.merge(deliveries_faced_df, extra_df, on=["season", "match_number", "team_batting", "over", "number"])
wickets_against = pd.merge(deliveries_faced_df, wicket_df, on=["season", "match_number", "team_batting", "over", "number"])

player_row["out"] = False
for _, wickets_against_row in wickets_against.iterrows() :
    if wickets_against_row["batter"] == wickets_against_row["player_out"] :
        player_row["out"] = True

wides_faced = sum(extras_against["wides"] > 0)
player_row["balls_faced"] = deliveries_faced_df.shape[0] - wides_faced
player_row["fours"] = sum(deliveries_faced_df["runs"] == 4)
player_row["sixes"] = sum(deliveries_faced_df["runs"] == 6)

extras_bowled = pd.merge(deliveries_bowled_df, extra_df, on=["season", "match_number", "team_batting", "over", "number"])
wickets_bowled = pd.merge(deliveries_bowled_df, wicket_df, on=["season", "match_number", "team_batting", "over", "number"])

player_row["wickets"] = sum(wickets_bowled["type"] != "run out")
wides_bowled = extras_bowled["wides"].sum()
noballs_bowled = extras_bowled["noballs"].sum()
player_row["runs_conceded"] += wides_bowled + noballs_bowled
player_row["wides"] = wides_bowled
player_row["no_balls"] = noballs_bowled

player_row["balls_delivered"] = len(deliveries_bowled_df) - sum(extras_bowled["wides"] > 0) - sum(extras_bowled["noballs"] > 0)
player_row["fours_conceded"] = sum(deliveries_bowled_df["runs"] == 4)
player_row["sixes_conceded"] = sum(deliveries_bowled_df["runs"] == 6)

batting_innings_deliveries = delivery_df.loc[(delivery_df["season"] == player_row["season"])
                             & (delivery_df["match_number"] == player_row["match_number"])
                             & (delivery_df["team_batting"] == player_row["team"])
                             ]
# Need to incorporate nonstriker into batting order in case of golden duck
batting_order = pd.unique(pd.concat([batting_innings_deliveries["batter"], batting_innings_deliveries["non_striker"]]).sort_index())
if player_row["player_id"] in batting_order :
    player_row["position"] = np.where(batting_order == player_row["player_id"])[0] + 1
print(player_row)
print(batting_order)
batting_innings_deliveries

0.05611777305603027
name               Ashutosh Sharma
player_id                 84d9c311
season                        2024
match_number                    23
team                  Punjab Kings
runs_scored                     33
fours                            3
sixes                            2
out                          False
balls_faced                     15
position                         8
wickets                          0
runs_conceded                    0
balls_delivered                  0
fours_conceded                   0
sixes_conceded                   0
wides                            0
no_balls                         0
Name: 4, dtype: object
['0a476045' 'abb83e27' '9418198b' 'e94915e6' '26d041c4' '26989d80'
 '800d2d97' '84d9c311']
CPU times: user 81.4 ms, sys: 6.33 ms, total: 87.7 ms
Wall time: 93.2 ms


Unnamed: 0,season,match_number,team_batting,over,number,batter,bowler,non_striker,extras,runs,total_runs,wickets,match_id
125,2024,23,Punjab Kings,0,0,0a476045,2e81a32d,abb83e27,0,0,0,0,2024 23
126,2024,23,Punjab Kings,0,1,0a476045,2e81a32d,abb83e27,0,0,0,0,2024 23
127,2024,23,Punjab Kings,0,2,0a476045,2e81a32d,abb83e27,0,0,0,0,2024 23
128,2024,23,Punjab Kings,0,3,0a476045,2e81a32d,abb83e27,0,0,0,0,2024 23
129,2024,23,Punjab Kings,0,4,0a476045,2e81a32d,abb83e27,1,0,1,0,2024 23
...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,2024,23,Punjab Kings,19,4,84d9c311,1e66c162,26989d80,0,2,2,0,2024 23
247,2024,23,Punjab Kings,19,5,84d9c311,1e66c162,26989d80,0,2,2,0,2024 23
248,2024,23,Punjab Kings,19,6,84d9c311,1e66c162,26989d80,1,0,1,0,2024 23
249,2024,23,Punjab Kings,19,7,84d9c311,1e66c162,26989d80,0,1,1,0,2024 23


In [576]:
def player_stats() :
    for loop_ct, player_row in player_match_df.iterrows() :
        if loop_ct >= 200 :
            break
        deliveries_faced_df = delivery_df.loc[(delivery_df["batter"] == player_row["player_id"])
                                       & (delivery_df["season"] == player_row["season"])
                                       & (delivery_df["match_number"] == player_row["match_number"])
                                       ]
        deliveries_bowled_df = delivery_df.loc[(delivery_df["bowler"] == player_row["player_id"])
                                            & (delivery_df["season"] == player_row["season"])
                                            & (delivery_df["match_number"] == player_row["match_number"])
                                            ]

        player_row["runs_scored"] = deliveries_faced_df["runs"].sum()
        player_row["runs_conceded"] = deliveries_bowled_df["runs"].sum() # TODO add wides
        extras_against = pd.merge(deliveries_faced_df, extra_df, on=["season", "match_number", "team_batting", "over", "number"])
        wickets_against = pd.merge(deliveries_faced_df, wicket_df, on=["season", "match_number", "team_batting", "over", "number"])

        player_row["out"] = False
        for __, wickets_against_row in wickets_against.iterrows() :
            if wickets_against_row["batter"] == wickets_against_row["player_out"] :
                player_row["out"] = True

        wides_faced = sum(extras_against["wides"] > 0)
        player_row["balls_faced"] = deliveries_faced_df.shape[0] - wides_faced
        player_row["fours"] = sum(deliveries_faced_df["runs"] == 4)
        player_row["sixes"] = sum(deliveries_faced_df["runs"] == 6)

        # TODO figure out position
        extras_bowled = pd.merge(deliveries_bowled_df, extra_df, on=["season", "match_number", "team_batting", "over", "number"])
        wickets_bowled = pd.merge(deliveries_bowled_df, wicket_df, on=["season", "match_number", "team_batting", "over", "number"])

        player_row["wickets"] = sum(wickets_bowled["type"] != "run out")
        wides_bowled = extras_bowled["wides"].sum()
        noballs_bowled = extras_bowled["noballs"].sum()
        player_row["runs_conceded"] += wides_bowled + noballs_bowled
        player_row["wides"] = wides_bowled
        player_row["no_balls"] = noballs_bowled

        player_row["balls_delivered"] = len(deliveries_bowled_df) - sum(extras_bowled["wides"] > 0) - sum(extras_bowled["noballs"] > 0)
        player_row["fours_conceded"] = sum(deliveries_bowled_df["runs"] == 4)
        player_row["sixes_conceded"] = sum(deliveries_bowled_df["runs"] == 6)

        batting_innings_deliveries = delivery_df.loc[(delivery_df["season"] == player_row["season"])
                                    & (delivery_df["match_number"] == player_row["match_number"])
                                    & (delivery_df["team_batting"] == player_row["team"])
                                    ]
        # Need to incorporate nonstriker into batting order in case of golden duck
        batting_order = pd.unique(pd.concat([batting_innings_deliveries["batter"], batting_innings_deliveries["non_striker"]]).sort_index())
        if player_row["player_id"] in batting_order :
            player_row["position"] = np.where(batting_order == player_row["player_id"])[0] + 1
player_stats()
player_match_df

Unnamed: 0,name,player_id,season,match_number,team,runs_scored,fours,sixes,out,balls_faced,position,wickets,runs_conceded,balls_delivered,fours_conceded,sixes_conceded,wides,no_balls
0,AK Markram,6a26221c,2024,23,Sunrisers Hyderabad,,,,,,,,,,,,,
1,Abdul Samad,8e514b4c,2024,23,Sunrisers Hyderabad,,,,,,,,,,,,,
2,Abhishek Sharma,f29185a1,2024,23,Sunrisers Hyderabad,,,,,,,,,,,,,
3,Arshdeep Singh,244048f6,2024,23,Punjab Kings,,,,,,,,,,,,,
4,Ashutosh Sharma,84d9c311,2024,23,Punjab Kings,33,3,2,False,15,8,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24362,Shakib Al Hasan,7dc35884,2018,36,Sunrisers Hyderabad,,,,,,,,,,,,,
24363,TA Boult,a818c1be,2018,36,Delhi Daredevils,,,,,,,,,,,,,
24364,V Shankar,0994d0ae,2018,36,Delhi Daredevils,,,,,,,,,,,,,
24365,WP Saha,fe11caa6,2018,36,Sunrisers Hyderabad,,,,,,,,,,,,,
