In [3]:
import codebase.web_scrape_functions as wsf
import codebase.analysis_functions as af
import codebase.match_data as match_data
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import utils
from utils import logger
import os
import logging
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import codebase.graphing_functions as gf
from codebase.settings import CAREERS
from codebase.settings import LABEL_DATA

# Disable the project logger when utils.check_if_ipython() reports an IPython session.
if utils.check_if_ipython():
    logger.disabled = True

# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# NOTE(review): assumes the logger has at least two handlers; confirm which handler sits at index 1.
logger.handlers[1].setLevel(logging.INFO)
# logger.disabled = True
# Read-only: shows the current column display limit, it does not change it.
pd.get_option("display.max_columns")

20

In [4]:
# Player identifiers, kept as strings. KOHLI_ID is passed to the codebase helpers
# with is_object_id=True, so these are presumably "object ids" of the data source — TODO confirm.
KOHLI_ID = '253802'
ROOT_PLAYER_ID = '303669'
WILLIAMSON_PLAYER_ID = '277906'
SPD_SMITH_ID = '267192'

Let's do a deep dive into Kohli's drives and his innings in general. First, let's answer a few basic questions: the average length of Kohli's innings, his scores, minutes batted, etc.

In [6]:
kohli_matches = wsf.get_player_match_list(KOHLI_ID)


In [20]:
kohli_innings = af.get_cricket_totals(KOHLI_ID, kohli_matches, _type='bat', by_innings=True, is_object_id=True)
kohli_innings_df = pd.DataFrame(kohli_innings)

In [6]:
kohli_innings_df.head()

Unnamed: 0,inning,runs,balls_faced,fours,six,dot_balls,not_out,how_out,date,team,opposition,ground,continent,match_id
0,1,4,10,1,0,9,False,caught,2011-06-20,6,4,200,Americas,489226
1,3,15,54,2,0,43,False,caught,2011-06-20,6,4,200,Americas,489226
2,1,0,2,0,0,2,False,caught,2011-06-28,6,4,199,Americas,489227
3,3,27,107,1,1,87,False,caught,2011-06-28,6,4,199,Americas,489227
4,2,30,53,2,0,35,False,caught,2011-07-06,6,4,629,Americas,489228


In [7]:
kohli_innings_df.describe()

Unnamed: 0,inning,runs,balls_faced,fours,six,dot_balls,match_id
count,173.0,173.0,173.0,173.0,173.0,173.0,173.0
mean,2.213873,46.67052,84.046243,5.260116,0.138728,58.421965,948176.3
std,1.070445,53.477023,81.056527,6.143296,0.393778,53.339533,262036.2
min,1.0,0.0,1.0,0.0,0.0,1.0,489226.0
25%,1.0,10.0,21.0,1.0,0.0,15.0,667715.0
50%,2.0,28.0,60.0,3.0,0.0,44.0,1034813.0
75%,3.0,58.0,113.0,8.0,0.0,81.0,1157752.0
max,4.0,254.0,366.0,33.0,2.0,241.0,1320741.0


In [8]:
pd.cut(kohli_innings_df.balls_faced, [0,10,20,30,40,50,100,150,200, float("inf")]).value_counts()

(50.0, 100.0]     43
(0.0, 10.0]       24
(100.0, 150.0]    23
(20.0, 30.0]      20
(200.0, inf]      19
(10.0, 20.0]      18
(150.0, 200.0]    12
(30.0, 40.0]      10
(40.0, 50.0]       4
Name: balls_faced, dtype: int64

OK, now back to the cover drives. We will dig into how many cover drives he plays in each balls-faced bin, and his average from those cover drives. We want to see whether there is a trend in when he plays the shot and in its average. First, let's get all the commentary; then we can filter the innings based on the bins and match them to the commentary.

## Get all commentary from match and turn it to list

In [7]:
kohli_comms = af.get_player_contributions(KOHLI_ID, kohli_matches, _type = 'bat', by_innings=True, is_object_id=True)

In [10]:
kohli_comms[0].commentTextItems.tolist()

['Carefully gets forward to a tossed up ball and defends',
 'Turns away from a shortish length and virat goes back to push it towards point',
 'Another watchful front foot block from virat',
 "And it wasn't long coming, virat brings out the whiplash extra cover drive off a fuller one but cannot beat the infield",
 'On the front foot and pushed away towards silly mid off',
 'Virat gets a gift and accepts it to score his first runs in tests, the attempted yorker gone wrong, way down the leg side and virat just helps it away with a nudge',
 'There arrives the short ball, but a touch outside off stump and virat calmly weaves out of its path',
 'Gets behind a short of a length ball and blocks it back to the bowler',
 'This one jags back in but virat is not committing forward at all and stays in the crease to block it towards short leg',
 "That is the end of virat's debut innings, he is gone nibbling at one outside off and baugh has gobbled up the appetiser before lunch, was some distance fr

We have all the innings commentary. Now let's get just the commentTextItems, and then we can search those to see which deliveries the commentary describes as cover drives.

In [11]:
# One list of commentary strings per innings, in the same order as kohli_comms.
commentary = [innings_comms.commentTextItems.tolist() for innings_comms in kohli_comms]

## Searching for cover drives

In [12]:
# Run the keyword search once per innings; each result is indexed below as
# [1] -> matching commentary strings and [2] -> their positions in the innings.
cover_drives = [
    af.search_for_keywords(
        innings_commentary,
        ['drive', 'cover', 'defending', 'defence', 'defends'],
        exclude_words=['run out', 'pull', 'flick'],
        return_matching=True,
        return_indices=True,
    )
    for innings_commentary in commentary
]

In [13]:
cover_drives[3][2]

[4,
 14,
 17,
 20,
 21,
 22,
 26,
 29,
 37,
 39,
 41,
 43,
 44,
 47,
 48,
 49,
 50,
 55,
 57,
 58,
 66,
 70,
 74,
 75,
 77,
 79,
 80,
 84,
 85,
 86,
 87,
 88,
 89,
 95,
 96,
 106]

In [14]:
cover_drives[3][1]

["Whoa, what was that? edwards pitches one up, just seeing if kohli is ready for it, and he wasn't. 88 mph, and it was at kohli's feet by the time he knew it it was there. he wasn't forward in time, and threw his bat at a hard-handed drive. it popped up in the leg side and nearly carried to bishoo at midwicket,",
 'Kohli leans out and drives into the off side with the spin.',
 'Nervy moments for kohli. survives another bouncer without much control. bouncer heading for the ribs, he hops back and tries to ride the bounce but ends up limply providing the handle of the bat as his last line of defence. it could have gone anywhere, but managed to stay just away from barath at short leg.',
 "Good ball, 88 mph and full, seeing if kohli is ready to get forward. he wasn't quick enough to change tack, but went for the drive, and ended it onto the pad.",
 'Tossed up, kohli drives through mid-off for a single.',
 "Kohli wisely takes a single and moves to the bowler's end. short ball outside off, he

Now that we have the deliveries that Kohli played drives to, we can reference these back to the DataFrame and pick out all the rows where he played a cover drive. Then we can get the average of the cover drive, the dismissals, etc. Next, we will need to do this for other shots and start to build a picture of how Kohli gets out.

In [15]:
# Select the matched deliveries of every innings, then concatenate once.
# Calling pd.concat inside the loop re-copies all accumulated rows on each iteration.
cover_drive_frames = [
    # inning[2] is the list of row positions returned by the keyword search
    kohli_comms[i].iloc[inning[2]]
    for i, inning in enumerate(cover_drives)
]
# pd.concat raises on an empty list, so keep the original "empty DataFrame" result in that case.
cover_drive_df = pd.concat(cover_drive_frames) if cover_drive_frames else pd.DataFrame()

In [16]:
cover_drive_df[cover_drive_df['isWicket'] == True].count()

_uid                    38
id                      38
inningNumber            38
ballsActual              0
ballsUnique              0
oversUnique             38
oversActual             38
overNumber              38
ballNumber              38
totalRuns               38
batsmanRuns             38
isFour                  38
isSix                   38
isWicket                38
dismissalType           38
byes                    38
legbyes                 38
wides                   38
noballs                 38
timestamp               11
batsmanPlayerId         38
bowlerPlayerId          38
totalInningRuns         38
title                   38
dismissalText           38
commentPreTextItems     38
commentTextItems        38
commentPostTextItems    38
commentVideos           37
events                  38
over                     7
batsmanName             38
bowlerName              38
bowlerRuns              38
battingTeam             38
batsmanBallsFaced       37
commentImages            5
d

Only 38 dismissals with a cover drive in them; I would have thought there were way more. What are the other ways Kohli is getting out? Let's create a dictionary of the words that appear in Kohli's dismissals. First we need to get all the dismissals.

## Get all dismissals

In [17]:
# Flatten the per-innings commentary frames into one DataFrame with a single concat
# (concatenating inside a loop copies every accumulated row on each pass).
innings_frames = list(kohli_comms)
# pd.concat raises on an empty list, so fall back to an empty DataFrame as before.
kohli_comms_flat = pd.concat(innings_frames) if innings_frames else pd.DataFrame()

In [18]:
kohli_comms_flat[kohli_comms_flat.isWicket == True].count()

_uid                    165
id                      165
inningNumber            165
ballsActual               0
ballsUnique               0
oversUnique             165
oversActual             165
overNumber              165
ballNumber              165
totalRuns               165
batsmanRuns             165
isFour                  165
isSix                   165
isWicket                165
dismissalType           165
byes                    165
legbyes                 165
wides                   165
noballs                 165
timestamp                34
batsmanPlayerId         165
bowlerPlayerId          165
totalInningRuns         165
title                   165
dismissalText           165
commentPreTextItems     165
commentTextItems        165
commentPostTextItems    165
commentVideos           162
events                  165
over                     24
batsmanName             165
bowlerName              165
bowlerRuns              165
battingTeam             165
batsmanBallsFaced   

In [19]:
# Keep only wicket deliveries on which Kohli was the batsman. The player map built from the
# first match presumably translates KOHLI_ID (an object id) into the player id used in the
# commentary rows — TODO confirm.
kohli_dismissals = kohli_comms_flat[
    (kohli_comms_flat.batsmanPlayerId == int(af.get_player_map(match_data.MatchData(kohli_matches[0]), 'player_id', 'object_id')[int(KOHLI_ID)]))
    & (kohli_comms_flat.isWicket == True)
]

In [20]:
dismissals_list = kohli_dismissals.commentTextItems.to_list()

In [21]:
dismissals_list

["That is the end of virat's debut innings, he is gone nibbling at one outside off and baugh has gobbled up the appetiser before lunch, was some distance from virat, not much need to play at it, but for once virat pressed forward, and got so close to the ball that he had to play at it, and ended up getting a healthy edge through to the keeper, a happy bunch of west indians will take lunch,",
 "Soft dismissal! short ball, aimed into the ribs, but headed down the leg side, kohli flirts with it, and west indies' appeal for a catch down the leg side is spontaneous. harper agrees, and a disappointed kohli goes back. did he show dissent? were there any grounds for dissent? we know not yet... can't tell at all from these replays. as an aside, these are the incidents where drs becomes ineffective if there is no hot spot. makes no sense to have drs without proper technology",
 'Rampaul has taken off, his team-mates rush after him, virat has been bounced out, nothing shot, nothing at all actuall

In [22]:
cover_drive_dismissals = cover_drive_df[cover_drive_df['isWicket'] == True].commentTextItems.to_list()

In [23]:
cover_drive_dismissals

["Kohli's day just got worse, a loose drive away from his body, it was short of a length and outside off, perhaps should have left it alone, he decides to drive on the up instead, though edwards has been swinging the ball away, duly gets the outside edge and its easy for second slip",
 "They won't need the new ball for virat, he is gone, all the batsmen have fallen today to the pitched up ball which moves away slightly, kohli is forward in defence, but the movement has done him, and the outside edge is snapped up easily by haddin",
 "The end of kohli, he was looking so good this session, can't blame any demons in the pitch for that dismissal, a length ball that kohli drives uppishly, warner was waiting at point for that, some exuberant celebrations from siddle, kohli should be kicking himself to be dismissed in this manner after taking his time to settle in, and surviving a testing early phase",
 "Gone! panesar breaks through again. loose from kohli. hint of width outside off, his eyes

In [24]:
import nltk
from nltk.corpus import stopwords
# English stop words as a set, used below to filter the dismissal vocabulary.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be available locally — confirm it has been downloaded.
exclude_words = set(stopwords.words('english'))
exclude_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [25]:
import string

# Word-frequency table over all of Kohli's dismissal commentary, ignoring stop words.
_punctuation_table = str.maketrans('', '', string.punctuation)
# Punctuation is removed from the commentary before matching, so remove it from the stop
# words as well; otherwise contractions such as "wasn't" (which becomes "wasnt") never match.
_stripped_stop_words = {stop_word.translate(_punctuation_table) for stop_word in exclude_words}

vocabulary = {}

for dismissal in dismissals_list:
    # Lower-case first: the stop-word list is lower-case while each comment starts with a capital.
    cleaned = dismissal.translate(_punctuation_table).lower()
    # split() with no argument also drops the empty tokens that split(' ') yields for repeated spaces.
    for word in cleaned.split():
        if word in exclude_words or word in _stripped_stop_words:
            continue
        vocabulary[word] = vocabulary.get(word, 0) + 1


In [26]:
vocabulary['edge']

70

The 70 above is the number of times the word "edge" appears in the dismissal commentary. Let's look at all these dismissals and find the pattern of words that will allow us to catch outside edges. I think the best way to make sure that we have all the correct words is to get sets of dismissals from different phrases and then take the intersection of all these sets.

Let us now search all dismissals and find the ones that contain "edge".

In [27]:
edge_dismissals = af.search_for_keywords(dismissals_list, keywords=['outside edge', 'drive', 'outside off', 'reach', 'slip'], exclude_words = ['top edge', 'top-edge'], return_matching=True)

In [28]:
edge_dismissals[1]

["That is the end of virat's debut innings, he is gone nibbling at one outside off and baugh has gobbled up the appetiser before lunch, was some distance from virat, not much need to play at it, but for once virat pressed forward, and got so close to the ball that he had to play at it, and ended up getting a healthy edge through to the keeper, a happy bunch of west indians will take lunch,",
 'Rampaul has taken off, his team-mates rush after him, virat has been bounced out, nothing shot, nothing at all actually, just caught on the crease shuffling across, he was probably expecting the bouncer as he was crouching a tad, instead rampaul gets it to shoot off from back of a length and straighten as well, and before virat could do anything, the ball had taken the glove and lobbed high to second slip',
 "Kohli's day just got worse, a loose drive away from his body, it was short of a length and outside off, perhaps should have left it alone, he decides to drive on the up instead, though edwar

OK, so we manually went through and labelled a number of dismissals. Let us load these back in, and then we can properly see how many of Kohli's dismissals were actually because of driving, or cover driving in general.

In [29]:
# Load the manually labelled dismissals and keep the commentary of those labelled "yes".
# Each line appears to have the form "<commentary text>label:<yes|no>".
yes_dismissals = []
with open(os.path.join(LABEL_DATA, 'labelled_drive_dismisals.txt'), 'r') as file:
    # Iterate the file lazily instead of materialising every line with readlines().
    for line in file:
        # Split on the LAST 'label:' marker so commentary containing that text is not cut short.
        text, marker, label = line.rpartition('label:')
        if not marker:
            # Blank or unlabelled line: previously this raised IndexError on line[1].
            continue
        if label.strip().lower() == 'yes':
            yes_dismissals.append(text)
yes_count = len(yes_dismissals)

print("Kohli's dismissals that are cover drives:")
print(yes_count)

Kohli's dismissals that are cover drives:
54


Let us now look at the trend in these dismissals. I want to know the average score for all these dismissals, and I want to know how early in the innings each of these dismissals happened. For this we are going to need contributions, as well as knowing which match these drives happened in. From the match function we can get the index of every one of these dismissals, and then we can use that and our labelled ones and zeros to figure out which innings the dismissal was in. Or maybe a better way is to get the match id of each of the dismissals.

Objective: Get the match id of every one of the cover drive dismissals.

In [30]:
# yes_dismissals[0]

In [31]:
# drive_dismissal_indices = []
# j = 0
# for i,dismissal in enumerate(dismissals_list):
#     print(dismissal.strip('"').strip("'").strip().lower(), yes_dismissals[j].strip('"').strip("'").strip().lower())
#     if dismissal.strip('"').strip("'").strip().lower()[:30] == yes_dismissals[j].strip('"').strip("'").strip().lower()[:30]:
#         j += 1
#         drive_dismissal_indices.append(i)


In [32]:
KOHLI_ID_COMMS = int(af.get_player_map(match_data.MatchData(kohli_matches[0]), 'player_id', 'object_id')[int(KOHLI_ID)])


def _dismissal_key(text):
    """Normalise commentary for matching: strip quotes/whitespace, lower-case, keep the first 30 characters."""
    return text.strip('"').strip("'").strip().lower()[:30]


# Walk the innings in order and match the final delivery of each against the next
# still-unmatched labelled dismissal. This relies on yes_dismissals being in the same
# chronological order as kohli_comms.
drive_dismissal_indices = []
j = 0  # position of the next labelled dismissal still to be matched
for i, _match in enumerate(kohli_comms):
    if j >= len(yes_dismissals):
        # Every labelled dismissal has been matched (previously swallowed as IndexError).
        break
    if len(_match) == 0:
        # No commentary rows for this innings (previously swallowed as IndexError).
        continue
    dismissal = _match.iloc[-1].commentTextItems
    if _dismissal_key(dismissal) == _dismissal_key(yes_dismissals[j]):
        j += 1
        drive_dismissal_indices.append(i)

if j < len(yes_dismissals):
    # A single miss stalls the sequential matching, so make the failure visible.
    warnings.warn(
        f"Only {j} of {len(yes_dismissals)} labelled dismissals were matched to an innings"
    )



In [33]:
drive_dismissal_indices #index out of the total innings of innings where Kohli has got out playing a drive

[0,
 3,
 7,
 9,
 11,
 18,
 20,
 23,
 32,
 33,
 39,
 41,
 43,
 45,
 47,
 48,
 50,
 55,
 58,
 59,
 64,
 65,
 67,
 72,
 78,
 84,
 85,
 92,
 96,
 100,
 110,
 116,
 118,
 120,
 121,
 122,
 124,
 126,
 127,
 133,
 141,
 144,
 146,
 147,
 149,
 156,
 157,
 158,
 159,
 164,
 165,
 166,
 167,
 172]

In [34]:
match_list_by_inning = [inning['match_id'] for inning in kohli_innings]

In [35]:
len(match_list_by_inning)

173

In [36]:
cover_drive_dismissal_match_ids = [match_list_by_inning[i] for i in drive_dismissal_indices]

In [37]:
cover_drive_dismissal_match_ids #match id of matches where kohli is dismissed by the cover drive

[489226,
 489227,
 518950,
 518951,
 518952,
 565806,
 565807,
 565808,
 676527,
 648665,
 667653,
 667711,
 667713,
 667715,
 667717,
 667717,
 667719,
 754741,
 754743,
 870729,
 895777,
 895777,
 903603,
 1022593,
 1030215,
 1034811,
 1034811,
 1062573,
 1062575,
 1109604,
 1122278,
 1119551,
 1119552,
 1119553,
 1119553,
 1157752,
 1144993,
 1144994,
 1144994,
 1188629,
 1187685,
 1187686,
 1223869,
 1243384,
 1243385,
 1239544,
 1239544,
 1239545,
 1239545,
 1277079,
 1277079,
 1277081,
 1277081,
 1320741]

Now that we have the cover drive match ids, we can figure out whether there is a trend in how early these dismissals are happening. Does Kohli get out in different ways depending on how many balls he has faced? Does he get out to cover drives only early in his innings? And finally, how many cover drives does Kohli play in the innings where he gets out to a cover drive?

In [38]:
cover_drive_match_totals = [kohli_innings[i] for i in drive_dismissal_indices] #match totals of matches where kohli got out playing cover drive

In [39]:
cover_drive_match_totals_df = pd.DataFrame(cover_drive_match_totals)
cover_drive_match_totals_df.head()

Unnamed: 0,inning,runs,balls_faced,fours,six,dot_balls,not_out,how_out,date,team,opposition,ground,continent,match_id
0,1,4,10,1,0,9,False,caught,2011-06-20,6,4,200,Americas,489226
1,3,27,107,1,1,87,False,caught,2011-06-28,6,4,199,Americas,489227
2,2,11,21,1,0,14,False,caught,2011-12-26,6,2,61,Oceania,518950
3,1,23,41,3,0,32,False,caught,2012-01-03,6,2,132,Oceania,518951
4,1,44,82,6,0,61,False,caught,2012-01-13,6,2,213,Oceania,518952


In [40]:
cover_drive_match_totals_df.describe()

Unnamed: 0,inning,runs,balls_faced,fours,six,dot_balls,match_id
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,1.888889,41.018519,80.814815,4.796296,0.055556,58.37037,954065.1
std,0.984151,46.748823,75.606747,5.360159,0.231212,51.061671,272464.4
min,1.0,0.0,1.0,0.0,0.0,1.0,489226.0
25%,1.0,11.0,24.25,1.0,0.0,18.5,667717.0
50%,2.0,22.0,62.5,3.0,0.0,46.0,1048692.0
75%,3.0,48.25,106.0,6.0,0.0,75.5,1187686.0
max,4.0,200.0,283.0,24.0,1.0,192.0,1320741.0


That is very very interesting, Kohli's innings when he gets out from driving look much the same as his normal stats. Nothing drastically different.

NOTE: Side concern is that we want to only sum the rows that have an out in their out column. If the batsman is not out then we want to skip. So let us quickly modify the describe function, so that before we apply describe, we want to get rid of these rows, or we can add the run total to the row above and then we can describe again.

Ok now let us count how many cover drives Kohli plays in every one of these innings, and then we can also see the average of these cover drives and check if there are any trends wrt to balls faced and cover drives, or average run scored, cover drives played per innings, cover drives "in control" vs "out of control"

Lets start with getting all the cover drives played from the innings where Kohli is dismissed by cover drives.

In [41]:
cover_drive_inning_comms = [kohli_comms[i] for i in drive_dismissal_indices] #innings comms for matches where Kohli has got out to a cover drive

In [48]:
cover_drive_inning_comms[0].commentTextItems.to_list()

['Carefully gets forward to a tossed up ball and defends',
 'Turns away from a shortish length and virat goes back to push it towards point',
 'Another watchful front foot block from virat',
 "And it wasn't long coming, virat brings out the whiplash extra cover drive off a fuller one but cannot beat the infield",
 'On the front foot and pushed away towards silly mid off',
 'Virat gets a gift and accepts it to score his first runs in tests, the attempted yorker gone wrong, way down the leg side and virat just helps it away with a nudge',
 'There arrives the short ball, but a touch outside off stump and virat calmly weaves out of its path',
 'Gets behind a short of a length ball and blocks it back to the bowler',
 'This one jags back in but virat is not committing forward at all and stays in the crease to block it towards short leg',
 "That is the end of virat's debut innings, he is gone nibbling at one outside off and baugh has gobbled up the appetiser before lunch, was some distance fr

In [51]:
cover_drives_in_innings = []
for comms in cover_drive_inning_comms:
    innings = comms.commentTextItems.to_list()
    search = af.search_for_keywords(innings, ['drive', 'cover drive', 'full and wide', 'outside edge', 'reach', 'slip', 'edge'], exclude_words=['run out', 'pull', 'flicks', 'bouncer', 'short ball', 'stays back', 'backfoot', 'top edge', 'top-edge', 'lets one go', 'easy leave', 'leaves the ball'],return_matching=True, return_indices=True)
    matched_positions = search[2]  # row positions of the deliveries that matched the keywords
    drive_rows = comms.iloc[matched_positions]
    last_position = comms.shape[0] - 1
    # Every innings here was labelled as a drive dismissal, so make sure the final delivery
    # is included even when the keyword search did not pick it up. An innings without any
    # rows has no final delivery to add.
    if last_position >= 0 and last_position not in matched_positions:
        # DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat replaces it.
        # iloc[[-1]] (list indexer) keeps the last delivery as a one-row DataFrame.
        drive_rows = pd.concat([drive_rows, comms.iloc[[-1]]], ignore_index=True)
    cover_drives_in_innings.append(drive_rows)

In [None]:
# _cover_drives_in_innings = []
# comms = cover_drive_inning_comms[34]
# innings = comms.commentTextItems.to_list()
# search = af.search_for_keywords(innings, ['drive', 'cover drive', 'full and wide', 'outside edge', 'reach', 'slip', 'edge'], exclude_words=['run out', 'pull', 'flicks', 'bouncer', 'short ball', 'stays back', 'backfoot', 'top edge', 'top-edge', 'lets one go', 'easy leave', 'leaves the ball'],return_matching=True, return_indices=True)
# _cover_drives_in_innings.append(comms.iloc[search[2]])
# ## Everything here is identified as cover drive dismissals anyways, so we will make sure to add the dismissal if the dismissal is not there.
# if int(comms.shape[0]) not in search[2]:
#     print(search[2])
#     try:
#         _cover_drives_in_innings.append(comms.iloc[-1])
#     except TypeError:
#         _cover_drives_in_innings.append(comms.iloc[-1])

In [None]:
#_cover_drives_in_innings

In [None]:
#cover_drives_in_innings[14][cover_drives_in_innings[14].isWicket == True]

OK, now we can describe all these DataFrames, and as long as the DataFrames only contain cover drives, we can get the stats we need: average runs per cover drive, how many cover drives in the innings, the length of the innings on average and the runs scored in the innings. Later on we can try to figure out a way to see whether Kohli is in control. Again, it will be a matter of finding key words.

I think we need to make our own describe function to account for the not outs and cricket dismissals. Ok what we want is average per shot, strike rate, dismissals.

In [14]:
def analyse_batting(contributuion):
    """Summarise a set of ball-by-ball rows (a whole innings or a subset of its deliveries).

    Parameters
    ----------
    contributuion : pandas.DataFrame
        One row per delivery. The columns read are ``batsmanRuns``, ``isWicket``,
        ``isFour``, ``isSix``, ``dismissalType`` and ``batsmanBallsFaced``. It must
        contain at least one row, because the last row supplies ``how-out`` and
        ``total_balls_faced``.

    Returns
    -------
    dict
        ``runs``, ``dismissals``, ``balls`` (number of rows), ``sr`` (runs per row,
        not multiplied by 100), ``average`` (runs per dismissal, ``inf`` when there is
        no dismissal), ``dot_balls``, ``fours``, ``sixes``, ``how-out``,
        ``total_balls_faced`` and the per-ball rates for fours, sixes and dots.

    Raises
    ------
    IndexError
        If ``contributuion`` has no rows.
    """
    def safe_divide(numerator, denominator, _round=2):
        """Return numerator / denominator rounded to ``_round`` places, or inf for a zero denominator."""
        # NumPy scalars do not raise ZeroDivisionError (they emit a RuntimeWarning and give
        # inf or nan), so test the denominator explicitly instead of catching the exception.
        if denominator == 0:
            return float('inf')
        return round(numerator / denominator, _round)

    wicket_mask = contributuion.isWicket == True
    dot_mask = contributuion.batsmanRuns == 0.0
    four_mask = contributuion.isFour == True
    six_mask = contributuion.isSix == True

    runs = contributuion.batsmanRuns.sum()
    dismissals = wicket_mask.sum()
    # Every row counts as a ball here, whatever kind of delivery it was.
    balls = contributuion.shape[0]
    dot_balls = dot_mask.sum()
    fours = four_mask.sum()
    sixes = six_mask.sum()

    # The last row of the frame describes how (and after how many balls) the innings ended.
    final_delivery = contributuion.iloc[-1]
    how_out = af.how_out(final_delivery.dismissalType)
    total_balls_faced = final_delivery.batsmanBallsFaced

    #In control, out of control
    return {
        'runs': runs,
        'dismissals': dismissals,
        'balls':balls,
        'sr': safe_divide(runs, balls),
        'average': safe_divide(runs, dismissals),
        'dot_balls': dot_balls,
        'fours': fours,
        'sixes': sixes,
        'how-out': how_out,
        'total_balls_faced': total_balls_faced,
        'fours_per_ball': safe_divide(fours, balls),
        'sixes_per_ball': safe_divide(sixes, balls),
        'dots_per_ball': safe_divide(dot_balls, balls)
    }

In [54]:
cover_drives_in_innings[0]

Unnamed: 0,_uid,id,inningNumber,ballsActual,ballsUnique,oversUnique,oversActual,overNumber,ballNumber,totalRuns,...,commentTextItems,commentPostTextItems,commentVideos,events,over,batsmanName,bowlerName,bowlerRuns,battingTeam,batsmanBallsFaced
138,5242550,5242550,1,,,22.05,22.5,23,5,0,...,"And it wasn't long coming, virat brings out th...",,,[],,V Kohli,D Bishoo,0,6,4.0
154,5242627,5242627,1,,,25.03,25.3,26,3,0,...,"That is the end of virat's debut innings, he i...",India won the toss and west indies won everyth...,,[],,V Kohli,FH Edwards,0,6,10.0


In [52]:
print(analyse_batting(cover_drives_in_innings[0]))

{'runs': 0, 'dismissals': 1, 'balls': 2, 'sr': 0.0, 'average': 0.0, 'dot_balls': 2, 'fours': 0, 'sixes': 0, 'how-out': 'caught', 'total_balls_faced': 10.0}


In [78]:
# Batting summary of the drive deliveries in each innings that ended with a drive dismissal.
cover_drive_batting_anaysis = [analyse_batting(drive_rows) for drive_rows in cover_drives_in_innings]

In [62]:
sum([i['total_balls_faced'] for i in cover_drive_batting_anaysis])/len(cover_drive_batting_anaysis)

80.81481481481481

OK, so now we have the stats we need for the cover drive. Let's do this for other shots, and let's also compare this to the baseline. We also need averages for all of these values. Let's create a function that, given a list of these batting analysis objects, returns the average runs, average strike rate, average balls faced, total balls faced, and the percentages of shots resulting in fours, in sixes and in dot balls.

In [10]:
def aggregate_batting_analysis(batting_stats):
    """Aggregate a list of batting stat dicts into per-key averages and totals.

    Parameters
    ----------
    batting_stats : list of dict
        Dicts sharing the same keys (such as those returned by ``analyse_batting``).
        The keys of the first dict decide which keys are aggregated.

    Returns
    -------
    (averages, totals) : tuple of dict
        ``totals[key]`` is the sum of that key over all dicts and ``averages[key]`` is
        that sum divided by the number of dicts, rounded to 2 places. Keys whose values
        cannot be summed (e.g. the ``'how-out'`` strings) are left out of both dicts.
        Both dicts are empty when ``batting_stats`` is empty.
    """
    averages = {}
    totals = {}
    # An empty list previously failed with IndexError on batting_stats[0].
    if not batting_stats:
        return averages, totals

    innings_count = len(batting_stats)
    for key in batting_stats[0]:
        try:
            total = sum(stats[key] for stats in batting_stats)
            average = round(total / innings_count, 2)
        except TypeError:
            # Non-numeric values cannot be aggregated; skip the key in both results.
            continue
        totals[key] = total
        averages[key] = average

    return averages, totals

In [81]:
aves, tots = aggregate_batting_analysis(cover_drive_batting_anaysis)

In [82]:
aves

{'runs': 11.35,
 'dismissals': 1.0,
 'balls': 13.63,
 'sr': 0.66,
 'average': 11.35,
 'dot_balls': 8.57,
 'fours': 1.67,
 'sixes': 0.0,
 'total_balls_faced': 80.81,
 'fours_per_ball': 0.1,
 'sixes_per_ball': 0.0,
 'dots_per_ball': 0.72}

In [83]:
tots

{'runs': 613,
 'dismissals': 54,
 'balls': 736,
 'sr': 35.43,
 'average': 613.0,
 'dot_balls': 463,
 'fours': 90,
 'sixes': 0,
 'total_balls_faced': 4364.0,
 'fours_per_ball': 5.4,
 'sixes_per_ball': 0.0,
 'dots_per_ball': 38.71999999999999}

First, let's run this same cover drive batting analysis on all his innings. Let's find out whether the innings in which he does not get out to a cover drive look the same as the innings in which he does. The main thing that we are trying to find out here is simply whether the cover drive is a shot that affects his batting negatively or not.

In [15]:
# Sanity check of the batting analysis functions: run them over Kohli's whole career and
# compare the result with his known figures. analyse_batting takes one innings of
# contributions at a time, so apply it per innings and then aggregate.
kohli_career_batting_stats = [analyse_batting(innings_comms) for innings_comms in kohli_comms]

kohli_career_averages, kohli_career_totals = aggregate_batting_analysis(kohli_career_batting_stats)

  return round(numerator/denominator, 2)


In [16]:
kohli_career_averages

{'runs': 46.67,
 'dismissals': 0.95,
 'balls': 84.05,
 'sr': 0.47,
 'average': inf,
 'dot_balls': 58.91,
 'fours': 5.26,
 'sixes': 0.14,
 'total_balls_faced': nan,
 'fours_per_ball': 0.05,
 'sixes_per_ball': 0.0,
 'dots_per_ball': 0.75}

In [17]:
kohli_career_totals

{'runs': 8074,
 'dismissals': 165,
 'balls': 14541,
 'sr': 81.44000000000003,
 'average': inf,
 'dot_balls': 10191,
 'fours': 910,
 'sixes': 24,
 'total_balls_faced': nan,
 'fours_per_ball': 9.369999999999996,
 'sixes_per_ball': 0.20000000000000004,
 'dots_per_ball': 130.06}

In [18]:
#Ok the stats are almost correct, they count the runs accurately, and surely the balls as well, just need to find out why there are 42 more balls in virat's career.

In [19]:
kohli_career_batting_stats

[{'runs': 4,
  'dismissals': 1,
  'balls': 10,
  'sr': 0.4,
  'average': 4.0,
  'dot_balls': 9,
  'fours': 1,
  'sixes': 0,
  'how-out': 'caught',
  'total_balls_faced': 10.0,
  'fours_per_ball': 0.1,
  'sixes_per_ball': 0.0,
  'dots_per_ball': 0.9},
 {'runs': 15,
  'dismissals': 1,
  'balls': 54,
  'sr': 0.28,
  'average': 15.0,
  'dot_balls': 45,
  'fours': 2,
  'sixes': 0,
  'how-out': 'caught',
  'total_balls_faced': 54.0,
  'fours_per_ball': 0.04,
  'sixes_per_ball': 0.0,
  'dots_per_ball': 0.83},
 {'runs': 0,
  'dismissals': 1,
  'balls': 2,
  'sr': 0.0,
  'average': 0.0,
  'dot_balls': 2,
  'fours': 0,
  'sixes': 0,
  'how-out': 'caught',
  'total_balls_faced': 2.0,
  'fours_per_ball': 0.0,
  'sixes_per_ball': 0.0,
  'dots_per_ball': 1.0},
 {'runs': 27,
  'dismissals': 1,
  'balls': 107,
  'sr': 0.25,
  'average': 27.0,
  'dot_balls': 88,
  'fours': 1,
  'sixes': 1,
  'how-out': 'caught',
  'total_balls_faced': 107.0,
  'fours_per_ball': 0.01,
  'sixes_per_ball': 0.01,
  'dots_p

In [21]:
kohli_innings

[{'inning': 1,
  'runs': 4,
  'balls_faced': 10,
  'fours': 1,
  'six': 0,
  'dot_balls': 9,
  'not_out': False,
  'how_out': 'caught',
  'date': datetime.datetime(2011, 6, 20, 0, 0),
  'team': '6',
  'opposition': '4',
  'ground': '200',
  'continent': 'Americas',
  'match_id': 489226},
 {'inning': 3,
  'runs': 15,
  'balls_faced': 54,
  'fours': 2,
  'six': 0,
  'dot_balls': 43,
  'not_out': False,
  'how_out': 'caught',
  'date': datetime.datetime(2011, 6, 20, 0, 0),
  'team': '6',
  'opposition': '4',
  'ground': '200',
  'continent': 'Americas',
  'match_id': 489226},
 {'inning': 1,
  'runs': 0,
  'balls_faced': 2,
  'fours': 0,
  'six': 0,
  'dot_balls': 2,
  'not_out': False,
  'how_out': 'caught',
  'date': datetime.datetime(2011, 6, 28, 0, 0),
  'team': '6',
  'opposition': '4',
  'ground': '199',
  'continent': 'Americas',
  'match_id': 489227},
 {'inning': 3,
  'runs': 27,
  'balls_faced': 107,
  'fours': 1,
  'six': 1,
  'dot_balls': 87,
  'not_out': False,
  'how_out': 'ca

In [22]:
sum([i['balls_faced'] for i in kohli_innings])

14540

In [23]:
kohli_innings_from_sc = af.get_cricket_totals(KOHLI_ID, kohli_matches, _type='bat', by_innings=True, is_object_id=True, from_scorecards=True)

In [24]:
kohli_innings_from_sc

[{'runs': 4,
  'balls_faced': 10,
  'fours': 1,
  'six': 0,
  'dot_balls': 0,
  'not_out': False,
  'how_out': 'caught',
  'date': datetime.datetime(2011, 6, 20, 0, 0),
  'team': '6',
  'opposition': '4',
  'ground': '200',
  'continent': 'Americas',
  'match_id': 489226},
 {'runs': 15,
  'balls_faced': 54,
  'fours': 2,
  'six': 0,
  'dot_balls': 0,
  'not_out': False,
  'how_out': 'caught',
  'date': datetime.datetime(2011, 6, 20, 0, 0),
  'team': '6',
  'opposition': '4',
  'ground': '200',
  'continent': 'Americas',
  'match_id': 489226},
 {'runs': 0,
  'balls_faced': 2,
  'fours': 0,
  'six': 0,
  'dot_balls': 0,
  'not_out': False,
  'how_out': 'caught',
  'date': datetime.datetime(2011, 6, 28, 0, 0),
  'team': '6',
  'opposition': '4',
  'ground': '199',
  'continent': 'Americas',
  'match_id': 489227},
 {'runs': 27,
  'balls_faced': 107,
  'fours': 1,
  'six': 1,
  'dot_balls': 0,
  'not_out': False,
  'how_out': 'caught',
  'date': datetime.datetime(2011, 6, 28, 0, 0),
  'team

In [25]:
sum([i['balls_faced'] for i in kohli_innings_from_sc])

14499

In [26]:
# List every innings whose commentary-derived balls faced differs from the scorecard figure:
# match id, commentary count, scorecard count and innings position, one value per line.
for position, from_commentary in enumerate(kohli_innings):
    from_scorecard = kohli_innings_from_sc[position]
    if from_commentary['balls_faced'] == from_scorecard['balls_faced']:
        continue
    print(
        from_commentary['match_id'],
        from_commentary['balls_faced'],
        from_scorecard['balls_faced'],
        position,
        sep='\n',
    )

518952
82
81
11
518952
137
136
12
598812
208
206
25
676525
6
5
31
648665
185
181
33
648665
194
193
34
648667
88
87
35
667653
94
93
39
667653
137
135
40
667715
77
75
45
754737
176
175
52
754743
231
230
57
895773
192
191
60
895777
65
63
65
1022595
91
90
73
1030215
67
65
79
1034811
268
267
84
1034811
110
109
85
1034815
341
340
88
1109602
138
136
98
1122723
121
119
102
1122277
218
217
108
1122278
108
106
110
1122278
80
79
111
1119549
226
225
112
1119551
153
152
116
1144994
258
257
126
1187685
8
7
141
1243386
59
58
151
1249875
30
29
154


In [27]:
kohli_comms[31]

Unnamed: 0,_uid,id,inningNumber,ballsActual,ballsUnique,oversUnique,oversActual,overNumber,ballNumber,totalRuns,...,commentTextItems,commentPostTextItems,commentVideos,events,over,batsmanName,bowlerName,bowlerRuns,battingTeam,batsmanBallsFaced
634,8960107,8960107,2,,,27.05,27.5,28,5,1,...,That bouncer is far too high over kohli's head,,,[],,V Kohli,SS Cottrell,1,6,1.0
635,8960111,8960111,2,,,27.06,27.5,28,6,0,...,Slants down leg side and kohli lets it go,,,[],,V Kohli,SS Cottrell,0,6,2.0
636,8960113,8960113,2,,,27.07,27.6,28,7,2,...,"A full ball on the pas, kohli flicks neatly be...",,,[],"{'team': {'id': 6, 'objectId': 6, 'scribeId': ...",V Kohli,SS Cottrell,2,6,3.0
643,8960193,8960193,2,,,29.01,29.1,30,1,1,...,"A length ball angling into the pads, kohli lea...",,,[],,V Kohli,SS Cottrell,1,6,4.0
649,8960229,8960229,2,,,30.01,30.1,31,1,0,...,Kohli goes back and opens the face of the bat ...,,,[],,V Kohli,S Shillingford,0,6,5.0
650,8960273,8960273,2,,,30.02,30.2,31,2,0,...,India slide further! kohli has been caught bat...,,,[],,V Kohli,S Shillingford,0,6,6.0


In [None]:
#No balls and wides do not count as balls that the batsman faces, so these need to be removed from the total balls faced.