In [None]:
import codebase.web_scrape_functions as wsf
import codebase.analysis_functions as af
import codebase.match_data as match_data
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import utils
from utils import logger
import os
import logging
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import codebase.graphing_functions as gf
from codebase.settings import CAREERS
from codebase.settings import LABEL_DATA

# Notebook-wide configuration: logging, module auto-reload, and a pandas display check.
# When utils reports that we are running under IPython, switch the project logger off.
if utils.check_if_ipython():
    logger.disabled = True

# Load IPython's autoreload extension in mode 2 so edits to the project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): assumes the logger has at least two handlers; handlers[1] raises
# IndexError otherwise — confirm against the logger set-up in utils.
logger.handlers[1].setLevel(logging.INFO)
# logger.disabled = True
# get_option only READS the current column limit (shown as this cell's output);
# it does not change it. Use pd.set_option to actually widen the display.
pd.get_option("display.max_columns")

In [None]:
# Player identifiers, kept as strings. KOHLI_ID is passed to the analysis helpers
# below with is_object_id=True and is converted with int() where a numeric key is needed.
# NOTE(review): the other three ids are presumably the same kind of identifier,
# for Root, Williamson and Steve Smith — confirm before using them.
KOHLI_ID = '253802'
ROOT_PLAYER_ID = '303669'
WILLIAMSON_PLAYER_ID = '277906'
SPD_SMITH_ID = '267192'

Let's do a deep dive into Kohli's drives and his innings in general. First, let's answer a few basic questions: the average length of Kohli's innings, his scores, minutes batted, etc.

In [None]:
kohli_matches = wsf.get_player_match_list(KOHLI_ID)


In [None]:
kohli_innings = af.get_cricket_totals(KOHLI_ID, kohli_matches, _type='bat', by_innings=True, is_object_id=True)
kohli_innings_df = pd.DataFrame(kohli_innings)

In [None]:
kohli_innings_df.head()

In [None]:
kohli_innings_df.describe()

In [None]:
# Count innings per balls-faced bin.
# pd.cut builds right-closed intervals, so without include_lowest=True an innings
# with 0 balls faced falls outside (0, 10] and is dropped from the counts.
# sort_index() lists the bins in ascending order instead of by frequency, which
# makes the distribution easier to read.
balls_faced_bin_edges = [0, 10, 20, 30, 40, 50, 100, 150, 200, float("inf")]
pd.cut(kohli_innings_df.balls_faced, balls_faced_bin_edges, include_lowest=True).value_counts().sort_index()

OK, now back to the cover drives. We will deep dive into how many cover drives he plays in each balls-faced bin, and the average of those cover drives. We want to see if there is a trend in when he plays the shot and in the average. First let's get all the commentary; then we can filter the innings based on the bins and match them to the commentary.

## Get all commentary from match and turn it to list

In [None]:
kohli_comms = af.get_player_contributions(KOHLI_ID, kohli_matches, _type = 'bat', by_innings=True, is_object_id=True)

In [None]:
kohli_comms[0].commentTextItems.tolist()

We have all the innings commentary; now let's get just the commentTextItems, and then we can search those to see which deliveries the commentary describes as cover drives.

In [None]:
# One list of ball-by-ball commentary strings per innings, in the same order as kohli_comms.
commentary = [innings_comms.commentTextItems.tolist() for innings_comms in kohli_comms]

## Searching for cover drives

In [None]:
# Run the keyword search over every innings' commentary. Each result is kept
# as returned by af.search_for_keywords; later cells read element [1] and element [2]
# (the latter is used as positional row indices into the innings frame).
cover_drives = [
    af.search_for_keywords(
        innings_commentary,
        ['drive', 'cover', 'defending', 'defence', 'defends'],
        exclude_words=['run out', 'pull', 'flick'],
        return_matching=True,
        return_indices=True,
    )
    for innings_commentary in commentary
]

In [None]:
cover_drives[3][2]

In [None]:
cover_drives[3][1]

Now that we have the deliveries that Kohli played drives to, we can reference this back to the DataFrame and pick out all the rows where he played a cover drive. Then we can get his average from the cover drive, the dismissals, etc. Next we will need to do this for other shots and start to build a picture of how Kohli gets out.

In [None]:
# Gather, for every innings, the commentary rows whose positions were returned by the
# keyword search (element [2] of each search result), then stack them into one frame.
# Concatenating once avoids re-copying the growing frame on every iteration.
cover_drive_frames = [
    kohli_comms[innings_number].iloc[search_result[2]]
    for innings_number, search_result in enumerate(cover_drives)
]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
cover_drive_df = pd.concat(cover_drive_frames) if cover_drive_frames else pd.DataFrame()

In [None]:
cover_drive_df[cover_drive_df['isWicket'] == True].count()

Only 36 dismissals with a cover drive in them; I would have thought there were way more. What are the other ways Kohli is getting out? Let's create a dictionary of the words that appear in Kohli's dismissals. First we need to get all the dismissals.

## Get all dismissals

In [None]:
# Stack every innings' commentary frame into a single frame with one concat call,
# rather than re-copying the accumulated frame once per innings.
innings_frames = list(kohli_comms)
# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
kohli_comms_flat = pd.concat(innings_frames) if innings_frames else pd.DataFrame()

In [None]:
kohli_comms_flat[kohli_comms_flat.isWicket == True].count()

In [None]:
# Look up the id under which Kohli appears in the commentary rows.
# NOTE(review): presumably get_player_map(..., 'player_id', 'object_id') maps
# object_id -> player_id for the first match; confirm in analysis_functions.
player_map = af.get_player_map(match_data.MatchData(kohli_matches[0]), 'player_id', 'object_id')
kohli_comms_player_id = int(player_map[int(KOHLI_ID)])

# Keep only the wicket balls on which Kohli is recorded as the batsman.
is_wicket_ball = kohli_comms_flat.isWicket == True
is_kohli_batting = kohli_comms_flat.batsmanPlayerId == kohli_comms_player_id
kohli_dismissals = kohli_comms_flat[is_wicket_ball & is_kohli_batting]

In [None]:
dismissals_list = kohli_dismissals.commentTextItems.to_list()

In [None]:
dismissals_list

In [None]:
cover_drive_dismissals = cover_drive_df[cover_drive_df['isWicket'] == True].commentTextItems.to_list()

In [None]:
cover_drive_dismissals

In [None]:
# Build the set of English stopwords and display it as the cell output.
# NOTE(review): stopwords.words needs the NLTK 'stopwords' corpus to be available
# locally — confirm it has been downloaded in this environment.
import nltk
from nltk.corpus import stopwords
exclude_words = set(stopwords.words('english'))
exclude_words

In [None]:
import string
from collections import Counter

# Translation table that deletes every ASCII punctuation character.
strip_punctuation = str.maketrans('', '', string.punctuation)

# Word frequencies across all dismissal commentary, ignoring stopwords.
# - Tokens are lower-cased before filtering: the stopword set is lower-case, so
#   capitalised stopwords ("The", "He", ...) previously slipped through, and
#   "Edge"/"edge" were counted as different words.
# - split() without an argument splits on any whitespace run, so empty strings and
#   newline-bearing tokens are no longer counted as words.
# Counter is a dict subclass, so vocabulary['edge'] still works; a word that never
# occurs now yields 0 instead of raising KeyError.
vocabulary = Counter()
for dismissal in dismissals_list:
    words = dismissal.translate(strip_punctuation).lower().split()
    vocabulary.update(word for word in words if word not in exclude_words)


In [None]:
vocabulary['edge']

The 70 above is the number of times the word "edge" appears in the dismissals. Let's look at all of these dismissals and find the pattern of words that will allow us to catch outside edges. I think the best way to make sure we have all the correct words is to get sets of dismissals from different phrases and then take the intersection of all these sets.

Let us now search all dismissals and find the ones that contain "edge".

In [None]:
edge_dismissals = af.search_for_keywords(dismissals_list, keywords=['outside edge', 'drive', 'outside off', 'reach', 'slip'], exclude_words = ['top edge', 'top-edge'], return_matching=True)

In [None]:
edge_dismissals[1]

OK, so we manually went through and labelled a number of dismissals. Let us load these back in, and then we can properly see how many of Kohli's dismissals were actually because of driving, or cover driving in general.

In [None]:
# Load the hand-labelled dismissals. Each usable line has the form
# "<commentary text>label:<yes/no>"; keep the commentary of those labelled "yes".
yes_dismissals = []
with open(os.path.join(LABEL_DATA, 'labelled_drive_dismisals.txt'), 'r') as file:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in file:
        text, separator, label = line.rpartition('label:')
        if not separator:
            # Blank or unlabelled line: previously this raised IndexError on line[1].
            continue
        if label.strip().lower() == 'yes':
            yes_dismissals.append(text)

# The count is simply the number of collected dismissals.
yes_count = len(yes_dismissals)

print("Kohli's dismissals that are cover drives:")
print(yes_count)

Let us now look for trends in these dismissals. I want to know the average score for all these dismissals, and how early in the innings each of them happened. For this we are going to need the contributions, as well as knowing which match each drive dismissal happened in. From the match function we can get the index of every one of these dismissals, and then we can use that together with our labels (ones and zeros) to figure out which innings each dismissal was in. Or maybe a better way is to get the match id of each of the dismissals.

Objective: Get the match id of every one of the cover drive dismissals.

In [None]:
# yes_dismissals[0]

In [None]:
# drive_dismissal_indices = []
# j = 0
# for i,dismissal in enumerate(dismissals_list):
#     print(dismissal.strip('"').strip("'").strip().lower(), yes_dismissals[j].strip('"').strip("'").strip().lower())
#     if dismissal.strip('"').strip("'").strip().lower()[:30] == yes_dismissals[j].strip('"').strip("'").strip().lower()[:30]:
#         j += 1
#         drive_dismissal_indices.append(i)


In [None]:
# Id under which Kohli appears in the commentary rows (kept for the alternative,
# wicket-row based lookup that this cell originally experimented with).
KOHLI_ID_COMMS = int(af.get_player_map(match_data.MatchData(kohli_matches[0]), 'player_id', 'object_id')[int(KOHLI_ID)])


def _comment_prefix(text, length=30):
    """Return the first `length` characters of `text` after stripping quotes/whitespace and lower-casing."""
    return text.strip('"').strip("'").strip().lower()[:length]


# Walk the innings in order and match the LAST ball of each innings against the next
# not-yet-matched labelled dismissal; comparison is on the normalised 30-char prefix.
labelled_prefixes = [_comment_prefix(labelled) for labelled in yes_dismissals]
drive_dismissal_indices = []
j = 0  # position of the next labelled dismissal still to be matched
for i, _match in enumerate(kohli_comms):
    if j >= len(labelled_prefixes):
        # Every labelled dismissal has been found; previously this state was
        # reached via a swallowed IndexError on yes_dismissals[j].
        break
    if len(_match) == 0:
        # Innings without any commentary rows has no last ball to compare
        # (previously a swallowed IndexError from iloc[-1]).
        continue
    if _comment_prefix(_match.iloc[-1].commentTextItems) == labelled_prefixes[j]:
        drive_dismissal_indices.append(i)
        j += 1

if j < len(labelled_prefixes):
    # Matching is strictly sequential, so one missed label hides all later ones.
    # Surface that instead of silently producing a short index list.
    warnings.warn(
        f"Only matched {j} of {len(labelled_prefixes)} labelled drive dismissals to an innings."
    )



In [None]:
drive_dismissal_indices #index out of the total innings of innings where Kohli has got out playing a drive

In [None]:
match_list_by_inning = [inning['match_id'] for inning in kohli_innings]

In [None]:
len(match_list_by_inning)

In [None]:
cover_drive_dismissal_match_ids = [match_list_by_inning[i] for i in drive_dismissal_indices]

In [None]:
cover_drive_dismissal_match_ids #match id of matches where kohli is dismissed by the cover drive

Now that we have the cover drive match ids, we can figure out whether there is a trend in how early these dismissals happen. Does Kohli get out in different ways depending on how many balls he has faced? Does he get out to cover drives only early in his innings? And finally, how many cover drives does Kohli play in the innings where he gets out to a cover drive?

In [None]:
cover_drive_match_totals = [kohli_innings[i] for i in drive_dismissal_indices] #match totals of matches where kohli got out playing cover drive

In [None]:
cover_drive_match_totals_df = pd.DataFrame(cover_drive_match_totals)
cover_drive_match_totals_df.head()

In [None]:
cover_drive_match_totals_df.describe()

That is very very interesting, Kohli's innings when he gets out from driving look much the same as his normal stats. Nothing drastically different.

NOTE: Side concern is that we want to only sum the rows that have an out in their out column. If the batsman is not out then we want to skip. So let us quickly modify the describe function, so that before we apply describe, we want to get rid of these rows, or we can add the run total to the row above and then we can describe again.

OK, now let us count how many cover drives Kohli plays in each of these innings. Then we can also look at the average of these cover drives and check whether there are any trends with respect to balls faced and cover drives, average runs scored, cover drives played per innings, and cover drives "in control" vs "out of control".

Lets start with getting all the cover drives played from the innings where Kohli is dismissed by cover drives.

In [None]:
cover_drive_inning_comms = [kohli_comms[i] for i in drive_dismissal_indices] #innings comms for matches where Kohli has got out to a cover drive

In [None]:
cover_drive_inning_comms[0].commentTextItems.to_list()

In [None]:
# For every innings in which Kohli was (per the labels) dismissed driving, keep the
# balls whose commentary matches the drive keywords, and make sure the final ball of
# the innings (the dismissal) is included even if the keyword search missed it.
cover_drives_in_innings = []
for comms in cover_drive_inning_comms:
    innings = comms.commentTextItems.to_list()
    search = af.search_for_keywords(innings, ['drive', 'cover drive', 'full and wide', 'outside edge', 'reach', 'slip', 'edge'], exclude_words=['run out', 'pull', 'flicks', 'bouncer', 'short ball', 'stays back', 'backfoot', 'top edge', 'top-edge', 'lets one go', 'easy leave', 'leaves the ball'],return_matching=True, return_indices=True)
    positions = list(search[2])  # positional row indices of the matching balls
    last_position = len(comms) - 1

    if last_position >= 0 and last_position not in positions:
        # Add the dismissal ball as the final row and renumber the index, as the
        # original append(..., ignore_index=True) did. Selecting with iloc avoids
        # DataFrame.append, which was removed in pandas 2.0, and keeps column dtypes.
        # NOTE(review): the original author flagged this check as "not working
        # properly, all dismissals are being appended again" — verify the indices
        # returned by af.search_for_keywords if duplicates still appear.
        innings_drives = comms.iloc[positions + [last_position]].reset_index(drop=True)
    else:
        # Dismissal already among the matches (or the innings has no rows at all).
        innings_drives = comms.iloc[positions]

    # Exactly one frame per innings, so list positions stay aligned with
    # cover_drive_inning_comms.
    cover_drives_in_innings.append(innings_drives)

In [None]:
# _cover_drives_in_innings = []
# comms = cover_drive_inning_comms[34]
# innings = comms.commentTextItems.to_list()
# search = af.search_for_keywords(innings, ['drive', 'cover drive', 'full and wide', 'outside edge', 'reach', 'slip', 'edge'], exclude_words=['run out', 'pull', 'flicks', 'bouncer', 'short ball', 'stays back', 'backfoot', 'top edge', 'top-edge', 'lets one go', 'easy leave', 'leaves the ball'],return_matching=True, return_indices=True)
# _cover_drives_in_innings.append(comms.iloc[search[2]])
# ## Everything here is identified as cover drive dismissals anyways, so we will make sure to add the dismissal if the dismissal is not there.
# if int(comms.shape[0]) not in search[2]:
#     print(search[2])
#     try:
#         _cover_drives_in_innings.append(comms.iloc[-1])
#     except TypeError:
#         _cover_drives_in_innings.append(comms.iloc[-1])

In [None]:
#_cover_drives_in_innings

In [None]:
#cover_drives_in_innings[14][cover_drives_in_innings[14].isWicket == True]

OK, now we can describe all these DataFrames, and as long as the DataFrames only contain cover drives, we can get the stats we need: average runs per cover drive, how many cover drives in the innings, the average length of the innings, and the runs scored in the innings. Later on we can try to figure out a way to see whether Kohli is in control. Again, it will be a matter of finding key words.

I think we need to make our own describe function to account for the not outs and cricket dismissals. Ok what we want is average per shot, strike rate, dismissals.

In [None]:
# def analyse_batting(contributuion):
#     def safe_divide(numerator, denominator, _round=2):
#         try:
#             return round(numerator/denominator, 2)
#         except ZeroDivisionError:
#             return float('inf')

#     runs = contributuion.batsmanRuns.sum()
#     dismissals = contributuion[contributuion.isWicket == True].count().isWicket
#     balls = contributuion.shape[0]
#     strike_rate = safe_divide(runs, balls)
#     dot_balls = contributuion[contributuion.batsmanRuns == 0.0].count().batsmanRuns
#     fours = contributuion[contributuion.isFour == True].count().isFour
#     sixes = contributuion[contributuion.isSix == True].count().isSix
#     average = safe_divide(runs, dismissals)
#     how_out = af.how_out(contributuion.iloc[-1].dismissalType)
#     total_balls_faced = contributuion.iloc[-1].batsmanBallsFaced
#     fours_per_ball = safe_divide(fours, balls)
#     sixes_per_ball = safe_divide(sixes, balls)
#     dots_per_ball = safe_divide(dot_balls, balls)
    
#     #In control, out of control
#     return {
#         'runs': runs,
#         'dismissals': dismissals,
#         'balls':balls,
#         'sr':strike_rate,
#         'average': average,
#         'dot_balls': dot_balls,
#         'fours': fours,
#         'sixes': sixes,
#         'how-out': how_out,
#         'total_balls_faced': total_balls_faced,
#         'fours_per_ball': fours_per_ball,
#         'sixes_per_ball': sixes_per_ball,
#         'dots_per_ball': dots_per_ball
#     }

In [None]:
# def aggregate_batting_analysis(batting_stats):
#     """Takes list of batting stat objects and returns aggregate stats"""
#     averages = {}
#     totals = {}
#     keys = list(batting_stats[0].keys())
#     for key in keys:
#         try:
#             total = sum([i[key] for i in batting_stats])
#             av = total/len(batting_stats)
#             totals[key] = total
#             averages[key] = round(av,2)
#         except TypeError as e:
#             pass
#         except ZeroDivisionError:
#             av = float('inf')

    
#     return averages, totals

Ok, so now we got the stats we need for the cover drive, lets do this for other shots and lets also compare this to the baseline. Also we need to get averages, and everything for all these values. Lets create a function that when given a list of these batting analysis objects will return us back the average runs, average sr, the average balls faced, the total balls faced, percentage of shots resulting in boundaries, percentage of shots resulting in sixes, percentage of dot balls.

In [None]:
aves, tots, cover_drive_stats = af.analyse_batting(cover_drives_in_innings)

In [None]:
aves

In [None]:
tots

First lets start with this same batting analysis of cover drives on all his innings. Lets find out compared to innings he does not get out on cover drives, if they are the same as the innings he does get out on the cover drive. The main thing that we are trying to find here is simply if the cover drive is a shot that affects his batting negatively or not.

In [None]:
# I wanna do a quick sanity check on my batting analysis functions, make sure that they are giving me the correct results, so we will do the batting analysis on Kohli's whole career.
# So the analysis function takes contributions, we will need to loop through all of those and then apply the aggregate batting function
kohli_career_averages, kohli_career_totals, kohli_career_batting_stats = af.analyse_batting(kohli_comms)

In [None]:
kohli_career_averages

In [None]:
kohli_career_totals

In [None]:
#Ok the stats are almost correct, they count the runs accurately, and surely the balls as well, just need to find out why there are 42 more balls in virat's career.

In [None]:
# Same keyword search as for the dismissal innings, but over every innings of the
# career and with a longer exclusion list. One frame of matching balls per innings,
# in the same order as kohli_comms.
# The last ball of the innings is deliberately NOT force-added here.
cover_drives_in_innings_full = []
for innings_comms in kohli_comms:
    drive_search = af.search_for_keywords(
        innings_comms.commentTextItems.to_list(),
        ['drive', 'cover drive', 'full and wide', 'outside edge', 'reach', 'slip', 'edge'],
        exclude_words=[
            'run out', 'pull', 'flicks', 'bouncer', 'short ball', 'stays back',
            'backfoot', 'top edge', 'top-edge', 'lets one go', 'easy leave',
            'leaves the ball', 'leg side', 'leading edge', 'leg ', 'cut ',
        ],
        return_matching=True,
        return_indices=True,
    )
    matching_positions = drive_search[2]
    cover_drives_in_innings_full.append(innings_comms.iloc[matching_positions])

In [None]:
[len(x) for x in cover_drives_in_innings_full]

In [None]:
cover_drive_averages, cover_drive_totals, cover_drive_career_stats =  af.analyse_batting(cover_drives_in_innings_full)

In [None]:
cover_drive_averages

In [None]:
cover_drive_totals

In [None]:
[x['how-out'] if x else False for x in cover_drive_career_stats]

In [None]:
# Print the final ball's commentary for every innings whose cover-drive stats
# record a truthy 'how-out' (innings with empty/falsy stats are skipped).
for innings_comms, innings_stats in zip(kohli_comms, cover_drive_career_stats):
    how_out = innings_stats['how-out'] if innings_stats else False
    if how_out:
        print(innings_comms.iloc[-1].commentTextItems)
    