In [1]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import re

In [2]:
# Disable pandas' display truncation: show every row and every column of a frame.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [3]:
#Get Hawkeye data from cricket api
def get_url_hawkeye(match_id, use_fallback=False):
    """Build the ball-tracking ("uds/stats") URL for a fixture.

    The previous body wrapped the f-string in ``try/except``; formatting a
    string does not fail, so the iplt20 fallback URL could never be returned.
    The fallback host is now selectable explicitly instead.

    Parameters
    ----------
    match_id : int or str
        Fixture identifier, interpolated into the URL path unchanged.
    use_fallback : bool, default False
        When True, build the URL on ``cricketapi.platform.iplt20.com``
        instead of the default ``cricketapi-icc.pulselive.com`` host.

    Returns
    -------
    str
        The stats URL. No network request is made here.
    """
    if use_fallback:
        host = 'cricketapi.platform.iplt20.com'
    else:
        host = 'cricketapi-icc.pulselive.com'
    # The double slash before "fixtures" is kept exactly as in the original URLs.
    return f'https://{host}//fixtures/{match_id}/uds/stats'

In [4]:
#Get scoring data from cricket api
def get_url_metadata(match_id, use_fallback=False):
    """Build the match-metadata ("scoring") URL for a fixture.

    The previous body wrapped the f-string in ``try/except``; formatting a
    string does not fail, so the iplt20 fallback URL could never be returned.
    The fallback host is now selectable explicitly instead.

    Parameters
    ----------
    match_id : int or str
        Fixture identifier, interpolated into the URL path unchanged.
    use_fallback : bool, default False
        When True, build the URL on ``cricketapi.platform.iplt20.com``
        instead of the default ``cricketapi-icc.pulselive.com`` host.

    Returns
    -------
    str
        The scoring URL. No network request is made here.
    """
    if use_fallback:
        host = 'cricketapi.platform.iplt20.com'
    else:
        host = 'cricketapi-icc.pulselive.com'
    # The double slash before "fixtures" is kept exactly as in the original URLs.
    return f'https://{host}//fixtures/{match_id}/scoring'

In [5]:
#Get the data in the parsed html format
def get_soup_from_url(url):
    """Download ``url`` and return its body as text, re-serialised by BeautifulSoup.

    Parameters
    ----------
    url : str
        Address to fetch.

    Returns
    -------
    str or int
        ``str(BeautifulSoup(body, "html.parser"))`` on success. If the request
        raises ``HTTPError`` a message is printed and the integer ``-1`` is
        returned instead, so callers must check for that sentinel (passing it
        to ``json.loads`` raises ``TypeError``).

    Notes
    -----
    Only ``HTTPError`` is handled; any other exception raised by ``urlopen``
    propagates to the caller.
    """
    try:
        # Use the response as a context manager so the connection is closed
        # even if read() fails; previously the response was never closed.
        with urlopen(url) as response:
            html = response.read()
    except HTTPError:
        print("Link Cannot be Reached", url)
        return -1

    soup = BeautifulSoup(html, "html.parser")
    return str(soup)

In [6]:
def get_tracking_df_from_matchid(match_id):
    """Fetch a fixture's ball-tracking feed and return it as a cleaned DataFrame.

    The feed is read via ``get_soup_from_url(get_url_hawkeye(match_id))`` and
    parsed as JSON. Each entry of ``payload['data']`` is a mapping whose key is
    a dot-separated "innings.over.ball" string and whose value is a
    comma-separated record; key and record fields together must match the 22
    column names below.

    Parameters
    ----------
    match_id : int or str
        Fixture identifier; stored in the ``match_id`` column as ``str``.

    Returns
    -------
    pandas.DataFrame or None
        One row per delivery with ``match_inn``, ``over_num``, ``over_ball``
        and ``deviation`` (``line_at_stumps - line``) added and the raw
        ``over`` key removed. ``None`` is returned when:

        * the feed has no rows;
        * speed, length, line, line_at_stumps and height_at_stumps each hold a
          single distinct value (treated as "no tracking data"; this also
          covers a feed with only one delivery);
        * anything fails while fetching or parsing (a message is printed).

    Notes
    -----
    ``speed`` is multiplied by 3.6 (presumably m/s -> km/h; confirm against
    the feed) and negative speeds are replaced with NaN.
    """
    columns = ['over', 'ball_num', 'batter', 'non-striker',
               'bowler', 'speed', 'catcher', 'dismissal_desc',
               'total_extras', 'runs', 'bowler_extras', 'extra_type',
               'otw', 'length', 'line', 'line_at_stumps',
               'height_at_stumps', 'shot_dist0', 'shot_dist1', 'blank2',
               'blank3', 'blank4']
    tracking_cols = ['speed', 'length', 'line', 'line_at_stumps', 'height_at_stumps']

    try:
        # get_soup_from_url returns -1 on HTTP errors; json.loads then raises
        # TypeError, which is reported by the handler below.
        payload = json.loads(get_soup_from_url(get_url_hawkeye(match_id)))
        rows = [[over_key] + ball_csv.split(',')
                for entry in payload['data']
                for over_key, ball_csv in entry.items()]
        df = pd.DataFrame(rows, columns=columns)
        df['match_id'] = str(match_id)

        if df.empty:
            return None
        # Every tracking field constant across the match => nothing usable.
        if all(df[col].nunique() == 1 for col in tracking_cols):
            return None

        # Split "innings.over.ball" into its three components.
        over_parts = df['over'].apply(lambda key: str(key).split('.'))
        df['match_inn'] = over_parts.apply(lambda parts: parts[0])
        df['over_ball'] = pd.to_numeric(over_parts.apply(lambda parts: parts[2]), errors='coerce')
        df['over_num'] = pd.to_numeric(over_parts.apply(lambda parts: parts[1]), errors='coerce')
        df = df.drop(columns='over')

        df['speed'] = pd.to_numeric(df['speed'], errors='coerce') * 3.6
        df.loc[df['speed'] < 0, 'speed'] = np.nan

        for col in ['length', 'line', 'line_at_stumps', 'height_at_stumps']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df['deviation'] = df['line_at_stumps'] - df['line']
        return df
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit and made a long scraping loop impossible to interrupt.
        print(f"couldn't retrieve data for match {match_id}. Please check {get_url_hawkeye(match_id)} to debug")
        return None

In [7]:
# Ball-tracking frame for fixture 23498; this is None if the fetch fails or the feed has no tracking data.
wtc = get_tracking_df_from_matchid(match_id=23498)

In [8]:
def get_metadata_df_from_matchid(match_id):
    """Fetch a fixture's scoring payload and split it into three metadata frames.

    The payload is read via ``get_soup_from_url(get_url_metadata(match_id))``
    and parsed as JSON; everything is taken from its ``matchInfo`` object.

    Parameters
    ----------
    match_id : int or str
        Fixture identifier; stored unchanged in the ``match_id`` column.
        NOTE(review): get_tracking_df_from_matchid stores ``str(match_id)``,
        so the two frames may disagree on dtype when joined - confirm which
        is intended.

    Returns
    -------
    dict of pandas.DataFrame
        ``'match_metadata'``: one row with the selected matchInfo fields plus
        match_id, venue_id, team1_wk/team2_wk (``''`` for both if either
        keeper id is unavailable), team1/team2 full names, toss_winner and
        toss_decision (``'bat'``, ``'field'`` or ``''``).
        ``'player_metadata'``: de-duplicated players of both teams with
        ``batter_hand`` / ``bowler_hand`` set to ``'R'`` or ``'L'``.
        ``'venue_metadata'``: one row built from ``matchInfo['venue']``.

    Raises
    ------
    TypeError
        From ``json.loads`` when the download failed and the ``-1`` sentinel
        was returned by ``get_soup_from_url``.
    KeyError, IndexError
        When mandatory parts of the payload (venue, teams, players) are missing.
    """
    payload = json.loads(get_soup_from_url(get_url_metadata(match_id)))
    info = payload['matchInfo']

    kept_fields = ['matchDate', 'matchEndDate', 'isLimitedOvers',
                   'description', 'matchType', 'tournamentLabel']
    match_df = pd.DataFrame([{key: value for key, value in info.items() if key in kept_fields}])
    match_df['match_id'] = match_id

    # Optional lookups: only lookup failures are tolerated. The original bare
    # `except:` clauses also swallowed KeyboardInterrupt/SystemExit.
    try:
        toss_elected = info['additionalInfo']['toss.elected']
    except (KeyError, IndexError, TypeError):
        toss_elected = ''

    match_df['venue_id'] = info['venue']['id']
    teams = info['teams']
    try:
        team1_wk = teams[0]['wicketKeeper']['id']
        team2_wk = teams[1]['wicketKeeper']['id']
    except (KeyError, IndexError, TypeError):
        # Keep the original all-or-nothing behaviour: blank both if either is missing.
        team1_wk = team2_wk = ''
    match_df['team1_wk'] = team1_wk
    match_df['team2_wk'] = team2_wk
    match_df['team1'] = teams[0]['team']['fullName']
    match_df['team2'] = teams[1]['team']['fullName']

    venue_df = pd.DataFrame([info['venue']])
    player_df = pd.concat([pd.DataFrame(teams[0]['players']),
                           pd.DataFrame(teams[1]['players'])]).drop_duplicates()
    player_df = player_df.assign(
        batter_hand=player_df['rightHandedBat'].apply(lambda flag: 'R' if flag else 'L'),
        bowler_hand=player_df['rightArmedBowl'].apply(lambda flag: 'R' if flag else 'L'),
    )

    # Prefix the match type for women's tournaments. A missing or null
    # tournamentLabel used to raise AttributeError; treat it as "not women's".
    tournament_label = str(info.get('tournamentLabel') or '').lower()
    if 'women' in tournament_label and 'matchType' in match_df.columns:
        match_df['matchType'] = 'W_' + match_df['matchType']

    # Toss text looks like "<team>, ... <decision>."; winner is the part before
    # the first comma, decision is the last word.
    toss_text = str(toss_elected)
    toss_winner = toss_text.strip().lower().split(',')[0]
    # Strip whitespace before the trailing period so "... to bat. " still
    # yields "bat" (the winner parsing above already stripped whitespace).
    toss_decision = toss_text.lower().strip().strip('.').split(' ')[-1]
    if toss_decision == 'bowl':
        toss_decision = 'field'
    if toss_decision not in ('field', 'bat'):
        toss_decision = ''
    match_df['toss_winner'] = toss_winner
    match_df['toss_decision'] = toss_decision

    return {'match_metadata': match_df,
            'player_metadata': player_df,
            'venue_metadata': venue_df}


In [9]:
# Match, player and venue metadata frames for fixture 23498, returned as a dict keyed by frame name.
wtc_meta = get_metadata_df_from_matchid(match_id=23498)

In [14]:
# Preview the first 40 deliveries of the tracking frame.
wtc.head(n=40)

Unnamed: 0,ball_num,batter,non-striker,bowler,speed,catcher,dismissal_desc,total_extras,runs,bowler_extras,extra_type,otw,length,line,line_at_stumps,height_at_stumps,shot_dist0,shot_dist1,blank2,blank3,blank4,match_id,match_inn,over_ball,over_num,deviation
0,1,5436,158,38,137.3508,-1,,0,0,0,,y,6.554,-0.19,0.011,0.649,0,0,,,,23498,1,1,1,0.201
1,2,5436,158,38,137.2248,-1,,0,0,0,,y,6.445,-0.179,0.078,0.831,0,0,,,,23498,1,2,1,0.257
2,3,5436,158,38,135.954,-1,,0,0,0,,y,7.479,-0.537,-0.496,0.675,0,0,,,,23498,1,3,1,0.041
3,4,5436,158,38,135.0144,-1,,0,0,0,,y,7.089,-0.424,-0.37,0.886,0,0,,,,23498,1,4,1,0.054
4,5,5436,158,38,135.7884,-1,,0,0,0,,y,6.837,-0.259,-0.008,0.783,0,0,,,,23498,1,5,1,0.251
5,6,5436,158,38,120.2832,-1,,0,0,0,,y,7.859,-0.61,-0.654,0.762,0,0,,,,23498,1,6,1,-0.044
6,1,158,5436,964,129.1104,-1,,1,1,1,,y,7.095,-0.362,-0.091,0.885,34,26,,,,23498,1,1,2,0.271
7,2,5436,158,964,129.0996,-1,,1,1,1,,y,7.668,-0.262,-0.168,0.54,34,31,,,,23498,1,2,2,0.094
8,3,158,5436,964,127.8468,-1,,1,1,1,,y,8.371,-0.015,0.232,0.941,54,33,,,,23498,1,3,2,0.247
9,4,5436,158,964,126.9396,-1,,1,1,1,,y,8.002,-0.444,-0.165,0.824,16,19,,,,23498,1,4,2,0.279


In [19]:
# Display the venue frame stored under the 'venue_metadata' key of the metadata dict.
wtc_meta['venue_metadata']

Unnamed: 0,id,fullName,shortName,city,country
0,19,Narendra Modi Stadium,Narendra Modi Stadium,Ahmedabad,India
