In [1]:
import requests
import json

def fetch_data_from_api(api_url, headers=None, params=None, timeout=30):
    """
    Fetch a URL with a GET request and return its body as JSON or text.

    Args:
        api_url: URL to request.
        headers: Optional dict of HTTP headers passed to ``requests.get``.
        params: Optional dict of query-string parameters passed to
            ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the call indefinitely.

    Returns:
        A ``(json_data, text)`` tuple:
        * ``(parsed_json, None)`` when the Content-Type header mentions
          ``application/json``;
        * ``(None, response_text)`` for any other (or missing) content type;
        * ``(None, None)`` when the request fails, the status is an error,
          or the JSON body cannot be decoded (the error is printed).
    """
    try:
        # Make a request to the API
        response = requests.get(
            api_url, headers=headers, params=params, timeout=timeout
        )

        # Raise an exception if the request was unsuccessful
        response.raise_for_status()

        # The header may be absent; fall back to '' so the membership test
        # below cannot fail with a TypeError on None.
        content_type = response.headers.get('Content-Type') or ''

        # Media types are case-insensitive, so compare in lower case
        if 'application/json' in content_type.lower():
            # If the response is JSON, return the JSON data
            return response.json(), None

        # Otherwise, assume the response is HTML
        return None, response.text

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None, None
    except ValueError as e:
        # A malformed JSON body can surface as a plain ValueError rather than
        # a RequestException; report it the same way instead of crashing.
        print(f"An error occurred: {e}")
        return None, None

def extract_json_after_next_data_tag(file_content):
    """
    Extracts JSON content after the _NEXT_DATA__" type="application/json"> tag.

    Args:
        file_content: HTML text expected to contain the marker followed by a
            JSON document and a closing ``</script>`` tag.

    Returns:
        ``(json_data, json_content)`` where ``json_data`` is the parsed
        object and ``json_content`` is the raw JSON text, or ``(None, None)``
        when a marker is missing, the JSON is invalid, or the input is not a
        string (the reason is printed).
    """
    marker = '_NEXT_DATA__" type="application/json">'
    try:
        # Locate the marker first and test the raw find() result: adding the
        # marker length before the check would turn -1 into a positive
        # offset and hide the "not found" case.
        marker_pos = file_content.find(marker)

        # Check if the start marker was found
        if marker_pos == -1:
            raise ValueError("Start marker not found in the file content")

        # Extract the JSON content
        json_content = file_content[marker_pos + len(marker):]

        # Print the extracted JSON content for debugging
        print("Extracted JSON content:", json_content[:500])  # Print the first 500 characters

        # Find the end of the JSON content
        json_end = json_content.find('</script>')

        if json_end == -1:
            raise ValueError("End marker not found in the file content")

        # Isolate the JSON data
        json_content = json_content[:json_end].strip()

        # Validate the extracted JSON content
        json_data = json.loads(json_content)

        return json_data, json_content

    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
        return None, None
    except (ValueError, TypeError, AttributeError) as e:
        # ValueError: a marker is missing; TypeError/AttributeError: the
        # input was not a str (e.g. None or bytes).
        print(f"An error occurred while extracting JSON: {e}")
        return None, None

def save_json_to_file(json_content, output_file):
    """
    Write already-serialised JSON text to ``output_file`` as UTF-8.

    The file is overwritten if it exists. Failures are reported on stdout
    rather than raised, so the function always returns None.
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as handle:
            handle.write(json_content)
    except Exception as exc:
        print(f"An error occurred while saving JSON to file: {exc}")
    else:
        # Only reached when the write completed without raising
        print(f"JSON content successfully saved to {output_file}")

# Example usage: request settings for one ESPNcricinfo ball-by-ball
# commentary page and the file the extracted JSON is written to.
api_url = "https://www.espncricinfo.com/series/india-in-sri-lanka-2024-1442984/sri-lanka-vs-india-1st-t20i-1442987/ball-by-ball-commentary"
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    # NOTE(review): this advertises br and zstd; confirm the HTTP client in
    # this environment can decode them, otherwise response.text may be garbled.
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    # The cookie value must be a single line: a raw line break inside the
    # quotes is a syntax error, so the value is built from two adjacent
    # string literals that Python concatenates.
    # NOTE(review): this looks like a cookie jar captured from a browser
    # session - consider loading it from the environment instead of
    # committing it to source.
    "cookie": (
        "_cb=DTQxLy3Jm1XBtmcL5; SWID=f6f3db74-9e0e-4048-8790-8a4532e7653a; _cc_id=3f79452065065c76763f9a1f40dac86f; WZRK_G=041019a6f2414f26a84bd108e525733b; trc_cookie_storage=taboola%2520global%253Auser-id%3D947c6663-8df1-42dd-8318-46490ef6836d-tuctc96b065; _uetvid=7805a520aee011eeaa09a1ee6681b14e; _ga=GA1.1.679824282.1718555219; s_pers=%20s_c24%3D1718808602208%7C1813416602208%3B%20s_c24_s%3DLess%2520than%25207%2520days%7C1718810402208%3B%20s_gpv_pn%3Dcricinfo%253Asearch%253Asearch%2520result%7C1718810402210%3B; _ga_TNVCMNW09Q=GS1.1.1718987635.4.0.1718987635.0.0.0; _ga_H0P43ZY447=GS1.1.1718987687.2.0.1718987697.0.0.0; _ga_XS3N0WC54W=GS1.1.1718987747.3.1.1718987927.0.0.0; edition=espncricinfo-en-in; edition-view=espncricinfo-en-in; region=unknown; _dcf=1; connectionspeed=full; cto_bundle=sFLpe19GViUyQmRHUVlaRXJ1WGZqaUtNc3RnVDdZTDJZSTAlMkZFc3dJaHJkWFRBRXQzaG9ZSXZ6MEY5Y0JKV0VUY2RXTTM4Q3ptT2ZDNEJGeEp0JTJCQUt2NzAxT0lta0hMWnlaR2ZBSHdoa1RtWjV1UExsMGwxTW5BaW5scWRsWmxwNXV4R0xkTQ; country=in; s_ensPortal=sports; s_ensCDS=0; __gads=ID=a85a2a34d7a2c324:T=1704798944:RT=1722179627:S=ALNI_MZvvZHxOdAQ6JIwCh_Bk5AgXTGiZg; __gpi=UID=00000cd46a933317:T=1704798944:RT=1722179627:S=ALNI_MbP3FdvgMBBgIohDstFK4RlkCDrSQ; __eoi=ID=97602268cd3413c3:T=1718555173:RT=1722179627:S=AA-AfjYhQlVtGGwCWsyBHmnzKcqz; panoramaId_expiry=1722266027860; s_c24_s=Less%20than%207%20days; AMCVS_EE0201AC512D2BE80A490D4C%40AdobeOrg=1; AMCV_EE0201AC512D2BE80A490D4C%40AdobeOrg=1585540135%7CMCIDTS%7C19932%7CMCMID%7C33855837611456531403825214972827827092%7CMCAAMLH-1722784428%7C12%7CMCAAMB-1722784428%7C6G1ynYcLPuiQxYZrsz_pkqfLG9yMXBpb2zX5dvJdYQJzPXImdj0y%7CMCOPTOUT-1722186828s%7CNONE%7CvVersion%7C4.4.0; s_cc=true; s_gpv=espncricinfo%3Agame%3Asurrey-vs-glamorgan-group-b%3Acommentary; s_ensNR=1722179768388-Repeat; s_nr30=1722179768817-Repeat; s_c24=1722179768818; s_sq=%5B%5BB%5D%5D; WZRK_S_884-7R5-R85Z=%7B%22p%22%3A3%2C%22s%22%3A1722179627%2C%22t%22%3A1722179769%7D; s_ips=989.3999938964844; "
        "nol_fpid=9847n0dtfdeqyfzeb3cfoqxyeo3yh1722086391|1722086391049|1722179770810|1722179771099; _chartbeat2=.1663738421793.1722179773593.0000000000000011.CNjQHOBREiiMBpCot-Dcx96kDB80sA.1; _cb_svref=external; s_tp=2923; s_ppv=espncricinfo%253Agame%253Asurrey-vs-glamorgan-group-b%253Acommentary%2C34%2C29%2C29%2C850.1999969482422%2C4%2C1"
    ),
    "dnt": "1",
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}
# NOTE(review): these look like template placeholders; they are sent as the
# query string of the request - confirm whether the page needs them at all.
params = {"param1": "value1", "param2": "value2"}
json_output_file = 'combined_output.json'

# Request the page; exactly one of json_data / html_content is set on
# success, and both are None when the request failed.
json_data, html_content = fetch_data_from_api(api_url, headers, params)

if html_content:
    # HTML response: pull the JSON that follows the
    # _NEXT_DATA__" type="application/json"> tag out of the page.
    json_data, json_content = extract_json_after_next_data_tag(html_content)

    # Write the raw JSON text to disk only when extraction succeeded
    if json_content:
        save_json_to_file(json_content, json_output_file)
elif json_data:
    # JSON response: serialise the parsed payload and save it directly
    json_content = json.dumps(json_data, indent=4)
    save_json_to_file(json_content, json_output_file)


Extracted JSON content: {"props":{"appNextJsContext":{"lang":"en","country":"in","edition":"in","device":{"screen":"DESKTOP_LG","orientation":"LANDSCAPE","isTouchDevice":false,"isMobile":false,"isSSR":true},"hip":"ffc31168120ffe0cd1d9b47579a44377"},"globalDetails":{"followItems":[{"title":"Instagram","summary":"","previewImage":null,"type":"URL","countryCodes":[],"url":{"id":11670,"objectId":1243355,"title":"https://www.instagram.com/espncricinfo/","url":"https://www.instagram.com/espncricinfo/"}},{"title":"WhatsApp","
JSON content successfully saved to combined_output.json


In [13]:
import pandas as pd
import json

# Load the JSON file. It is written with encoding='utf-8', so read it back
# the same way instead of relying on the platform's default encoding.
with open('combined_output.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract the relevant details
series_id = data['query']['seriesId']
match_id = data['query']['matchId']
innings = data['props']['appPageProps']['data']['content']['innings']

# Create an empty list to store all batsman details
all_batsmen = []

# Iterate over each inning in the innings list
for inning in innings:
    # A missing key and a null value are both treated as "no batsmen", so an
    # innings without batting data cannot break the loop.
    batsmen = inning.get('inningBatsmen') or []
    # Tag every batsman record with the series and match it belongs to
    for batsman in batsmen:
        batsman['seriesId'] = series_id
        batsman['matchId'] = match_id
    # Extend the all_batsmen list with details from the current inning
    all_batsmen.extend(batsmen)

# Convert the details to a pandas DataFrame
df = pd.DataFrame(all_batsmen)

def _nested_field(record, key):
    """Return record[key] when the cell holds a dict, else None (null/NaN cell)."""
    return record.get(key) if isinstance(record, dict) else None


# Extract player details into separate columns
df['playerId'] = df['player'].apply(_nested_field, key='id')
df['playerLongName'] = df['player'].apply(_nested_field, key='longName')
df['playerBattingStyles'] = df['player'].apply(_nested_field, key='battingStyles')

# Extract bowler details into separate columns (the cell is null when the
# batsman has no dismissing bowler)
df['bowlerId'] = df['dismissalBowler'].apply(_nested_field, key='id')
df['bowlerLongName'] = df['dismissalBowler'].apply(_nested_field, key='longName')
df['bowlingStyles'] = df['dismissalBowler'].apply(_nested_field, key='bowlingStyles')

# Drop the original 'player' and 'dismissalBowler' columns
df = df.drop(columns=['player', 'dismissalBowler'])

# Reorder the columns to ensure seriesId and matchId are first
# and bowler details are after dismissalType.
leading_columns = ['seriesId', 'matchId', 'playerId', 'playerLongName', 'playerBattingStyles']
trailing_columns = ['dismissalType', 'bowlerId', 'bowlerLongName', 'bowlingStyles']
# Every explicitly placed column - including 'dismissalType' - has to be left
# out of the middle section, otherwise it is selected twice and the frame
# ends up with a duplicated column.
pinned_columns = set(leading_columns) | set(trailing_columns)
columns_order = (
    leading_columns
    + [col for col in df.columns if col not in pinned_columns]
    + trailing_columns
)
df = df[columns_order]

# Inspect the DataFrame
print(df)

# Save the DataFrame to an Excel file
df.to_excel('batsmen_details.xlsx', index=False)

# Load the Excel file to verify its content
df_excel = pd.read_excel('batsmen_details.xlsx')
print(df_excel)


   seriesId  matchId  playerId       playerLongName playerBattingStyles  \
0   1442984  1442987    102743     Yashasvi Jaiswal               [lhb]   
1   1442984  1442987     95316         Shubman Gill               [rhb]   
2   1442984  1442987     61990     Suryakumar Yadav               [rhb]   
3   1442984  1442987     86165         Rishabh Pant               [lhb]   
4   1442984  1442987     70633        Hardik Pandya               [rhb]   
5   1442984  1442987     96392          Riyan Parag               [rhb]   
6   1442984  1442987     75591          Rinku Singh               [lhb]   
7   1442984  1442987     67455           Axar Patel               [lhb]   
8   1442984  1442987    101430       Arshdeep Singh               [lhb]   
9   1442984  1442987    104770         Ravi Bishnoi               [rhb]   
10  1442984  1442987     87477       Mohammed Siraj               [rhb]   
11  1442984  1442987     93033      Pathum Nissanka               [rhb]   
12  1442984  1442987     

In [19]:
import pandas as pd
import json

# Load the JSON file. It is written with encoding='utf-8', so read it back
# the same way instead of relying on the platform's default encoding.
with open('combined_output.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract the relevant details
series_id = data['query']['seriesId']
match_id = data['query']['matchId']
innings = data['props']['appPageProps']['data']['content']['innings']

# Create an empty list to store all inningOvers details
all_inningOvers = []

# Iterate over each inning in the innings list
for inning in innings:
    # A missing key and a null value are both treated as "no overs", so an
    # innings without over data cannot break the loop.
    inningOvers = inning.get('inningOvers') or []
    # Tag every over record with the series and match it belongs to
    for over in inningOvers:
        over['seriesId'] = series_id
        over['matchId'] = match_id
    # Extend the all_inningOvers list with details from the current inning
    all_inningOvers.extend(inningOvers)

# Convert the details to a pandas DataFrame
df = pd.DataFrame(all_inningOvers)

# Extract bowler details into separate columns
def extract_bowler_info(bowlers_list, key):
    """
    Join one field of every bowler record into a comma-separated string.

    Args:
        bowlers_list: List (or tuple) of bowler dicts, or a null cell value
            such as None / NaN when the row has no bowlers.
        key: Name of the field to read from each bowler dict.

    Returns:
        The ``str()`` of each bowler's ``key`` value joined with ``', '``
        (a missing field contributes the text ``'None'``), or None when
        ``bowlers_list`` is empty or not a list/tuple.
    """
    # A type check is used instead of pd.notnull(): for a list, notnull()
    # returns one boolean per element, and an array with two or more
    # elements cannot be used as an `if` condition.
    if not isinstance(bowlers_list, (list, tuple)) or not bowlers_list:
        return None
    return ', '.join(str(bowler.get(key)) for bowler in bowlers_list)

# Flatten each over's list of bowlers into comma-separated id / name strings
df['bowlerIds'] = df['bowlers'].apply(extract_bowler_info, key='id')
df['bowlerLongNames'] = df['bowlers'].apply(extract_bowler_info, key='longName')

# The nested 'bowlers' column is no longer needed once it has been flattened
df = df.drop(columns='bowlers')

# Put the identifying columns first and keep every other column in its
# existing relative order
front_columns = ['seriesId', 'matchId', 'bowlerIds', 'bowlerLongNames']
columns_order = front_columns + [name for name in df.columns if name not in front_columns]
df = df[columns_order]

# Show the reshaped frame
print(df)

# Write the frame to CSV without the index column
df.to_csv('innings_details.csv', index=False)

# Read the CSV back and print it as a sanity check of what was written
df_csv = pd.read_csv('innings_details.csv')
print(df_csv)


   seriesId  matchId bowlerIds      bowlerLongNames  overNumber  overRuns  \
0   1442984  1442987     79067   Dilshan Madushanka           1        13   
1   1442984  1442987     78275      Asitha Fernando           2         9   
2   1442984  1442987    101991   Maheesh Theekshana           3        14   
3   1442984  1442987     78275      Asitha Fernando           4        15   
4   1442984  1442987    101991   Maheesh Theekshana           5         8   
5   1442984  1442987     79067   Dilshan Madushanka           6        15   
6   1442984  1442987     78239    Wanindu Hasaranga           7         7   
7   1442984  1442987     79067   Dilshan Madushanka           8        17   
8   1442984  1442987     78239    Wanindu Hasaranga           9         4   
9   1442984  1442987     78233       Kamindu Mendis          10         9   
10  1442984  1442987     78239    Wanindu Hasaranga          11        11   
11  1442984  1442987    105938  Matheesha Pathirana          12        13   