In [6]:
import json
import os

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def fetch_data_from_api(api_url, headers=None, params=None, timeout=30):
    """GET ``api_url`` with automatic retries and split the result by content type.

    Args:
        api_url: URL to request.
        headers: Optional mapping of request headers.
        params: Optional mapping of query-string parameters.
        timeout: Seconds passed to ``requests`` as the request timeout, so a
            stalled server cannot block the notebook indefinitely.

    Returns:
        A 2-tuple:
        * ``(parsed_json, None)`` when the Content-Type header contains
          ``application/json``;
        * ``(None, response_text)`` for any other content type (treated as HTML);
        * ``(None, None)`` when the request or JSON decoding fails (the error
          is printed, not raised).
    """
    # Retry up to 5 times on transient 5xx responses with a short backoff.
    retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)

    try:
        # The context manager closes pooled connections when we are done.
        with requests.Session() as session:
            # Mount for both schemes so plain-http URLs get the same retry policy.
            session.mount('https://', adapter)
            session.mount('http://', adapter)

            response = session.get(api_url, headers=headers, params=params, timeout=timeout)

            # Raise an exception if the request was unsuccessful
            response.raise_for_status()

            # The header may be absent; fall back to '' so the membership test
            # below cannot raise TypeError on None.
            content_type = response.headers.get('Content-Type') or ''

            if 'application/json' in content_type.lower():
                # If the response is JSON, return the JSON data
                return response.json(), None

            # Otherwise, assume the response is HTML
            return None, response.text

    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"An error occurred: {e}")
        return None, None

# Example usage
api_url = "https://www.espncricinfo.com/series/t20-blast-2024-1410370/points-table-standings"

# Browser-like request headers.
# "accept-encoding" is intentionally not set here: requests advertises only the
# encodings it is able to decode in the current environment, whereas forcing
# "br"/"zstd" can yield an undecodable body when the optional decoders are
# not installed.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "dnt": "1",
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}

# A personal browser cookie jar must not live in the notebook source. If the
# site needs one, export it as ESPNCRICINFO_COOKIE before starting the kernel.
cookie = os.environ.get("ESPNCRICINFO_COOKIE")
if cookie:
    headers["cookie"] = cookie

# NOTE(review): these look like template placeholders — confirm whether the
# page needs any query parameters at all.
params = {"param1": "value1", "param2": "value2"}
json_output_file = 'combined_output.json'

# Fetch data from the API; exactly one of the two results is expected to be
# populated on success, and both are None when the request failed.
json_data, html_content = fetch_data_from_api(api_url, headers, params)

if html_content:
    # Step 2: Extract JSON content after the _NEXT_DATA__" type="application/json"> tag
    # (helper is defined outside this cell; presumably returns a
    # (parsed, raw_text) pair — verify against its definition).
    json_data, json_content = extract_json_after_next_data_tag(html_content)

    # Step 3: Save the extracted JSON content to a new .json file
    if json_content:
        save_json_to_file(json_content, json_output_file)
    else:
        # Say so explicitly: later cells read this file and would otherwise
        # silently pick up whatever an earlier run left on disk.
        print(f"No embedded JSON found in the HTML; {json_output_file} was not updated.")
elif json_data:
    # Save the JSON data directly if it was fetched as JSON
    json_content = json.dumps(json_data, indent=4)
    save_json_to_file(json_content, json_output_file)
else:
    print(f"Nothing usable was fetched from {api_url}; {json_output_file} was not updated.")


Extracted JSON content: {"props":{"appNextJsContext":{"lang":"en","country":"in","edition":"in","device":{"screen":"DESKTOP_LG","orientation":"LANDSCAPE","isTouchDevice":false,"isMobile":false,"isSSR":true},"hip":"ffc31168120ffe0cd1d9b47579a44377"},"globalDetails":{"followItems":[{"title":"Instagram","summary":"","previewImage":null,"type":"URL","countryCodes":[],"url":{"id":11670,"objectId":1243355,"title":"https://www.instagram.com/espncricinfo/","url":"https://www.instagram.com/espncricinfo/"}},{"title":"WhatsApp","
JSON content successfully saved to combined_output.json


In [23]:
import pandas as pd
import json

# Read the saved page payload back from disk and parse it
with open('combined_output.json', 'r') as file:
    data = json.loads(file.read())

# Key sequence that should lead to the list of matches (assumed structure)
recent_matches_path = [
    'props',
    'appPageProps',
    'data',
    'editionDetails',
    'trendingMatches',
    'matches',
]

def get_nested_value(data, path):
    """Walk ``data`` along ``path`` and return whatever is found at the end.

    Args:
        data: Nested structure of dicts and lists (e.g. parsed JSON).
        path: Iterable of steps; dict levels are looked up by key, list levels
            are indexed.

    Returns:
        The value at the end of ``path``. A missing dict key yields ``{}`` and
        the walk continues from there. ``None`` is returned as soon as a step
        cannot be taken: the current value is neither a dict nor a list, or a
        list index is out of range or of the wrong type.
    """
    for key in path:
        if isinstance(data, dict):
            data = data.get(key, {})
        elif isinstance(data, list):
            try:
                data = data[key]
            except (IndexError, TypeError):
                # Out-of-range index, or a non-integer key applied to a list.
                return None
        else:
            # Scalars and None cannot be traversed any further.
            return None
    return data

recent_matches = get_nested_value(data, recent_matches_path)


def _dig(mapping, *keys):
    """Follow ``keys`` through nested dicts; None once a level is not a dict.

    Unlike chained ``.get(key, {})`` calls, this also copes with keys that are
    present but hold ``None`` (JSON null).
    """
    for key in keys:
        if not isinstance(mapping, dict):
            return None
        mapping = mapping.get(key)
    return mapping


# Check if recent_matches is found and is a list
if isinstance(recent_matches, list):
    # One flat record per match; entries that are not dicts are skipped
    # because none of the fields below could be read from them.
    all_matches = [
        {
            'matchId': match.get('matchId'),
            'seriesId': match.get('seriesId'),
            'team1': _dig(match, 'team1', 'team', 'longName'),
            'team2': _dig(match, 'team2', 'team', 'longName'),
            'matchDate': match.get('matchDate'),
            'matchResult': match.get('matchResult'),
            'matchType': match.get('matchType'),
            'ground': _dig(match, 'ground', 'ground', 'longName'),
        }
        for match in recent_matches
        if isinstance(match, dict)
    ]

    df = pd.DataFrame(all_matches)
    print(df)
    df.to_excel('Recent_Matches.xlsx', index=False)
else:
    print("No recent matches data found or data is not in expected format.")


No recent matches data found or data is not in expected format.


In [29]:
import pandas as pd
import json

# Read the saved page payload back from disk and parse it
with open('combined_output.json', 'r') as file:
    data = json.loads(file.read())

# Key sequence leading to the matches mapping
matches_path = [
    'props',
    'appPageProps',
    'data',
    'content',
    'standings',
    'objects',
    'matches',
]

def get_nested_value(data, path):
    """Walk ``data`` along ``path`` and return whatever is found at the end.

    Args:
        data: Nested structure of dicts and lists (e.g. parsed JSON).
        path: Iterable of steps; dict levels are looked up by key, list levels
            are indexed.

    Returns:
        The value at the end of ``path``. A missing dict key yields ``{}`` and
        the walk continues from there. ``None`` is returned as soon as a step
        cannot be taken: the current value is neither a dict nor a list, or a
        list index is out of range or of the wrong type.
    """
    for key in path:
        if isinstance(data, dict):
            data = data.get(key, {})
        elif isinstance(data, list):
            # Lists are traversed by index so paths may cross JSON arrays.
            try:
                data = data[key]
            except (IndexError, TypeError):
                return None
        else:
            return None
    return data

# Get the matches dictionary
matches = get_nested_value(data, matches_path)

# Check if matches is a dictionary
if isinstance(matches, dict):
    print("Matches is a dictionary")
else:
    print("Matches is not a dictionary or is empty")
    # Fall back to an empty mapping so the loop below is a no-op instead of
    # failing on `.items()`.
    matches = {}


def _team_long_name(team_entry):
    """Return ``team_entry['team']['longName']``, or None if a level is missing or null."""
    team = (team_entry or {}).get('team') or {}
    return team.get('longName')


# Create an empty list to store all recent match details
all_recent_matches = []

# Iterate over each match in the matches dictionary
for match_id, match_info in matches.items():
    # `.get(key, {})` only covers absent keys; `or {}` / `or []` also covers
    # keys that are present but null.
    match_info = match_info or {}
    series_info = match_info.get('series') or {}
    ground_info = match_info.get('ground') or {}
    teams_info = match_info.get('teams') or []

    # Team names are only taken when both sides are listed
    if len(teams_info) >= 2:
        team1_name = _team_long_name(teams_info[0])
        team2_name = _team_long_name(teams_info[1])
    else:
        team1_name = None
        team2_name = None

    # Append the match details to the list
    all_recent_matches.append({
        'matchId': match_id,
        'seriesAlternateName': series_info.get('alternateName'),
        'seriesLongAlternateName': series_info.get('longAlternateName'),
        'seriesUnofficialName': series_info.get('unofficialName'),
        'seriesSeason': series_info.get('season'),
        'seriesTypeId': series_info.get('typeId'),
        'seriesEndDate': series_info.get('endDate'),
        'groundName': ground_info.get('name'),
        'groundLongName': ground_info.get('longName'),
        'groundLocation': ground_info.get('location'),
        'team1': team1_name,
        'team2': team2_name,
        'api_name': series_info.get('slug'),  # the series slug is stored as api_name
    })

# Convert the list to a DataFrame
df = pd.DataFrame(all_recent_matches)

# Print the DataFrame
print("DataFrame:", df)

# Save the DataFrame to an Excel file
df.to_excel('Recent_Matches_with_API_Name.xlsx', index=False)


Matches is a dictionary
DataFrame:      matchId seriesAlternateName seriesLongAlternateName seriesUnofficialName  \
0    1410371           T20 Blast                    None                 None   
1    1410372           T20 Blast                    None                 None   
2    1410373           T20 Blast                    None                 None   
3    1410374           T20 Blast                    None                 None   
4    1410375           T20 Blast                    None                 None   
..       ...                 ...                     ...                  ...   
128  1410499           T20 Blast                    None                 None   
129  1410500           T20 Blast                    None                 None   
130  1410501           T20 Blast                    None                 None   
131  1410502           T20 Blast                    None                 None   
132  1423080           T20 Blast                    None                 N

In [19]:
import pandas as pd
import json

# Read the saved page payload back from disk and parse it
with open('combined_output.json', 'r') as file:
    data = json.loads(file.read())

# Identifiers that get stamped onto every over, plus the innings list itself
series_id = data['query']['seriesId']
match_id = data['query']['matchId']
innings = data['props']['appPageProps']['data']['content']['innings']

# Flat list of over dictionaries collected across all innings
all_inningOvers = []

for inning in innings:
    # Innings without an 'inningOvers' key contribute nothing
    if 'inningOvers' not in inning:
        continue
    inningOvers = inning['inningOvers']
    # Tag each over (in place) with the series and match it belongs to
    for over in inningOvers:
        over['seriesId'] = series_id
        over['matchId'] = match_id
    all_inningOvers += inningOvers

# One row per over
df = pd.DataFrame(all_inningOvers)

# Extract bowler details into separate columns
def extract_bowler_info(bowlers_list, key):
    """Join one field from every bowler entry into a comma-separated string.

    Args:
        bowlers_list: List (or tuple) of bowler dicts; any other value, such
            as ``None`` or NaN for a missing cell, is treated as "no bowlers".
        key: Field to read from each bowler dict.

    Returns:
        ``', '``-joined ``str()`` of each bowler's ``key`` value, or ``None``
        when there are no bowler dicts to read from.
    """
    # A plain type check is used instead of pd.notnull(): on a list that
    # function returns an array, whose truth value is ambiguous as soon as an
    # over lists more than one bowler.
    if not isinstance(bowlers_list, (list, tuple)):
        return None
    values = [str(bowler.get(key)) for bowler in bowlers_list if isinstance(bowler, dict)]
    return ', '.join(values) if values else None

# Derive string columns holding the bowler ids and bowler long names
df['bowlerIds'] = df['bowlers'].apply(extract_bowler_info, args=('id',))
df['bowlerLongNames'] = df['bowlers'].apply(extract_bowler_info, args=('longName',))

# The nested 'bowlers' column is no longer needed once flattened
df = df.drop('bowlers', axis=1)

# Put the identifying columns first, keeping the rest in their existing order
leading_columns = ['seriesId', 'matchId', 'bowlerIds', 'bowlerLongNames']
columns_order = leading_columns + [col for col in df.columns if col not in leading_columns]
df = df[columns_order]

# Inspect the DataFrame
print(df)

# Write the result out as CSV (without the index)
df.to_csv('innings_details.csv', index=False)

# Read the CSV back in to verify what was written
df_csv = pd.read_csv('innings_details.csv')
print(df_csv)


   seriesId  matchId bowlerIds      bowlerLongNames  overNumber  overRuns  \
0   1442984  1442987     79067   Dilshan Madushanka           1        13   
1   1442984  1442987     78275      Asitha Fernando           2         9   
2   1442984  1442987    101991   Maheesh Theekshana           3        14   
3   1442984  1442987     78275      Asitha Fernando           4        15   
4   1442984  1442987    101991   Maheesh Theekshana           5         8   
5   1442984  1442987     79067   Dilshan Madushanka           6        15   
6   1442984  1442987     78239    Wanindu Hasaranga           7         7   
7   1442984  1442987     79067   Dilshan Madushanka           8        17   
8   1442984  1442987     78239    Wanindu Hasaranga           9         4   
9   1442984  1442987     78233       Kamindu Mendis          10         9   
10  1442984  1442987     78239    Wanindu Hasaranga          11        11   
11  1442984  1442987    105938  Matheesha Pathirana          12        13   