# Problems 3 - web_scraping_espn_scorecard_to_sqlite

In [1]:
# ! pip install selenium webdriver_manager lxml html5lib beautifulsoup4

In [2]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import sqlite3
import re
import warnings
warnings.filterwarnings("ignore")



In [3]:
URL = "https://www.espncricinfo.com/series/icc-champions-trophy-2024-25-1459031/india-vs-new-zealand-final-1466428/full-scorecard"


In [4]:
print("Attempting to fetch content using Selenium...")

html_content = None

try:
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0")
    
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

    driver.get(URL)
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.ds-w-full.ds-table"))
    )
    html_content = driver.page_source
    print("Content fetched successfully using Selenium!")

except Exception as e:
    print("\nFATAL ERROR during Selenium process:", e)
    exit()

finally:
    if 'driver' in locals():
        driver.quit()

Attempting to fetch content using Selenium...
Content fetched successfully using Selenium!


In [5]:
print("\nExtracting tables from HTML...")

tables = []

if html_content is None:
    print("Fetch failed. No HTML.")
    exit()

soup = BeautifulSoup(html_content, "html.parser")

# Cricinfo Tables are under ds-table, ds-w-full etc.
raw_tables = soup.find_all(
    "table",
    class_=["ds-w-full", "ds-table", "ds-table-xs", "ds-table-bordered"]
)

print(f"Found {len(raw_tables)} raw tables on page.")

# Convert HTML tables → DataFrames
errors = []
for i, table_tag in enumerate(raw_tables):
    html_snippet = str(table_tag)
    # Try lxml first (fast, robust), then html5lib
    try:
        df = pd.read_html(html_snippet, flavor="lxml")[0]
        tables.append(df)  # Select every other row
        continue
    except Exception as e_lxml:
        err_lxml = str(e_lxml)
    try:
        df = pd.read_html(html_snippet, flavor="html5lib")[0]
        tables.append(df)
        continue
    except Exception as e_h5:
        errors.append((i, err_lxml, str(e_h5)))

print(f"Successfully parsed {len(tables)} tables into DataFrames.")
if errors:
    print(f"Failed to parse {len(errors)} tables. Showing first few errors:")
    for i, e1, e2 in errors[:3]:
        print(f"- Table {i} failed. lxml: {e1} | html5lib: {e2}")

print(f"Successfully parsed {len(tables)} tables into DataFrames.")


Extracting tables from HTML...
Found 5 raw tables on page.
Successfully parsed 5 tables into DataFrames.
Successfully parsed 5 tables into DataFrames.


In [6]:
batting_count, bowling_count = 0, 0
batting, bowling = [], []
scorecards = {}

for df in tables:
    cols = [str(c).lower().replace("\n", "").strip() for c in df.columns]

    # Batting tables typically contain ["Batting"]
    if any(col in cols for col in ["batting"]):
        batting_count += 1
        key = f"Innings {batting_count} Batting"
        col_list = {  'Batting': 'Batting_player_name'
                    , 'Unnamed: 1': 'Status_details'
                    , 'R':'runs_scored'
                    , 'B':'balls_faced'
                    , 'M': 'Minutes_played_by_batsmen'
                    , '4s': 'Boundary_fours_hit_by_batsman'
                    , '6s': 'Boundary_sixe_hit_by_batsman'
                    , 'SR': 'strike_rate'
                }
        df = df.drop(columns=['Unnamed: 8', 'Unnamed: 9']).iloc[::2].iloc[:-1]
        df.loc[df['Batting'].str.contains("Extras"), ["B", "M", "4s", "6s", "SR"]] = 0
        df['innings'] = batting_count
        df['team'] = np.where(batting_count == 1, "India", "New Zealand")
        batting.append(df.rename(columns=col_list))
        print(F"{key} cleaning done")

    # Bowling tables contain ['Bowling']
    elif any(col in cols for col in ['bowling']):
        bowling_count += 1
        key = f"Innings {bowling_count} Bowling"
        df['innings'] = bowling_count
        col_list = {  "Unnamed: 1": "Details"
                    , "Bowling": "Bowling_player_name"
                    , "O": "Overs_bowled"
                    , "M": "Maiden_overs_bowled"
                    , "R": "Runs_coceived"
                    , "W": "Wicket_taken"
                    , "ECON": "Economy"
                    , "0s": "Dot_balls"
                    , "4s": "4s_boundaries"
                    , "6s": "6s_boundaries"
                    , "WD": "Wide_balls"
                    , "NB": "No_balls" 
                }
        df = df.drop(columns=["Unnamed: 11"]).dropna().reset_index(drop=True)
        bowling.append(df.rename(columns=col_list))
        print(F"{key} cleaning done")
    
    if len(scorecards) == 4:
        break

batting = pd.concat(batting).reset_index(drop=True)
for col in ['runs_scored', 'balls_faced', 'Minutes_played_by_batsmen', 'Boundary_fours_hit_by_batsman', 'Boundary_sixe_hit_by_batsman', 'strike_rate', 'innings']:
    batting[col] = batting[col].astype(float)
batting['team'] = np.where(batting['innings'] == 2, "India", "New Zealand")    
scorecards["Batting"] = batting
print("Batting concat done")

bowling = pd.concat(bowling).reset_index(drop=True)
for col in ["Overs_bowled", "Maiden_overs_bowled", "Runs_coceived", "Wicket_taken", "Economy", "Dot_balls", "4s_boundaries", "6s_boundaries", "Wide_balls", "No_balls", "innings"]:
    bowling[col] = bowling[col].astype(float)
bowling['team'] = np.where(bowling['innings'] == 1, "India", "New Zealand")    
scorecards["Bowling"] = bowling
print("Bowling concat done")

Innings 1 Batting cleaning done
Innings 1 Bowling cleaning done
Innings 2 Batting cleaning done
Innings 2 Bowling cleaning done
Batting concat done
Bowling concat done


In [7]:
print("\n==================== FINAL SCORECARD ====================")

if len(scorecards) < 2:
    print(f"WARNING: Only {len(scorecards)} of 2 scorecards tables found.")
else:
    print("All 2 scorecard tables extracted successfully!")

print("\n==================== DATA TABLES ========================")

for name, df in scorecards.items():
    print("\n------------------------------------------------------------")
    print(f"TABLE: {name}")
    print("------------------------------------------------------------")
    # Remove first row if it's duplicate header
    clean_df = df.iloc[1:].reset_index(drop=True)
    print(clean_df)

print("\nExtraction completed.")
print("Final tables:", list(scorecards.keys()))


All 2 scorecard tables extracted successfully!


------------------------------------------------------------
TABLE: Batting
------------------------------------------------------------
     Batting_player_name             Status_details  runs_scored  balls_faced  \
0        Rachin Ravindra            b Kuldeep Yadav         37.0         29.0   
1        Kane Williamson        c & b Kuldeep Yadav         11.0         14.0   
2         Daryl Mitchell  c Sharma b Mohammed Shami         63.0        101.0   
3           Tom Latham †               lbw b Jadeja         14.0         30.0   
4         Glenn Phillips                    b Varun         34.0         52.0   
5      Michael Bracewell                    not out         53.0         40.0   
6   Mitchell Santner (c)     run out (Kohli/†Rahul)          8.0         10.0   
7           Nathan Smith                    not out          0.0          1.0   
8                 Extras               (lb 3, w 13)         16.0          0.0   
9  

In [8]:
def clean_df(df):
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [' '.join(map(str, c)).strip() for c in df.columns.values]
    df = df.loc[:, ~df.columns.astype(str).str.match(r"^Unnamed")]
    df.columns = (df.columns.astype(str)
                  .str.strip()
                  .str.replace(r"\s+", "_", regex=True))
    return df

def table_name_from_key(key):
    # e.g., "Innings 1 Batting" -> "innings1_batting"
    return re.sub(r"\s+", "_", key.strip()).lower()

conn = sqlite3.connect("scorecards.db")

for key, df in scorecards.items():
    tname = table_name_from_key(key)
    df = clean_df(df)
    df.to_sql(tname, conn, if_exists="replace", index=False)

conn.close()
print("Created tables:", [table_name_from_key(k) for k in scorecards.keys()])

Created tables: ['batting', 'bowling']


In [9]:
conn = sqlite3.connect("scorecards.db")

tables_names = pd.read_sql_query(
    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name",
    conn
)["name"].tolist()

print("\n==================== DATA TABLES in DB created ====================")
for t in tables_names:
    cnt = pd.read_sql_query(f"SELECT COUNT(*) AS c FROM {t}", conn)["c"].iloc[0]
    data = pd.read_sql_query(f"SELECT * FROM {t}", conn)
    print("\n------------------------------------------------------------")
    print(f"TABLE: {t}")
    print("------------------------------------------------------------")
    print(f"{t}: {cnt} rows\n\n", data)
    
conn.close()



------------------------------------------------------------
TABLE: batting
------------------------------------------------------------
batting: 19 rows

      Batting_player_name             Status_details  runs_scored  balls_faced  \
0             Will Young                lbw b Varun         15.0         23.0   
1        Rachin Ravindra            b Kuldeep Yadav         37.0         29.0   
2        Kane Williamson        c & b Kuldeep Yadav         11.0         14.0   
3         Daryl Mitchell  c Sharma b Mohammed Shami         63.0        101.0   
4           Tom Latham †               lbw b Jadeja         14.0         30.0   
5         Glenn Phillips                    b Varun         34.0         52.0   
6      Michael Bracewell                    not out         53.0         40.0   
7   Mitchell Santner (c)     run out (Kohli/†Rahul)          8.0         10.0   
8           Nathan Smith                    not out          0.0          1.0   
9                 Extras        