#### **MINI PROJECT: Using BeautifulSoup, Requests, and pandas**
- In this mini project, we explore how to use the Python library **Beautiful Soup** for **HTML parsing**.
- The project involves scraping data from a public website, organizing the extracted information into a structured DataFrame using **pandas**, and finally exporting the data to **CSV** and **Excel** files.
- This hands-on task helps in understanding the fundamentals of web scraping, data cleaning, and basic data handling in Python.


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
# Candidate pages for this scraping exercise. Only `url3` is active; the
# commented-out entries are alternatives that are not used in this section.
# url1 = "https://www.iplt20.com/points-table/"
# url2 = "https://webscraper.io/test-sites/tables/tables-semantically-correct"
url3 = 'https://www.icc-cricket.com/rankings/batting/mens/odi'
# url4 = "https://www.bbc.com/sport/football/premier-league/table"
# url5 = "https://www.espncricinfo.com/records/trophy/batting-most-runs-career/world-cup-12"

Target URL: <https://www.icc-cricket.com/rankings/batting/mens/odi>

In [5]:
# Download the rankings page and probe it for the first ranking row.
try:
    # requests applies no timeout by default, so an unresponsive server would
    # block this cell forever; 10 seconds bounds the connect and read waits.
    r = requests.get(url3, timeout=10)
    # Turn 4xx/5xx responses into HTTPError so they reach the handler below.
    r.raise_for_status()
except requests.exceptions.HTTPError as errh:
    print ("Http Error:",errh)
except requests.exceptions.ConnectionError as errc:
    print ("Error Connecting:",errc)
except requests.exceptions.Timeout as errt:
    print ("Timeout Error:",errt)
except requests.exceptions.RequestException as err:
    # Base class of the exceptions above: catches anything else requests raises.
    print ("OOps: Something Else",err)
else:
    soup = BeautifulSoup(r.content, 'html.parser')
    print('OK !')
    # find() returns None when the downloaded markup contains no such element.
    # NOTE(review): the recorded output is None, presumably because the table
    # is rendered client-side and is absent from the raw response - confirm.
    row = soup.find('div', class_='si-table-row')
    print(row)


OK !
None


In [9]:
from bs4 import BeautifulSoup
import pandas as pd

# Column accumulators: each list receives exactly one entry per ranking row,
# so they stay the same length and can be combined into a DataFrame below.
ranking = []
teams = []
players_name = []
ratings = []
career_bests = []

# Parse a locally saved copy of the rankings page.
try:
    with open("db/html_content.html", "r", encoding="utf-8") as f:
        html = f.read()
except (OSError, UnicodeDecodeError) as err:
    # Missing/unreadable file or invalid UTF-8: report it and leave the lists empty.
    print("File Read Error:", err)
else:
    soup = BeautifulSoup(html, "html.parser")

    # Find all player rows
    rows = soup.find_all("div", class_="si-table-row")

    for row in rows:
        # CSS selectors are used for elements carrying several classes because
        # find(class_="a b") only matches that exact attribute string, whereas
        # "div.a.b" matches regardless of class order or extra classes.

        # Position (a single selector also covers a missing inner <span>)
        pos_tag = row.select_one("div.si-table-data.si-pos span.si-text")
        position = pos_tag.get_text(strip=True) if pos_tag else ""
        ranking.append(position)

        # Team, formatted as "SHORT (Full name)"
        team_wrap = row.select_one("div.si-table-data.si-team")
        short_tag = team_wrap.select_one("span.si-sname") if team_wrap else None
        full_tag = team_wrap.select_one("span.si-fname") if team_wrap else None
        short = short_tag.get_text(strip=True) if short_tag else ""
        full = full_tag.get_text(strip=True) if full_tag else ""
        # Only build the label when both parts exist, so a partially missing
        # team never produces an artefact such as " ()".
        team = f"{short} ({full})" if short and full else ""
        teams.append(team)

        # Player Name extraction
        player_div = row.select_one("div.si-player")
        fname_tag = player_div.select_one("span.si-fname") if player_div else None
        lname_tag = player_div.select_one("span.si-lname") if player_div else None
        first = fname_tag.get_text(strip=True) if fname_tag else ""
        last = lname_tag.get_text(strip=True) if lname_tag else ""
        player_name = f"{first} {last}".strip()
        players_name.append(player_name)

        # Rating: stored as int; falls back to "" when missing or non-numeric
        rating_tag = row.select_one("div.si-table-data.si-rating")
        rating_text = rating_tag.get_text(strip=True) if rating_tag else ""
        try:
            rating = int(rating_text)
        except ValueError:
            rating = ""
        ratings.append(rating)

        # Career Best
        best_tag = row.select_one("div.si-best")
        best = best_tag.get_text(strip=True) if best_tag else ""
        career_bests.append(best)

# Convert to DataFrame
odi_men_ranking = {
    "Rank": ranking,
    "Player": players_name,
    "Team": teams,
    "Rating": ratings,
    "Career Best": career_bests
}

df = pd.DataFrame(odi_men_ranking)

filename = "icc_mens_odi_ranking_2025.csv"
excel_filename = "icc_mens_odi_ranking_2025.xlsx"

if df.empty:
    # Either the HTML file could not be read or it contained no ranking rows;
    # do not write empty files or claim success.
    print("No ranking rows were extracted; nothing was saved.")
else:
    # Save to CSV
    df.to_csv("db/" + filename, index=False)
    print(f"\n🎉 CSV File '{filename}' has been saved successfully!\n")

    # Save to Excel
    df.to_excel("db/" + excel_filename, index=False)
    print(f"📘 Excel file '{excel_filename}' has also been saved successfully!")



🎉 CSV File 'icc_mens_odi_ranking_2025.csv' has been saved successfully!

📘 Excel file 'icc_mens_odi_ranking_2025.xlsx' has also been saved successfully!


In [8]:
from bs4 import BeautifulSoup
import pandas as pd

# One independent accumulator list per output column
ranking, teams, players_name, ratings, career_bests = [], [], [], [], []

def extract_text(parent, selector):
    """Return the stripped text of the first element under `parent` matching `selector`.

    The lookup is done with `parent.select_one(selector)`. When nothing
    matches, an empty string is returned instead of raising.
    """
    match = parent.select_one(selector)
    if not match:
        return ""
    return match.get_text(strip=True)

# Load and parse the locally saved copy of the rankings page
try:
    with open("db/html_content.html", "r", encoding="utf-8") as f:
        html = f.read()
except (OSError, UnicodeDecodeError) as err:
    # Missing/unreadable file or invalid UTF-8: report it; the lists stay as they were.
    print("File Read Error:", err)
else:
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.find_all("div", class_="si-table-row")

    for row in rows:
        # Extract each field using CSS selectors
        position = extract_text(row, "div.si-table-data.si-pos span.si-text")
        team_short = extract_text(row, "div.si-team span.si-sname")
        team_full = extract_text(row, "div.si-team span.si-fname")
        team = f"{team_short} ({team_full})" if team_short and team_full else ""
        first_name = extract_text(row, "div.si-player span.si-fname")
        last_name = extract_text(row, "div.si-player span.si-lname")
        best = extract_text(row, "div.si-best")

        # extract_text returns "" for a missing cell, which int() rejects;
        # fall back to "" so one incomplete row cannot abort the whole loop.
        rating_text = extract_text(row, "div.si-table-data.si-rating")
        try:
            rating = int(rating_text)
        except ValueError:
            rating = ""

        # Append to lists (one entry per list per row keeps them aligned)
        ranking.append(position)
        teams.append(team)
        players_name.append(f"{first_name} {last_name}".strip())
        ratings.append(rating)
        career_bests.append(best)

# ✅ Convert to DataFrame
df = pd.DataFrame({
    "Rank": ranking,
    "Player": players_name,
    "Team": teams,
    "Rating": ratings,
    "Career Best": career_bests
})

# ✅ Display sample (last expression of the cell, so it is rendered as output)
df

# ✅ Optional: Save to CSV
# df.to_csv("odi_rankings.csv", index=False)


Unnamed: 0,Rank,Player,Team,Rating,Career Best
0,01,Shubman Gill,IND (India),784,847 v Australia at Indore 2023
1,02,Babar Azam,PAK (Pakistan),766,898 v West Indies at Multan 2022
2,03,Rohit Sharma,IND (India),756,882 v Sri Lanka at Headingley 2019
3,04,Virat Kohli,IND (India),736,909 v England at Headingley 2018
4,05,Daryl Mitchell,NZ (New Zealand),720,751 v India at Mumbai 2023
...,...,...,...,...,...
95,=,Milind Kumar,USA (USA),420,442 v Oman at Lauderhill 2025
96,97,Mohammad Nadeem,OMN (Oman),419,430 v Zimbabwe at Bulawayo 2023
97,=,Abdullah Shafique,PAK (Pakistan),419,479 v Bangladesh at Kolkata 2023
98,99,Teja Nidamanuru,NED (Netherlands),417,443 v West Indies at Harare 2023
