# Day_22 : Python web scraping

### Exercises

In [6]:
# Scraping a website and storing the data as a JSON file

import requests
from bs4 import BeautifulSoup
import json

# Page whose paragraph text is scraped.
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

# Fetch the HTML content of the page. An explicit timeout is passed because
# requests waits indefinitely by default, which would hang the cell on an
# unresponsive server.
response = requests.get(url, timeout=30)

# Fail loudly on 4xx/5xx responses rather than scraping and saving the text
# of an error page as if it were the real content.
response.raise_for_status()
html_content = response.content

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Collect the text of every <p> element, in document order.
data = {'paragraphs': [p.get_text() for p in soup.find_all('p')]}

# Store the data as JSON. json.dump escapes non-ASCII characters by default,
# so the explicit encoding only makes the write independent of the locale.
with open('scraped_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=2)

print('Data scraped and stored as "scraped_data.json"')


Data scraped and stored as "scraped_data.json"


In [5]:
# Extracting the first table from a web page and saving it as a JSON file


import requests
from bs4 import BeautifulSoup
import json

# URL of the webpage containing the table
url = "https://archive.ics.uci.edu/dataset/602/dry+bean+dataset.php"

# Send a GET request to the URL. An explicit timeout is passed because
# requests waits indefinitely by default.
response = requests.get(url, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Take the first <table> on the page; find() returns None when there is
    # none, so that case is handled explicitly instead of crashing below.
    table = soup.find('table')

    if table is None:
        print("No <table> element was found on the page; nothing was saved.")
    else:
        # One inner list per <tr>, holding the stripped text of each
        # header (<th>) or data (<td>) cell in that row.
        table_data = [
            [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')
        ]

        # Convert the table data to a JSON format
        json_data = json.dumps(table_data, indent=2)

        # Save the JSON data to a file
        with open('table_data.json', 'w', encoding='utf-8') as json_file:
            json_file.write(json_data)

        print("Table data has been extracted and saved to 'table_data.json'.")
else:
    print("Failed to retrieve the webpage. Check the URL or try again later.")


Table data has been extracted and saved to 'table_data.json'.


In [4]:
# Scraping the presidents table and storing the data 


import requests
from bs4 import BeautifulSoup
import json

url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'

# Fetch the HTML content of the page. An explicit timeout is passed because
# requests waits indefinitely by default; raise_for_status() stops the cell
# on a 4xx/5xx response instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find the first table carrying the "wikitable" class; find() returns None
# when no such table exists.
presidents_table = soup.find('table', {'class': 'wikitable'})
if presidents_table is None:
    raise ValueError(f'No table with class "wikitable" found at {url}')

# Keys of each output record, matched positionally to the first five cells
# of a table row.
field_names = ('Number', 'President', 'Presidency', 'Term', 'Party')

# Initialize empty list to store presidents' data
presidents_data = []

# Iterate over rows in the table
for row in presidents_table.find_all('tr')[1:]:  # Skip the header row
    columns = row.find_all(['th', 'td'])

    # Rows with fewer cells than fields (e.g. continuation rows produced by
    # rowspan, or extra header rows) cannot be mapped positionally; skip them
    # rather than raising IndexError.
    if len(columns) < len(field_names):
        continue

    # zip() stops after the last field name, so any additional trailing
    # cells in the row are ignored.
    president_info = {
        field: cell.get_text(strip=True)
        for field, cell in zip(field_names, columns)
    }

    # Append the dictionary to the list
    presidents_data.append(president_info)

# Store the data as JSON
with open('presidents_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(presidents_data, json_file, indent=2)

print('Presidents data scraped and stored as "presidents_data.json"')


Presidents data scraped and stored as "presidents_data.json"
