In [1]:
!pip install requests
!pip install scrapy
!pip install re
!pip install pandas

Collecting scrapy
  Using cached Scrapy-2.11.1-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=18.9.0 (from scrapy)
  Using cached twisted-24.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Using cached cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Using cached itemloaders-1.2.0-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Using cached parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Using cached queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Using cached service_identity-24.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Using cached w3lib-2.1.2-py3-none-any.whl.metadata (1.1 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Using cached zope.interface-6.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [2]:
import requests
from scrapy import Selector
import re
import pandas as pd

In [3]:
def _percent_to_fraction(text):
    """Convert a percentage string such as ``"9.3"`` into a fraction (``9.3 / 100``).

    Returns ``None`` when ``text`` is not a plain number (for example the
    ``"N/A"`` placeholder used for missing table cells), so that one
    unparsable cell does not abort the whole scrape.
    """
    try:
        return float(text) / 100
    except ValueError:
        return None


def fetch_smoking_rates_abs(timeout=30):
    """Scrape the ABS table of current daily smokers by age group and sex.

    Downloads the ABS "smoking" latest-release page, locates the table whose
    caption mentions "Proportion of people 15 years and over who were current
    daily smokers by age and sex, 2022", prints one summary line per table
    row and finally prints the collected dictionary.

    Args:
        timeout: Seconds to wait for the HTTP request before ``requests``
            gives up. Without a timeout the call can block indefinitely.

    Returns:
        A dict mapping each age-group label (the row header text) to
        ``{"Males": fraction, "Females": fraction}``, where each fraction is
        the table percentage divided by 100, or ``None`` when the cell is
        missing or not numeric. Returns ``None`` when the HTTP status code
        is not 200.
    """
    # URL of the page to scrape
    url = "https://www.abs.gov.au/statistics/health/health-conditions-and-risks/smoking/latest-release"

    # Send the HTTP request; the timeout keeps the notebook from hanging on a dead connection
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to retrieve data. Status code:", response.status_code)
        return None

    # Use Scrapy's Selector to parse the HTML content
    selector = Selector(text=response.text)

    # Select the body rows of the table identified by its caption text
    rows = selector.xpath('//table[caption[contains(text(), "Proportion of people 15 years and over who were current daily smokers by age and sex, 2022")]]/tbody/tr')
    return_dict = {}
    for row in rows:
        # Missing cells fall back to the "N/A" placeholder instead of None
        age_group = row.xpath('.//th/text()').get(default="N/A").strip()
        males_percentage = row.xpath('.//td[1]/text()').get(default="N/A").strip()
        males_low_ci = row.xpath('.//td[2]/text()').get(default="N/A").strip()
        males_high_ci = row.xpath('.//td[3]/text()').get(default="N/A").strip()
        females_percentage = row.xpath('.//td[4]/text()').get(default="N/A").strip()
        females_low_ci = row.xpath('.//td[5]/text()').get(default="N/A").strip()
        females_high_ci = row.xpath('.//td[6]/text()').get(default="N/A").strip()

        print(f"{age_group}: Males {males_percentage}% (CI {males_low_ci}-{males_high_ci}), Females {females_percentage}% (CI {females_low_ci}-{females_high_ci})")
        # Non-numeric or missing percentages are stored as None rather than raising ValueError
        return_dict[age_group] = {
            "Males": _percent_to_fraction(males_percentage),
            'Females': _percent_to_fraction(females_percentage),
        }

    print("\n")
    print(return_dict)
    print("\n")

    # Hand the data back so later cells can use it instead of only seeing printed text
    return return_dict

In [4]:
# Run the ABS scraper; it prints one line per age group and then the collected rates.
fetch_smoking_rates_abs()

15–17(a): Males 1.2% (CI N/A-N/A), Females 1.8% (CI N/A-N/A)
18–24: Males 9.3% (CI 6.4-12.2), Females 5.9% (CI 2.4-9.4)
25–34: Males 13.4% (CI 10.2-16.6), Females 8.8% (CI 6.5-11.1)
35–44: Males 13.5% (CI 10.5-16.5), Females 8.5% (CI 6.5-10.5)
45–54: Males 15.3% (CI 12.3-18.3), Females 11.6% (CI 8.5-14.7)
55–64: Males 17.4% (CI 13.8-21.0), Females 12.0% (CI 10.3-13.7)
65–74: Males 9.9% (CI 7.4-12.4), Females 7.9% (CI 5.8-10.0)
75 years and over: Males 3.8% (CI 1.1-6.5), Females 1.9% (CI 0.7-3.1)


{'15–17(a)': {'Males': 0.012, 'Females': 0.018000000000000002}, '18–24': {'Males': 0.09300000000000001, 'Females': 0.059000000000000004}, '25–34': {'Males': 0.134, 'Females': 0.08800000000000001}, '35–44': {'Males': 0.135, 'Females': 0.085}, '45–54': {'Males': 0.153, 'Females': 0.11599999999999999}, '55–64': {'Males': 0.174, 'Females': 0.12}, '65–74': {'Males': 0.099, 'Females': 0.079}, '75 years and over': {'Males': 0.038, 'Females': 0.019}}




Next: the CDC data (US adult cigarette-smoking rates by sex and by age group).

In [5]:
def _first_decimal(text):
    """Return the first decimal number (digits, dot, digits) found in ``text`` as a float.

    Returns ``None`` when ``text`` is empty/``None`` or contains no such number.
    """
    if not text:
        return None
    match = re.search(r"(\d+\.\d+)", text)
    return float(match.group(1)) if match else None


def fetch_smoking_rates_cdc(timeout=30):
    """Scrape adult cigarette-smoking rates by age group from the CDC fact sheet.

    Reads the rate quoted in the list items mentioning "adult men" and
    "adult women", derives ``correction_factor = male_rate / female_rate``
    (1 when the female rate is 0), then reads the rate for every age-group
    list item under the "By Age" heading.

    Args:
        timeout: Seconds to wait for the HTTP request before ``requests``
            gives up. Without a timeout the call can block indefinitely.

    Returns:
        A dict mapping each age-range label (e.g. ``"18–24 years"``) to
        ``{"female_rate": rate, "male_rate": rate * correction_factor}``.
        List items without a recognisable age range or rate are skipped.
        Returns ``None`` when the HTTP status code is not 200 or when the
        men/women rates cannot be found on the page.
    """
    url = "https://www.cdc.gov/tobacco/data_statistics/fact_sheets/adult_data/cig_smoking/index.htm"

    # The timeout keeps the notebook from hanging on a dead connection
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to retrieve data. Status code:", response.status_code)
        return None

    selector = Selector(text=response.text)

    # Extract smoking rates by sex; either lookup may come back empty if the page layout changes
    male_rate = _first_decimal(selector.xpath("//li[contains(text(), 'adult men')]/text()").get())
    female_rate = _first_decimal(selector.xpath("//li[contains(text(), 'adult women')]/text()").get())
    if male_rate is None or female_rate is None:
        print("Failed to parse the adult men/women smoking rates from the page.")
        return None

    print(f"male rate: {male_rate}")
    print(f"female rate: {female_rate}")

    # Calculate the correction factor for men's smoking rates (guarding the division by zero)
    correction_factor = male_rate / female_rate if female_rate != 0 else 1

    # Extract smoking rates by age group
    age_groups = selector.xpath("//div[h4[contains(text(), 'By Age')]]/following-sibling::div//ul/li")
    age_smoking_rates = {}
    for group in age_groups:
        # Only the first text node of the list item is inspected
        details = group.xpath(".//text()").get()
        if not details:
            continue

        age_range_match = re.search(r"(\d+[\–\-]\d+ years|\d+ years and older)", details)
        age_range_rate = _first_decimal(details)
        if age_range_match is None or age_range_rate is None:
            # Not an age-group entry, or no rate quoted: skip instead of crashing
            continue

        age_range = age_range_match.group(1)
        print(f"age group-{age_range}: rate-{age_range_rate}")

        # NOTE(review): the scraped age-group rate is stored unchanged as the female rate and
        # scaled by male/female for men; presumably the page quotes a combined rate for both
        # sexes, in which case this overstates both values -- confirm the intended model.
        age_smoking_rates[age_range] = {
            'female_rate': age_range_rate,
            'male_rate': age_range_rate * correction_factor
        }

    return age_smoking_rates

In [6]:
# Scrape the CDC figures and, when anything came back, show the dictionary padded with blank lines.
smoking_rates = fetch_smoking_rates_cdc()
if smoking_rates:
    print(f"\n\n{smoking_rates}\n\n")

male rate: 13.1
female rate: 10.1
age group-18–24 years: rate-5.3
age group-25–44 years: rate-12.6
age group-45–64 years: rate-14.9
age group-65 years and older: rate-8.3


{'18–24 years': {'female_rate': 5.3, 'male_rate': 6.874257425742575}, '25–44 years': {'female_rate': 12.6, 'male_rate': 16.342574257425742}, '45–64 years': {'female_rate': 14.9, 'male_rate': 19.325742574257426}, '65 years and older': {'female_rate': 8.3, 'male_rate': 10.765346534653467}}


