# USA Poverty Data 

This notebook investigates the percentage and the number of people living below the poverty line in the USA, for later comparison with other variables. The two graphs show the percentage of people below poverty and the number of persons below poverty, respectively.

In [58]:
import requests
import re
import pandas as pd

import altair as alt
import pandas as pd

# Import the libraries used throughout the notebook

In [59]:

# Scrape the first HTML table on the ASPE poverty statistics page into `df`
# with columns: Year (4-character string), Persons Below Poverty (numeric),
# Percent Below Poverty (numeric).
#
# NOTE: `df` is only assigned on the success path. If the request fails or no
# table is found, a message is printed and `df` is left untouched, so later
# cells that use `df` will not be working with freshly scraped data.
url = "https://aspe.hhs.gov/information-poverty-income-statistics-tables"

# Upper bound (in seconds) on how long to wait for the server. Without a
# timeout, requests.get can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30

# Sending a request to the URL to scrape the data
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

if response.status_code == 200:
    # Finding the first table in the website
    match = re.search(r'<table.*?>(.*?)</table>', response.text, re.DOTALL)

    # Checking if the table was found
    if match:
        table_html = match.group(1)

        # Using a list of lists to store the extracted data:
        # one inner list of cell texts per <tr> row.
        rows = re.findall(r'<tr.*?>(.*?)</tr>', table_html, re.DOTALL)
        data_list = []
        for row in rows:
            # Header (<th>) and data (<td>) cells are both captured
            columns = re.findall(r'<t[dh].*?>(.*?)</t[dh]>', row, re.DOTALL)
            # Drop any tags nested inside a cell and trim surrounding whitespace
            data = [re.sub(r'<.*?>', '', col).strip() for col in columns]
            data_list.append(data)

        # Create a dataframe
        df = pd.DataFrame(data_list, columns=["Year", "Persons Below Poverty", "Percent Below Poverty"])

        # Keep only rows whose 'Year' cell starts with four digits, so header
        # rows and other non-year rows are dropped. `.copy()` makes the
        # filtered frame independent of the original, so the column
        # assignments below do not trigger SettingWithCopyWarning.
        df = df[df["Year"].str.match(r'^\d{4}', na=False)].copy()
        # Keep just the leading four characters of each 'Year' value
        df["Year"] = df["Year"].str[:4]

        # Converting columns to numeric and removing the commas if there are any.
        # regex=False treats ',' as a literal; errors='coerce' turns anything
        # that still is not a number into NaN instead of raising.
        df["Persons Below Poverty"] = pd.to_numeric(
            df["Persons Below Poverty"].str.replace(',', '', regex=False), errors='coerce'
        )
        df["Percent Below Poverty"] = pd.to_numeric(
            df["Percent Below Poverty"].str.replace(',', '', regex=False), errors='coerce'
        )

    else:
        print("No table found on the page.")

else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


In [60]:
pip install altair



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [61]:

# Line chart of the 'Percent Below Poverty' column against 'Year'
percent_axis = alt.Y('Percent Below Poverty:Q', title='Percent Below Poverty')
percent_tooltip = alt.Tooltip('Percent Below Poverty:Q', title='Percent Below Poverty')

line_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x='Year:T',
        y=percent_axis,
        tooltip=['Year:T', percent_tooltip],
    )
    .properties(title='Percent Below Poverty in the USA from 2000-2011')
)

# Last expression of the cell: render the chart with interactivity enabled
line_chart.interactive()


In [62]:
import altair as alt
import pandas as pd

# Line chart of the 'Persons Below Poverty' column against 'Year'.
line_chart = alt.Chart(df).mark_line().encode(
    x='Year:T',
    # The axis title must name the plotted column (persons, not percent)
    y=alt.Y('Persons Below Poverty:Q', title='Persons Below Poverty'),
    tooltip=['Year:T', alt.Tooltip('Persons Below Poverty:Q', title='Persons Below Poverty')]
).properties(
    title='Persons Below Poverty in the USA from 2000-2011'
)

# Last expression of the cell: render the chart with interactivity enabled
line_chart.interactive()

In [63]:
# Save the dataframe to a CSV file, leaving out the index column
output_csv = 'poverty.csv'
df.to_csv(output_csv, index=False)