# Global Population Predictor Data
This data was scraped to obtain projected global populations, broken down by country, with projections for 2050, 2075 and 2100. This portion is added to the global data already collected.

In [1]:
import requests
import re
import pandas as pd

# Import the libraries needed for scraping (requests, re) and tabulating (pandas) the data

In [2]:
# Scrape the first HTML table on the INED "projections by countries" page into
# a DataFrame, drop its first two columns and convert the projection-year
# columns to floats.
#
# Every failure mode raises immediately. Previously a failure only printed a
# message, after which the cell died with a NameError on `df` (or, worse,
# silently reused a stale `df` left over in the kernel).

#Providing the URL for data extraction
url = "https://www.ined.fr/en/everything_about_population/data/world-projections/projections-by-countries/"
# A timeout stops the notebook from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the webpage. Status code: {response.status_code}"
    )

html_content = response.text

# Isolate the first <table>...</table> element of the page.
table_pattern = re.compile(r'<table[^>]*>.*?</table>', re.DOTALL)
table_match = table_pattern.search(html_content)
if table_match is None:
    raise ValueError("Table not found.")
table_html = table_match.group(0)

#Dividing the columns in the table
# `\b` stops the header pattern from also matching <thead>, which would
# otherwise swallow markup into the first captured column name.
header_pattern = re.compile(r'<th\b[^>]*>(.*?)</th>', re.DOTALL)
variables = header_pattern.findall(table_html)

cell_pattern = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)
cells = cell_pattern.findall(table_html)

if not variables or not cells:
    raise ValueError("Variable names or data not found in the table.")

#Arranging the length of variables in the table for cohesion
# The flat list of cells is cut into rows of len(variables) cells each.
data = [cells[i:i + len(variables)] for i in range(0, len(cells), len(variables))]
df = pd.DataFrame(data, columns=variables)

# The first two scraped columns are not used by the rest of the notebook.
df = df.drop(df.columns[[0, 1]], axis=1)

# converting columns to numeric
for col in ['2050', '2075', '2100']:
    # Data has non standard space breaks which is what the str.replace accounts for
    df[col] = df[col].str.replace('\u202f', '').astype(float)
df

# ChatGPT was used to help edit the code when the original version did not produce output divided into columns.


Unnamed: 0,Region,2050,2075,2100
0,Central and Southern Asia,1668475.0,1677814.0,1533400.0
1,Eastern Asia,1316946.0,1034617.0,771301.0
2,Northern America,375085.0,389171.0,393993.0
3,Western Africa,374711.0,491187.0,545706.0
4,Central and Southern Asia,365678.0,452094.0,486772.0
5,South-Eastern Asia,316968.0,316042.0,297128.0
6,South America,230972.0,214277.0,185102.0
7,Middle Africa,215056.0,336841.0,430995.0
8,Eastern Africa,213190.0,283418.0,323283.0
9,Central and Southern Asia,203638.0,202222.0,176964.0


Simplifying the data to show the total projected population by region.

In [3]:
# Sum the projection columns per region (groupby sorts the region labels),
# then discard the last group by position.
# NOTE(review): the dropped last row is presumably a non-region/aggregate
# entry — confirm against the scraped data.
region_pop_df = df.groupby('Region').sum().iloc[:-1]
region_pop_df

Unnamed: 0_level_0,2050,2075,2100
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Central America,143734.0,136563.0,116090.0
Central and Southern Asia,2455670.0,2572584.0,2438844.0
Eastern Africa,627144.0,855777.0,1000825.0
Eastern Asia,1467075.0,1154501.0,869391.0
Eastern Europe,133354.0,120324.0,112204.0
Middle Africa,337499.0,515092.0,650394.0
Northern Africa,348089.0,420121.0,458585.0
Northern America,420886.0,439494.0,447825.0
Northern Europe,71660.0,71624.0,70511.0
South America,381450.0,363846.0,320918.0


Generic pie chart method

In [4]:
import altair as alt

def create_pie_chart(year):
    """Build a pie (arc) chart of the regional population split for one year.

    Reads the module-level ``region_pop_df``, whose index holds the region
    names and whose columns are labelled by projection year.

    Parameters
    ----------
    year
        Column label in ``region_pop_df`` to plot (e.g. ``'2050'``); it is
        also interpolated into the chart title.

    Returns
    -------
    The Altair chart with one arc per region, sized by population.
    """
    # Reduce the regional table to two columns: Region and that year's values.
    yearly = region_pop_df.reset_index()
    yearly = yearly[['Region', year]]
    yearly = yearly.rename(columns={year: 'Population'})

    # Arc angle follows the population; colour identifies the region.
    angle = alt.Theta(field="Population", type="quantitative")
    slice_color = alt.Color(field="Region", type="nominal")

    pie = alt.Chart(yearly).mark_arc()
    pie = pie.encode(
        theta=angle,
        color=slice_color,
        tooltip=['Region', 'Population'],
    )
    return pie.properties(title=f"Population Distribution in {year}")

In [5]:
# Build one pie chart per projection year.
chart_2050, chart_2075, chart_2100 = (
    create_pie_chart(projection_year)
    for projection_year in ('2050', '2075', '2100')
)

# Concatenate the three charts horizontally so they can be compared at a glance.
chart_2050 | chart_2075 | chart_2100

Saving CSV

In [6]:
# Write the regional totals (region index plus one column per year) to CSV.
# The path is relative to the notebook's working directory.
region_pop_df.to_csv(path_or_buf="../Data/Globalpopulation.csv")