In [172]:
import pandas as pd
import requests
from io import StringIO

In [173]:
# Download World Pop from Wikipedia using pd.read_html
#
url = "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
out_csv = r'data\world_pop.csv'

# Add user-agent to request to make it look like a normal browser rather than an unknown client
#
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

response = requests.get(url, headers=headers, timeout=15)

# raise error is response.status is a client error (400's) or server error (500's)
response.raise_for_status()   

# StringIO wraps a text string in an in-memory, file-like object. read_html accepts file paths or file-like objects.
#
dfs = pd.read_html(StringIO(response.text))

# Wikipedia page returns two tables.  Data table for world population is the first.
#
df = dfs[0]

# Export to csv file
#
df.to_csv(out_csv)
df


Unnamed: 0,Country or territory,Population (1 July 2022),Population (1 July 2023),Change (%),UN continental region[1],UN statistical subregion[1]
0,World,8021407192,8091734930,+0.88%,–,–
1,India,1425423212,1438069596,+0.89%,Asia,Southern Asia
2,China[a],1425179569,1422584933,−0.18%,Asia,Eastern Asia
3,United States,341534046,343477335,+0.57%,Americas,Northern America
4,Indonesia,278830529,281190067,+0.85%,Asia,South-eastern Asia
...,...,...,...,...,...,...
233,Montserrat (United Kingdom),4453,4420,−0.74%,Americas,Caribbean
234,Falkland Islands (United Kingdom),3490,3477,−0.37%,Americas,South America
235,Tokelau (New Zealand),2290,2397,+4.67%,Oceania,Polynesia
236,Niue (New Zealand),1821,1817,−0.22%,Oceania,Polynesia


 1. Create a DataFrame from the CSV created in the above cell (out_csv) and assign it to a variable named df_csv. This will allow you to use this cell as a starting point without sending request to the Wikipedia server.

In [174]:
df_csv = pd.read_csv(out_csv)



2. Use head to display the first 3 rows of the DataFrame

In [175]:
df_csv.head(3)

Unnamed: 0.1,Unnamed: 0,Country or territory,Population (1 July 2022),Population (1 July 2023),Change (%),UN continental region[1],UN statistical subregion[1]
0,0,World,8021407192,8091734930,+0.88%,–,–
1,1,India,1425423212,1438069596,+0.89%,Asia,Southern Asia
2,2,China[a],1425179569,1422584933,−0.18%,Asia,Eastern Asia


3. The first row is data for the world.  Remove that row from the DataFrame (df)
 and assign it to the variable df

In [176]:
df_csv.drop(0)

Unnamed: 0.1,Unnamed: 0,Country or territory,Population (1 July 2022),Population (1 July 2023),Change (%),UN continental region[1],UN statistical subregion[1]
1,1,India,1425423212,1438069596,+0.89%,Asia,Southern Asia
2,2,China[a],1425179569,1422584933,−0.18%,Asia,Eastern Asia
3,3,United States,341534046,343477335,+0.57%,Americas,Northern America
4,4,Indonesia,278830529,281190067,+0.85%,Asia,South-eastern Asia
5,5,Pakistan,243700667,247504495,+1.56%,Asia,Southern Asia
...,...,...,...,...,...,...,...
233,233,Montserrat (United Kingdom),4453,4420,−0.74%,Americas,Caribbean
234,234,Falkland Islands (United Kingdom),3490,3477,−0.37%,Americas,South America
235,235,Tokelau (New Zealand),2290,2397,+4.67%,Oceania,Polynesia
236,236,Niue (New Zealand),1821,1817,−0.22%,Oceania,Polynesia


4. Use tail to view the last 5 rows.  

In [177]:
df_csv.tail(5)

Unnamed: 0.1,Unnamed: 0,Country or territory,Population (1 July 2022),Population (1 July 2023),Change (%),UN continental region[1],UN statistical subregion[1]
233,233,Montserrat (United Kingdom),4453,4420,−0.74%,Americas,Caribbean
234,234,Falkland Islands (United Kingdom),3490,3477,−0.37%,Americas,South America
235,235,Tokelau (New Zealand),2290,2397,+4.67%,Oceania,Polynesia
236,236,Niue (New Zealand),1821,1817,−0.22%,Oceania,Polynesia
237,237,Vatican City[w],505,496,−1.78%,Europe,Southern Europe


5. Display count of countries

In [178]:
countries = df_csv.drop(df_csv.index[0])
countries["Country or territory"].count()


np.int64(237)

6. Display only the country names

In [179]:
df_csv["Country or territory"]

0                                  World
1                                  India
2                               China[a]
3                          United States
4                              Indonesia
                     ...                
233          Montserrat (United Kingdom)
234    Falkland Islands (United Kingdom)
235                Tokelau (New Zealand)
236                   Niue (New Zealand)
237                      Vatican City[w]
Name: Country or territory, Length: 238, dtype: object

7. Display only the Country and July 2023 populations

In [180]:
df_csv[["Country or territory", "Population (1 July 2023)"]]

Unnamed: 0,Country or territory,Population (1 July 2023)
0,World,8091734930
1,India,1438069596
2,China[a],1422584933
3,United States,343477335
4,Indonesia,281190067
...,...,...
233,Montserrat (United Kingdom),4420
234,Falkland Islands (United Kingdom),3477
235,Tokelau (New Zealand),2397
236,Niue (New Zealand),1817


8.  Display names of the top five most populous countries

In [181]:
df_csv = df_csv.drop(index=0)
df_csv.sort_values(["Population (1 July 2023)", "Country or territory"], ascending=False).head(5)



Unnamed: 0.1,Unnamed: 0,Country or territory,Population (1 July 2022),Population (1 July 2023),Change (%),UN continental region[1],UN statistical subregion[1]
1,1,India,1425423212,1438069596,+0.89%,Asia,Southern Asia
2,2,China[a],1425179569,1422584933,−0.18%,Asia,Eastern Asia
3,3,United States,341534046,343477335,+0.57%,Americas,Northern America
4,4,Indonesia,278830529,281190067,+0.85%,Asia,South-eastern Asia
5,5,Pakistan,243700667,247504495,+1.56%,Asia,Southern Asia


9. Display the population for the country Réunion (France)

In [182]:
df_csv.loc[df_csv['Country or territory'] == "Réunion (France)", ["Country or territory", "Population (1 July 2023)"]]

Unnamed: 0,Country or territory,Population (1 July 2023)
163,Réunion (France),874883


10. Display the name and populations (2023) for all countries with populations less than 50 million
 and sort from highest to lowest population

In [183]:
df_csv.loc[df_csv["Population (1 July 2023)"] < 50000000, ["Country or territory", "Population (1 July 2023)"]].sort_values(by="Population (1 July 2023)", ascending=False)


Unnamed: 0,Country or territory,Population (1 July 2023)
31,Uganda,48656601
32,Spain[d],47911579
33,Algeria,46164219
34,Argentina,45538401
35,Iraq,45074049
...,...,...
233,Montserrat (United Kingdom),4420
234,Falkland Islands (United Kingdom),3477
235,Tokelau (New Zealand),2397
236,Niue (New Zealand),1817


11. Display the list of Continents

In [184]:
df_csv.value_counts("UN continental region[1]")


UN continental region[1]
Africa      58
Americas    55
Asia        51
Europe      50
Oceania     23
Name: count, dtype: int64

12. Display the total population for each continent

In [185]:
df_csv[["UN continental region[1]", "Population (1 July 2023)"]].groupby("UN continental region[1]").sum()


Unnamed: 0_level_0,Population (1 July 2023)
UN continental region[1],Unnamed: 1_level_1
Africa,1480770521
Americas,1041794259
Asia,4777234490
Europe,745602874
Oceania,45562783


13. Display the number of countries per continent (HINT:  .groupby)

In [186]:
df_csv[["UN continental region[1]", "Country or territory"]].groupby("UN continental region[1]").count()


Unnamed: 0_level_0,Country or territory
UN continental region[1],Unnamed: 1_level_1
Africa,58
Americas,55
Asia,51
Europe,50
Oceania,23
