In [164]:
import pandas as pd
import re

In [3]:
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population')

In [5]:
data

[    Unnamed: 0                 Country / Dependency  Population % of world  \
 0            –                                World  8071066000       100%   
 1            1                                China  1411750000        NaN   
 2            2                                India  1392329000        NaN   
 3            3                        United States   335614000        NaN   
 4            4                            Indonesia   279118866        NaN   
 ..         ...                                  ...         ...        ...   
 237          –                         Tokelau (NZ)        1647        NaN   
 238          –                                 Niue        1549        NaN   
 239        195                         Vatican City         764        NaN   
 240          –  Cocos (Keeling) Islands (Australia)         593        NaN   
 241          –                Pitcairn Islands (UK)          47        NaN   
 
             Date Source (official or from the Uni

In [8]:
len(data)

3

In [7]:
for ind,tab in enumerate(data):
    print('_' * 50)
    print('index =>', ind)
    print(tab)
    

__________________________________________________
index => 0
    Unnamed: 0                 Country / Dependency  Population % of world  \
0            –                                World  8071066000       100%   
1            1                                China  1411750000        NaN   
2            2                                India  1392329000        NaN   
3            3                        United States   335614000        NaN   
4            4                            Indonesia   279118866        NaN   
..         ...                                  ...         ...        ...   
237          –                         Tokelau (NZ)        1647        NaN   
238          –                                 Niue        1549        NaN   
239        195                         Vatican City         764        NaN   
240          –  Cocos (Keeling) Islands (Australia)         593        NaN   
241          –                Pitcairn Islands (UK)          47        NaN   

 

In [10]:
population = data[0]

In [12]:
list(population.columns)

['Unnamed: 0',
 'Country / Dependency',
 'Population',
 '% of world',
 'Date',
 'Source (official or from the United Nations)',
 'Unnamed: 6']

In [13]:
population['Unnamed: 6']

0       NaN
1       [b]
2       [c]
3       [d]
4       NaN
       ... 
237     NaN
238     NaN
239    [af]
240     NaN
241     NaN
Name: Unnamed: 6, Length: 242, dtype: object

In [16]:
population.shape

(242, 7)

In [14]:
population.isna().sum()

Unnamed: 0                                        0
Country / Dependency                              0
Population                                        0
% of world                                      241
Date                                              0
Source (official or from the United Nations)      0
Unnamed: 6                                      211
dtype: int64

In [23]:
# Dropping the useless columns
population = population.drop(['Unnamed: 0','Unnamed: 6'], axis=1)

In [154]:
population.columns.values.tolist()

['Country / Dependency',
 'Population',
 '% of world',
 'Date',
 'Source (official or from the United Nations)']

In [26]:
population.head()

Unnamed: 0,Country / Dependency,Population,% of world,Date,Source (official or from the United Nations)
0,World,8071066000,100%,11 Nov 2023,UN projection[3]
1,China,1411750000,,31 Dec 2022,Official estimate[4]
2,India,1392329000,,1 Mar 2023,Official projection[5]
3,United States,335614000,,11 Nov 2023,National population clock[7]
4,Indonesia,279118866,,1 Jul 2023,National annual projection[8]


In [None]:
# we would be filling the column '% of world' with % of World
# percent = (population['Population'] * 100)/world_population

In [146]:
world_population = population.loc[population['Country / Dependency'] == 'World', 'Population']
world_population

0    8071066000
Name: Population, dtype: int64

In [148]:
def percent_population(df):
    df.percent = (df['Population'] * 100)/world_population
    return df.percent

In [152]:
population['% of world'] = population.apply(percent_population, axis=1)

In [153]:
population.head()

Unnamed: 0,Country / Dependency,Population,% of world,Date,Source (official or from the United Nations)
0,World,8071066000,100.0,11 Nov 2023,UN projection[3]
1,China,1411750000,17.491494,31 Dec 2022,Official estimate[4]
2,India,1392329000,17.250869,1 Mar 2023,Official projection[5]
3,United States,335614000,4.158236,11 Nov 2023,National population clock[7]
4,Indonesia,279118866,3.458265,1 Jul 2023,National annual projection[8]


In [169]:
# defining our functions
def remove_brackets_values(cell):
    if isinstance(cell, str):
        return re.sub(r'\[.*?\]', '', cell)
    return cell

In [180]:
# using the function to apply the custom function on each cell
population = population.applymap(remove_brackets_values)

In [181]:
population.head()

Unnamed: 0,Country / Dependency,Population,% of world,Date,Source (official or from the United Nations)
0,World,8071066000,100.0,11 Nov 2023,UN projection
1,China,1411750000,17.491494,31 Dec 2022,Official estimate
2,India,1392329000,17.250869,1 Mar 2023,Official projection
3,United States,335614000,4.158236,11 Nov 2023,National population clock
4,Indonesia,279118866,3.458265,1 Jul 2023,National annual projection


In [182]:
population.to_csv('The World popultion from wikipedia.csv')