# I. Extract
We will extract the population, fertility rate and urban population for each Region of the world (Africa, Asia, Europe, South America, North America, Oceania)

##### For the scraping, we will use Selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.webdriver.common.by import By

### Chrome webDriver

In [5]:
#Chrome webDriver
driver = webdriver.Chrome('C:/Users/Amal/Desktop/chromedriver')


  driver = webdriver.Chrome('C:/Users/Amal/Desktop/chromedriver')


### Scraping Function

In [6]:
import numpy as np
def scrapingByRegion(url):
    driver.get(url)
    #years
    years = driver.find_elements(by=By.XPATH, value='//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr/td[1]')
    years_list = []
    for y in range(len(years)):
        years_list.append(years[y].text)
    #print(years_list)
    
    #population
    population = driver.find_elements(by=By.XPATH, value='//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr/td[2]')
    population_list = []
    for p in range(len(population)):
        population_list.append(population[p].text)
    #print(population_list)
    
    #Fertility Rate
    fertilityRate = driver.find_elements(by=By.XPATH, value='//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr/td[7]')
    fertilityRate_list = []
    for f in range(len(fertilityRate)):
        fertilityRate_list.append(fertilityRate[f].text)
    #print(fertilityRate_list)
    
    #Urban Population
    urbanPopulation = driver.find_elements(by=By.XPATH, value='//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr/td[10]')
    urbanPopulation_list = []
    for up in range(len(urbanPopulation)):
        urbanPopulation_list.append(urbanPopulation[up].text)
    #print(urbanPopulation_list)
    
    #Create Dataframe
    population_df = pd.DataFrame(list(zip(years_list, population_list, fertilityRate_list, urbanPopulation_list)),
               columns =['Year', 'Population','Fertility Rate','Urban Population'])
    return population_df


### Call the scraping function and create a dataframe for each region

In [7]:
african_df = scrapingByRegion('https://www.worldometers.info/world-population/africa-population/')
african_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,1340598147,4.44,587737793
1,2019,1308064195,4.67,567387619
2,2018,1275920972,4.67,547602182
3,2017,1244222267,4.67,528371323
4,2016,1213040521,4.67,509683886


In [8]:
asian_df = scrapingByRegion('https://www.worldometers.info/world-population/asia-population/')
asian_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,4641054775,2.15,2361464416
1,2019,4601371198,2.2,2314051549
2,2018,4560667108,2.2,2266130580
3,2017,4519040027,2.2,2217723679
4,2016,4476607853,2.2,2168879760


In [9]:
european_df = scrapingByRegion('https://www.worldometers.info/world-population/europe-population/')
european_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,747636026,1.61,556684039
1,2019,747182751,1.6,554832332
2,2018,746419440,1.6,552911225
3,2017,745414735,1.6,550959276
4,2016,744268827,1.6,549030165


In [10]:
southAmerican_df = scrapingByRegion('https://www.worldometers.info/world-population/latin-america-and-the-caribbean-population/')
southAmerican_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,653962331,2.04,539427335
1,2019,648120957,2.12,532783467
2,2018,642216682,2.12,526057192
3,2017,636233123,2.12,519249889
4,2016,630144555,2.12,512362472


In [11]:
northAmerican_df = scrapingByRegion('https://www.worldometers.info/world-population/northern-america-population/')
northAmerican_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,368869647,1.83,304761178
1,2019,366600964,1.83,301858117
2,2018,364295996,1.83,298987110
3,2017,361942268,1.83,296153390
4,2016,359524647,1.83,293362886


In [12]:
oceanian_df = scrapingByRegion('https://www.worldometers.info/world-population/oceania-population/')
oceanian_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,42677813,2.36,28919183
1,2019,42128035,2.42,28525534
2,2018,41570842,2.42,28129029
3,2017,41006505,2.42,27731037
4,2016,40435646,2.42,27333514



# II. Transform

 Our analysis is pretty simple (just a visualization), so we don't have too much tasks in this step.
 
     1. remove commas from population and urban population columns 
     
     2. Create new column: rural population 
     
     3. Split each region's df into two dfs 
         - the first df is the population of each region (from 1955 to 2020)
         - the second df is the population forecast (from 2020 to 2050)

### 1. remove commas from population and urban population columns 

In [13]:
def removeCommas(df):
    df['Population']=df['Population'].str.replace(',','')
    df['Urban Population']=african_df['Urban Population'].str.replace(',','')
    return df

In [14]:
african_df = removeCommas(african_df)
asian_df = removeCommas(asian_df)
european_df = removeCommas(european_df)
southAmerican_df = removeCommas(southAmerican_df)
northAmerican_df = removeCommas(northAmerican_df)
oceanian_df = removeCommas(oceanian_df)

### 2. Create new column: rural population 

In [22]:
def createRPColumn(df):
    df['Rural Population'] = df['Population'].astype(np.int64) - df['Urban Population'].astype(np.int64)
    return df

In [23]:
african_df = createRPColumn(african_df)
asian_df = createRPColumn(asian_df)
european_df = createRPColumn(european_df)
southAmerican_df = createRPColumn(southAmerican_df)
northAmerican_df = createRPColumn(northAmerican_df)
oceanian_df = createRPColumn(oceanian_df)

In [24]:
african_df

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,1340598147,4.44,587737793,752860354
1,2019,1308064195,4.67,567387619,740676576
2,2018,1275920972,4.67,547602182,728318790
3,2017,1244222267,4.67,528371323,715850944
4,2016,1213040521,4.67,509683886,703356635
5,2015,1182438784,4.73,491531092,690907692
6,2010,1039304033,4.9,408587045,630716988
7,2005,916154288,5.08,341033592,575120696
8,2000,810984226,5.35,285997612,524986614
9,1995,717270078,5.72,241824184,475445894


In [25]:
asian_df

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,4641054775,2.15,587737793,4053316982
1,2019,4601371198,2.2,567387619,4033983579
2,2018,4560667108,2.2,547602182,4013064926
3,2017,4519040027,2.2,528371323,3990668704
4,2016,4476607853,2.2,509683886,3966923967
5,2015,4433475358,2.21,491531092,3941944266
6,2010,4209593693,2.33,408587045,3801006648
7,2005,3977986502,2.45,341033592,3636952910
8,2000,3741263381,2.61,285997612,3455265769
9,1995,3493086983,2.9,241824184,3251262799


In [26]:
european_df

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,747636026,1.61,587737793,159898233
1,2019,747182751,1.6,567387619,179795132
2,2018,746419440,1.6,547602182,198817258
3,2017,745414735,1.6,528371323,217043412
4,2016,744268827,1.6,509683886,234584941
5,2015,743059035,1.6,491531092,251527943
6,2010,736412989,1.56,408587045,327825944
7,2005,729287846,1.43,341033592,388254254
8,2000,725558036,1.43,285997612,439560424
9,1995,726994464,1.57,241824184,485170280


In [27]:
southAmerican_df

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,653962331,2.04,587737793,66224538
1,2019,648120957,2.12,567387619,80733338
2,2018,642216682,2.12,547602182,94614500
3,2017,636233123,2.12,528371323,107861800
4,2016,630144555,2.12,509683886,120460669
5,2015,623934168,2.14,491531092,132403076
6,2010,591352388,2.26,408587045,182765343
7,2005,557501032,2.49,341033592,216467440
8,2000,521836304,2.77,285997612,235838692
9,1995,483018265,3.08,241824184,241194081


In [28]:
northAmerican_df

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,368869647,1.83,587737793,-218868146
1,2019,366600964,1.83,567387619,-200786655
2,2018,364295996,1.83,547602182,-183306186
3,2017,361942268,1.83,528371323,-166429055
4,2016,359524647,1.83,509683886,-150159239
5,2015,357031048,1.85,491531092,-134500044
6,2010,343287419,2.01,408587045,-65299626
7,2005,327287151,1.99,341033592,-13746441
8,2000,312426773,1.95,285997612,26429161
9,1995,294453542,2.0,241824184,52629358


In [29]:
oceanian_df

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,42677813,2.36,587737793,-545059980
1,2019,42128035,2.42,567387619,-525259584
2,2018,41570842,2.42,547602182,-506031340
3,2017,41006505,2.42,528371323,-487364818
4,2016,40435646,2.42,509683886,-469248240
5,2015,39858746,2.44,491531092,-451672346
6,2010,36873081,2.54,408587045,-371713964
7,2005,33690208,2.46,341033592,-307343384
8,2000,31425103,2.48,285997612,-254572509
9,1995,29389647,2.52,241824184,-212434537
