# I. Extract
We will extract the population, fertility rate, and urban population for each region of the world (Africa, Asia, Europe, South America, North America, Oceania) from worldometers.info.

##### For the scraping, we will use Selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.webdriver.common.by import By

### Chrome webDriver

In [2]:
# Chrome WebDriver
# Selenium 4 deprecated passing the chromedriver path as a positional argument
# (and later releases removed it); the path is wrapped in a Service object instead.
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service('C:/Users/Amal/Desktop/chromedriver'))

  driver = webdriver.Chrome('C:/Users/Amal/Desktop/chromedriver')


### Scraping Function

In [3]:
import numpy as np
def scrapingByRegion(url):
    driver.get(url)
    #years
    years = driver.find_elements(by=By.XPATH, value='//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr/td[1]')
    years_list = []
    for y in range(len(years)):
        years_list.append(years[y].text)
    #print(years_list)
    
    #population
    population = driver.find_elements(by=By.XPATH, value='//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr/td[2]')
    population_list = []
    for p in range(len(population)):
        population_list.append(population[p].text)
    #print(population_list)
    
    #Fertility Rate
    fertilityRate = driver.find_elements(by=By.XPATH, value='//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr/td[7]')
    fertilityRate_list = []
    for f in range(len(fertilityRate)):
        fertilityRate_list.append(fertilityRate[f].text)
    #print(fertilityRate_list)
    
    #Urban Population
    urbanPopulation = driver.find_elements(by=By.XPATH, value='//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr/td[10]')
    urbanPopulation_list = []
    for up in range(len(urbanPopulation)):
        urbanPopulation_list.append(urbanPopulation[up].text)
    #print(urbanPopulation_list)
    
    #Create Dataframe
    population_df = pd.DataFrame(list(zip(years_list, population_list, fertilityRate_list, urbanPopulation_list)),
               columns =['Year', 'Population','Fertility Rate','Urban Population'])
    return population_df


### Call the scraping function and create a dataframe for each region

In [4]:
# Scrape the Africa page into a DataFrame and preview its first rows.
african_df = scrapingByRegion('https://www.worldometers.info/world-population/africa-population/')
african_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,1340598147,4.44,587737793
1,2019,1308064195,4.67,567387619
2,2018,1275920972,4.67,547602182
3,2017,1244222267,4.67,528371323
4,2016,1213040521,4.67,509683886


In [5]:
# Scrape the Asia page into a DataFrame and preview its first rows.
asian_df = scrapingByRegion('https://www.worldometers.info/world-population/asia-population/')
asian_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,4641054775,2.15,2361464416
1,2019,4601371198,2.2,2314051549
2,2018,4560667108,2.2,2266130580
3,2017,4519040027,2.2,2217723679
4,2016,4476607853,2.2,2168879760


In [6]:
# Scrape the Europe page into a DataFrame and preview its first rows.
european_df = scrapingByRegion('https://www.worldometers.info/world-population/europe-population/')
european_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,747636026,1.61,556684039
1,2019,747182751,1.6,554832332
2,2018,746419440,1.6,552911225
3,2017,745414735,1.6,550959276
4,2016,744268827,1.6,549030165


In [7]:
# NOTE: the page scraped here is "Latin America and the Caribbean", although
# the frame (and the columns derived from it later) are labelled South American.
southAmerican_df = scrapingByRegion('https://www.worldometers.info/world-population/latin-america-and-the-caribbean-population/')
southAmerican_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,653962331,2.04,539427335
1,2019,648120957,2.12,532783467
2,2018,642216682,2.12,526057192
3,2017,636233123,2.12,519249889
4,2016,630144555,2.12,512362472


In [8]:
# Scrape the Northern America page into a DataFrame and preview its first rows.
northAmerican_df = scrapingByRegion('https://www.worldometers.info/world-population/northern-america-population/')
northAmerican_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,368869647,1.83,304761178
1,2019,366600964,1.83,301858117
2,2018,364295996,1.83,298987110
3,2017,361942268,1.83,296153390
4,2016,359524647,1.83,293362886


In [9]:
# Scrape the Oceania page into a DataFrame and preview its first rows.
oceanian_df = scrapingByRegion('https://www.worldometers.info/world-population/oceania-population/')
oceanian_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,42677813,2.36,28919183
1,2019,42128035,2.42,28525534
2,2018,41570842,2.42,28129029
3,2017,41006505,2.42,27731037
4,2016,40435646,2.42,27333514



# II. Transform

 Our analysis is pretty simple (just a visualization), so there are not many tasks in this step.
 
     1. Create new column: rural population 
     
     2. For each parameter (population, fertility rate, ...) we will create a dataframe that contains the values of the 6 regions

### 1. Create new column: rural population 

In [10]:
def _asNullableInt(column):
    """Parse a column of (possibly comma-grouped) numbers into nullable 'Int64'."""
    # astype(str) lets already-numeric columns through the same path as text.
    cleaned = column.astype(str).str.replace(',', '', regex=False)
    # Unparseable entries become <NA> instead of leaving the column as strings.
    return pd.to_numeric(cleaned, errors='coerce').astype('Int64')


def createRPColumn(df):
    """Add a 'Rural Population' column computed as Population - Urban Population.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Population' and 'Urban Population' columns, either as
        text (thousands separators ',' are stripped) or as numbers.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place. The new
        column has dtype 'Int64' and is <NA> wherever either input value
        could not be parsed as a number.
    """
    df['Rural Population'] = _asNullableInt(df['Population']) - _asNullableInt(df['Urban Population'])
    return df

In [11]:
# Add the 'Rural Population' column to every regional frame, in the same
# order as before, rebinding each name to the returned frame.
(african_df, asian_df, european_df,
 southAmerican_df, northAmerican_df, oceanian_df) = (
    createRPColumn(region_frame)
    for region_frame in (african_df, asian_df, european_df,
                         southAmerican_df, northAmerican_df, oceanian_df)
)

In [12]:
# Spot-check: the African frame should now carry a 'Rural Population' column.
african_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,1340598147,4.44,587737793,752860354
1,2019,1308064195,4.67,567387619,740676576
2,2018,1275920972,4.67,547602182,728318790
3,2017,1244222267,4.67,528371323,715850944
4,2016,1213040521,4.67,509683886,703356635


### 2. Create dataframe for each parameter

### Population of all regions

In [13]:
# 'Population' column of every regional frame side by side, one column per
# region. Rows are aligned on the African frame's index, and only the first
# 18 rows are kept.
population = pd.DataFrame(
    {
        "Year": african_df["Year"],
        **{
            region + " Population": frame["Population"]
            for region, frame in (
                ("African", african_df),
                ("Asian", asian_df),
                ("European", european_df),
                ("South American", southAmerican_df),
                ("North American", northAmerican_df),
                ("Oceanian", oceanian_df),
            )
        },
    },
    index=african_df.index,
).iloc[:18, :]

In [14]:
# Preview the combined population frame.
population.head()

Unnamed: 0,Year,African Population,Asian Population,European Population,South American Population,North American Population,Oceanian Population
0,2020,1340598147,4641054775,747636026,653962331,368869647,42677813
1,2019,1308064195,4601371198,747182751,648120957,366600964,42128035
2,2018,1275920972,4560667108,746419440,642216682,364295996,41570842
3,2017,1244222267,4519040027,745414735,636233123,361942268,41006505
4,2016,1213040521,4476607853,744268827,630144555,359524647,40435646


### Fertility Rate of all regions

In [15]:
# 'Fertility Rate' column of every regional frame side by side, one column
# per region. Rows are aligned on the African frame's index, and only the
# first 18 rows are kept.
fertility_rate = pd.DataFrame(
    {
        "Year": african_df["Year"],
        **{
            region + " FR": frame["Fertility Rate"]
            for region, frame in (
                ("African", african_df),
                ("Asian", asian_df),
                ("European", european_df),
                ("South American", southAmerican_df),
                ("North American", northAmerican_df),
                ("Oceanian", oceanian_df),
            )
        },
    },
    index=african_df.index,
).iloc[:18, :]
fertility_rate.head()

Unnamed: 0,Year,African FR,Asian FR,European FR,South American FR,North American FR,Oceanian FR
0,2020,4.44,2.15,1.61,2.04,1.83,2.36
1,2019,4.67,2.2,1.6,2.12,1.83,2.42
2,2018,4.67,2.2,1.6,2.12,1.83,2.42
3,2017,4.67,2.2,1.6,2.12,1.83,2.42
4,2016,4.67,2.2,1.6,2.12,1.83,2.42


### Urban Population of all regions

In [16]:
# 'Urban Population' column of every regional frame side by side, one column
# per region. Rows are aligned on the African frame's index, and only the
# first 18 rows are kept.
urban_population = pd.DataFrame(
    {
        "Year": african_df["Year"],
        **{
            region + " U.Population": frame["Urban Population"]
            for region, frame in (
                ("African", african_df),
                ("Asian", asian_df),
                ("European", european_df),
                ("South American", southAmerican_df),
                ("North American", northAmerican_df),
                ("Oceanian", oceanian_df),
            )
        },
    },
    index=african_df.index,
).iloc[:18, :]
urban_population.head()

Unnamed: 0,Year,African U.Population,Asian U.Population,European U.Population,South American U.Population,North American U.Population,Oceanian U.Population
0,2020,587737793,2361464416,556684039,539427335,304761178,28919183
1,2019,567387619,2314051549,554832332,532783467,301858117,28525534
2,2018,547602182,2266130580,552911225,526057192,298987110,28129029
3,2017,528371323,2217723679,550959276,519249889,296153390,27731037
4,2016,509683886,2168879760,549030165,512362472,293362886,27333514


### Rural Population of all regions

In [17]:
# 'Rural Population' column of every regional frame side by side, one column
# per region. Rows are aligned on the African frame's index, and only the
# first 18 rows are kept.
rural_population = pd.DataFrame(
    {
        "Year": african_df["Year"],
        **{
            region + " R.Population": frame["Rural Population"]
            for region, frame in (
                ("African", african_df),
                ("Asian", asian_df),
                ("European", european_df),
                ("South American", southAmerican_df),
                ("North American", northAmerican_df),
                ("Oceanian", oceanian_df),
            )
        },
    },
    index=african_df.index,
).iloc[:18, :]
rural_population.head()

Unnamed: 0,Year,African R.Population,Asian R.Population,European R.Population,South American R.Population,North American R.Population,Oceanian R.Population
0,2020,752860354,2279590359,190951987,114534996,64108469,13758630
1,2019,740676576,2287319649,192350419,115337490,64742847,13602501
2,2018,728318790,2294536528,193508215,116159490,65308886,13441813
3,2017,715850944,2301316348,194455459,116983234,65788878,13275468
4,2016,703356635,2307728093,195238662,117782083,66161761,13102132



# III. Load
This is the final step of the ETL. In this step we will load our data into MongoDB.

In [18]:
import pandas as pd
import pymongo
import json

# Making a connection with MongoClient
# NOTE: the connection string is hardcoded to localhost, port 27017.
client = pymongo.MongoClient("mongodb://localhost:27017")
# Get a handle on the "Population_db" database
db = client["Population_db"]

### Population Collection

In [19]:
# Convert the dataframe to a list of dictionaries, one per row
data = population.to_dict(orient="records")
# Save the rows to the "population" collection.
# NOTE(review): nothing here clears or upserts existing documents, so
# re-running this cell inserts the same rows again.
db.population.insert_many(data)

<pymongo.results.InsertManyResult at 0x199aa96f820>

### Fertility Rate Collection

In [21]:
# One dictionary per row, inserted into the "fertility_rate" collection.
# NOTE(review): re-running this cell inserts the same rows again.
data = fertility_rate.to_dict(orient="records")
db.fertility_rate.insert_many(data)

<pymongo.results.InsertManyResult at 0x199aad58880>

### Urban Population Collection

In [22]:
# One dictionary per row, inserted into the "urban_population" collection.
# NOTE(review): re-running this cell inserts the same rows again.
data = urban_population.to_dict(orient="records")
db.urban_population.insert_many(data)

<pymongo.results.InsertManyResult at 0x199aad11210>

### Rural Population Collection

In [23]:
# One dictionary per row, inserted into the "ruralPopulation" collection.
# NOTE(review): this collection name is camelCase while the other three are
# snake_case; confirm with downstream readers before renaming.
# NOTE(review): re-running this cell inserts the same rows again.
data = rural_population.to_dict(orient="records")
db.ruralPopulation.insert_many(data)

<pymongo.results.InsertManyResult at 0x199aad034c0>