# I. Extract
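We will extract the population, fertility rate and urban population for each region of the world (Africa, Asia, Europe, South America, North America, Oceania). Note that the "South America" figures are scraped from the *Latin America and the Caribbean* page and the "North America" figures from the *Northern America* page.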

##### For the scraping, we will use Selenium

In [2]:
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

### Chrome webDriver

In [3]:
# Chrome webDriver
# The driver binary is handed to Selenium through a Service object: passing the
# path positionally is deprecated in Selenium 4 and no longer accepted by newer
# releases. The location can be overridden with the CHROMEDRIVER_PATH
# environment variable; the original hard-coded path stays as the default.
CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', 'C:/Users/Amal/Desktop/chromedriver')
driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))

  driver = webdriver.Chrome('C:/Users/Amal/Desktop/chromedriver')


### Scraping Function

In [4]:
import numpy as np
# XPath selecting every body row of the data tables on a region page.
_REGION_TABLE_ROWS_XPATH = '//table[@class="table table-striped table-bordered table-hover table-condensed table-list"]/tbody/tr'

# Output column name -> 1-based position of the td cell holding it in each row.
_REGION_TABLE_COLUMNS = {
    'Year': 1,
    'Population': 2,
    'Fertility Rate': 7,
    'Urban Population': 10,
}


def _scrape_column(td_position):
    """Return the text of the ``td_position``-th cell of every matched table row."""
    cell_xpath = '{}/td[{}]'.format(_REGION_TABLE_ROWS_XPATH, td_position)
    return [cell.text for cell in driver.find_elements(by=By.XPATH, value=cell_xpath)]


def scrapingByRegion(url):
    """Scrape the population tables of one region page into a dataframe.

    Loads ``url`` in the module-level ``driver`` and reads four cells from
    every row of every table matched by the XPath: year, population,
    fertility rate and urban population. Rows of all matching tables are
    returned together, in document order.

    Parameters
    ----------
    url : str
        Address of the region page to scrape.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'Population', 'Fertility Rate' and
        'Urban Population'. Values are the raw cell texts (strings).

    Raises
    ------
    ValueError
        If the four columns do not contain the same number of cells.
    """
    driver.get(url)
    scraped = {
        name: _scrape_column(position)
        for name, position in _REGION_TABLE_COLUMNS.items()
    }

    # Each column is fetched with its own query, so a row lacking one cell
    # would shift that column against the others; zip() would then silently
    # truncate and misalign the rows. Fail loudly instead.
    lengths = {name: len(values) for name, values in scraped.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            'Scraped columns have different lengths for {}: {}'.format(url, lengths)
        )

    return pd.DataFrame(list(zip(*scraped.values())), columns=list(scraped))


### Call the scraping function and create a dataframe for each region

In [5]:
# Scrape the Africa page into a dataframe and preview its first rows.
african_df = scrapingByRegion('https://www.worldometers.info/world-population/africa-population/')
african_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,1340598147,4.44,587737793
1,2019,1308064195,4.67,567387619
2,2018,1275920972,4.67,547602182
3,2017,1244222267,4.67,528371323
4,2016,1213040521,4.67,509683886


In [6]:
# Scrape the Asia page into a dataframe and preview its first rows.
asian_df = scrapingByRegion('https://www.worldometers.info/world-population/asia-population/')
asian_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,4641054775,2.15,2361464416
1,2019,4601371198,2.2,2314051549
2,2018,4560667108,2.2,2266130580
3,2017,4519040027,2.2,2217723679
4,2016,4476607853,2.2,2168879760


In [7]:
# Scrape the Europe page into a dataframe and preview its first rows.
european_df = scrapingByRegion('https://www.worldometers.info/world-population/europe-population/')
european_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,747636026,1.61,556684039
1,2019,747182751,1.6,554832332
2,2018,746419440,1.6,552911225
3,2017,745414735,1.6,550959276
4,2016,744268827,1.6,549030165


In [8]:
# Scrape the Latin America and the Caribbean page and preview its first rows.
# NOTE: the frame is named "southAmerican", but the page covers Latin America
# and the Caribbean as a whole, not South America alone.
southAmerican_df = scrapingByRegion('https://www.worldometers.info/world-population/latin-america-and-the-caribbean-population/')
southAmerican_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,653962331,2.04,539427335
1,2019,648120957,2.12,532783467
2,2018,642216682,2.12,526057192
3,2017,636233123,2.12,519249889
4,2016,630144555,2.12,512362472


In [9]:
# Scrape the Northern America page into a dataframe and preview its first rows.
northAmerican_df = scrapingByRegion('https://www.worldometers.info/world-population/northern-america-population/')
northAmerican_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,368869647,1.83,304761178
1,2019,366600964,1.83,301858117
2,2018,364295996,1.83,298987110
3,2017,361942268,1.83,296153390
4,2016,359524647,1.83,293362886


In [10]:
# Scrape the Oceania page into a dataframe and preview its first rows.
oceanian_df = scrapingByRegion('https://www.worldometers.info/world-population/oceania-population/')
oceanian_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population
0,2020,42677813,2.36,28919183
1,2019,42128035,2.42,28525534
2,2018,41570842,2.42,28129029
3,2017,41006505,2.42,27731037
4,2016,40435646,2.42,27333514



# II. Transform

 Our analysis is pretty simple (just a visualization), so there are not many tasks in this step.
 
     1. Create new column: rural population 
     
     2. Split each region's df into two dfs 
         - the first df is the population of each region (from 1955 to 2020)
         - the second df is the population forecast (from 2020 to 2050)

### 1. Create new column: rural population 

In [11]:
def createRPColumn(df):
    """Add a 'Rural Population' column (total minus urban population) to ``df``.

    'Population' and 'Urban Population' must be string columns; commas used
    as thousands separators are stripped before the values are parsed as
    nullable integers. A cell that cannot be parsed as a number produces
    ``pd.NA`` in that row of the new column, instead of making the whole
    computation fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'Population' and 'Urban Population'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with the extra
        'Rural Population' column of dtype ``Int64``.
    """
    def _as_nullable_int(column):
        # regex=False: the comma is a literal separator, not a pattern.
        digits = df[column].str.replace(',', '', regex=False)
        # errors='coerce' maps unparsable cells to NaN, which Int64 stores as pd.NA.
        return pd.to_numeric(digits, errors='coerce').astype('Int64')

    df['Rural Population'] = _as_nullable_int('Population') - _as_nullable_int('Urban Population')
    return df

In [12]:
# Add the 'Rural Population' column to every region frame, rebinding each
# name to the frame returned by createRPColumn.
(african_df, asian_df, european_df,
 southAmerican_df, northAmerican_df, oceanian_df) = (
    createRPColumn(region_df)
    for region_df in (african_df, asian_df, european_df,
                      southAmerican_df, northAmerican_df, oceanian_df)
)

In [13]:
# Preview the African frame to check the new 'Rural Population' column.
african_df.head()

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,1340598147,4.44,587737793,752860354
1,2019,1308064195,4.67,567387619,740676576
2,2018,1275920972,4.67,547602182,728318790
3,2017,1244222267,4.67,528371323,715850944
4,2016,1213040521,4.67,509683886,703356635


### 2. Split each region's df into two dfs 

In [13]:
# Number of leading rows that hold historical data in a scraped region frame.
# In the frames shown in this notebook the forecast rows start at index 18;
# NOTE(review): this depends on the page layout at scraping time - re-check
# the value if the source tables change.
HISTORICAL_ROW_COUNT = 18


def population(df, historical_rows=HISTORICAL_ROW_COUNT):
    """Return the historical part of a region frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Region frame with historical rows first and forecast rows after.
    historical_rows : int, optional
        How many leading rows are historical (default 18).

    Returns
    -------
    pandas.DataFrame
        The first ``historical_rows`` rows of ``df``, all columns, with the
        original index labels.
    """
    return df.iloc[:historical_rows, :]


def populationForcast(df, historical_rows=HISTORICAL_ROW_COUNT):
    """Return the forecast part of a region frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Region frame with historical rows first and forecast rows after.
    historical_rows : int, optional
        How many leading rows are historical and must be skipped (default 18).

    Returns
    -------
    pandas.DataFrame
        Every row after the first ``historical_rows``, all columns, with the
        original index labels. Empty if ``df`` has no more rows than that.
    """
    return df.iloc[historical_rows:, :]
    

#### Call the functions to create the two dfs 

In [14]:
# Historical rows of the African frame; the bare name displays the whole frame.
african_population_df = population(african_df)
african_population_df

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
0,2020,1340598147,4.44,587737793,752860354
1,2019,1308064195,4.67,567387619,740676576
2,2018,1275920972,4.67,547602182,728318790
3,2017,1244222267,4.67,528371323,715850944
4,2016,1213040521,4.67,509683886,703356635
5,2015,1182438784,4.73,491531092,690907692
6,2010,1039304033,4.9,408587045,630716988
7,2005,916154288,5.08,341033592,575120696
8,2000,810984226,5.35,285997612,524986614
9,1995,717270078,5.72,241824184,475445894


In [15]:
# Forecast rows of the African frame; the bare name displays the whole frame.
african_population_forcast_df = populationForcast(african_df)
african_population_forcast_df

Unnamed: 0,Year,Population,Fertility Rate,Urban Population,Rural Population
18,2020,1340598147,4.44,587737793,752860354
19,2025,1508935218,4.48,698148943,810786275
20,2030,1688321099,4.53,824013801,864307298
21,2035,1878193685,4.57,966329885,911863800
22,2040,2076749529,4.61,1125161515,951588014
23,2045,2281452464,4.65,1299953249,981499215
24,2050,2489275458,4.69,1488920045,1000355413


In [18]:
# Split the Asian frame into its historical and forecast parts.
asian_population_df, asian_population_forcast_df = (
    population(asian_df),
    populationForcast(asian_df),
)

In [19]:
# Split the European frame into its historical and forecast parts.
european_population_df, european_population_forcast_df = (
    population(european_df),
    populationForcast(european_df),
)

In [20]:
# Split the South American frame into its historical and forecast parts.
southAmerican_population_df, southAmerican_population_forcast_df = (
    population(southAmerican_df),
    populationForcast(southAmerican_df),
)

In [21]:
# Split the North American frame into its historical and forecast parts.
northAmerican_population_df, northAmerican_population_forcast_df = (
    population(northAmerican_df),
    populationForcast(northAmerican_df),
)

In [22]:
# Split the Oceanian frame into its historical and forecast parts.
oceanian_population_df, oceanian_population_forcast_df = (
    population(oceanian_df),
    populationForcast(oceanian_df),
)


# III. Load
This is the final step of the ETL: we load our data into MongoDB.

In [39]:
import pandas as pd
import pymongo
import json

# Connect to the MongoDB server expected at localhost, port 27017
client = pymongo.MongoClient("mongodb://localhost:27017")

# Handle to the "Population_db" database that the load cells write to
# NOTE(review): this presumably only obtains a handle; MongoDB is expected to
# create the database lazily on the first insert - confirm.
db = client["Population_db"]

In [41]:
#convert dataframe to dictionary
data = african_population_df.to_dict(orient="records")
#save data to a mongodb collection 
db.africanPopulation.insert_many(data)

<pymongo.results.InsertManyResult at 0x129ce0456c0>

In [42]:
# Insert the African forecast rows (one dict per row) into "africanPopulationForcast"
# NOTE(review): re-running presumably inserts duplicates (insert_many appends) - confirm.
data = african_population_forcast_df.to_dict(orient="records")
db.africanPopulationForcast.insert_many(data)

<pymongo.results.InsertManyResult at 0x129cdffe860>

In [43]:
# Convert the historical Asian frame to a list of dicts, one per row
data = asian_population_df.to_dict(orient="records")
# Insert the records into the "asianPopulation" collection
# NOTE(review): re-running presumably inserts duplicates (insert_many appends) - confirm.
db.asianPopulation.insert_many(data)

# Same for the forecast rows, stored in "asianPopulationForcast"; `data` is reused
data = asian_population_forcast_df.to_dict(orient="records")
db.asianPopulationForcast.insert_many(data)

<pymongo.results.InsertManyResult at 0x129cebe1e40>

In [44]:
# Convert the historical European frame to a list of dicts, one per row
data = european_population_df.to_dict(orient="records")
# Insert the records into the "europeanPopulation" collection
# NOTE(review): re-running presumably inserts duplicates (insert_many appends) - confirm.
db.europeanPopulation.insert_many(data)

# Same for the forecast rows, stored in "europeanPopulationForcast"; `data` is reused
data = european_population_forcast_df.to_dict(orient="records")
db.europeanPopulationForcast.insert_many(data)

<pymongo.results.InsertManyResult at 0x129cebe2bf0>

In [45]:
# Convert the historical South American frame to a list of dicts, one per row
data = southAmerican_population_df.to_dict(orient="records")
# Insert the records into the "southAmericanPopulation" collection
# NOTE(review): re-running presumably inserts duplicates (insert_many appends) - confirm.
db.southAmericanPopulation.insert_many(data)

# Same for the forecast rows, stored in "southAmericanPopulationForcast"; `data` is reused
data = southAmerican_population_forcast_df.to_dict(orient="records")
db.southAmericanPopulationForcast.insert_many(data)

<pymongo.results.InsertManyResult at 0x129cebe2fe0>

In [46]:
# Convert the historical North American frame to a list of dicts, one per row
data = northAmerican_population_df.to_dict(orient="records")
# Insert the records into the "northAmericanPopulation" collection
# NOTE(review): re-running presumably inserts duplicates (insert_many appends) - confirm.
db.northAmericanPopulation.insert_many(data)

# Same for the forecast rows, stored in "northAmericanPopulationForcast"; `data` is reused
data = northAmerican_population_forcast_df.to_dict(orient="records")
db.northAmericanPopulationForcast.insert_many(data)

<pymongo.results.InsertManyResult at 0x129cdcc1b70>

In [47]:
# Convert the historical Oceanian frame to a list of dicts, one per row
data = oceanian_population_df.to_dict(orient="records")
# Insert the records into the "oceanianPopulation" collection
# NOTE(review): re-running presumably inserts duplicates (insert_many appends) - confirm.
db.oceanianPopulation.insert_many(data)

# Same for the forecast rows, stored in "oceanianPopulationForcast"; `data` is reused
data = oceanian_population_forcast_df.to_dict(orient="records")
db.oceanianPopulationForcast.insert_many(data)

<pymongo.results.InsertManyResult at 0x129cdcc1e10>