# Task 1.4 Accessing Web Data with Data Scraping
###  1. Import Libraries
###  2. Install ChromeDriver
###  3. Scraping content of the List of countries page
###  4. Save page content as TXT

## 1. Importing Libraries

In [1]:
# Import libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

## 2. Install ChromeDriver

In [2]:
# Setup Chrome Options
# Create a default ChromeOptions object.
# NOTE(review): `chrome_options` is reassigned to a fresh `Options()` in the
# next cell before anything reads it, so this instance appears to be unused.
chrome_options = webdriver.ChromeOptions()

In [3]:
# Setup chrome options cont.
# Start from a fresh Options object and attach each command-line flag in turn:
# "--headless" ensures the GUI is off; "--no-sandbox" is passed through as-is.
launch_flags = ("--headless", "--no-sandbox")
chrome_options = Options()
for flag in launch_flags:
    chrome_options.add_argument(flag)

In [4]:
# Start Chrome with the chromedriver binary located next to this notebook,
# using the options configured above.
# The driver path is wrapped in a Service object because the
# `executable_path=` keyword was deprecated in Selenium 4.0 and removed in 4.10.
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service("./chromedriver"), options=chrome_options)

## 3. Scraping the List of countries Wiki page using selenium

In [5]:
# Get the page's contents
# The target is the "List of countries" article on simple.m.wikipedia.org.
Wiki_url = "https://simple.m.wikipedia.org/wiki/List_of_countries"
driver.get(Wiki_url)  # load that URL in the browser session created above

In [6]:
# Get all "page source"
# Keep the source of the page currently loaded in the driver; it is handed to
# BeautifulSoup for parsing in the next cell.
# NOTE(review): no cell visible here calls driver.quit() -- confirm the browser
# is shut down once the page source has been captured.
html = driver.page_source


In [7]:
# Parsing HTML info with the "html.parser" backend
soup = BeautifulSoup(html, features="html.parser")
# This is the FULL text from the page, including irrelevant noise
text = soup.get_text()

In [8]:
# Locate the element whose id is "bodyContent" ...
body_content = soup.find(attrs={"id": "bodyContent"})
# ... and keep only the text from that section of the page
text2 = body_content.get_text()

In [20]:
# The <a> tags carrying a title attribute look like they hold the country names.
# Search only inside the div whose id is "mw-content-text".
div = soup.find("div", id="mw-content-text")
elements = div.find_all("a", title=True)
# Keep the title of every link that has no (non-empty) class attribute;
# links that do carry a class are skipped.
countries_list = [
    link["title"]
    for link in elements
    if not link.get("class")
]
countries_list

['Sovereign state',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'The Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cape Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Democratic Republic of the Congo',
 'Republic of the Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'East Timor',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'The Gambia',
 'Georgia (country)',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',

In [21]:
# Collapse duplicate titles with a set, then build an alphabetically sorted list
countries_set = set(countries_list)
countries = sorted(countries_set)
countries[:20]  # preview the first 20 entries

['Abkhazia',
 'Adjara',
 'Afghanistan',
 'Akrotiri and Dhekelia',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Anjouan',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Autonomous Region of Bougainville',
 'Azad Kashmir']

## 4. Save page content as TXT

In [23]:
# Save the scraped country names to Country_Scrape.txt, one name per line.
# The `with` block guarantees the file is closed even if a write fails, and the
# explicit UTF-8 encoding keeps non-ASCII names from depending on the
# platform's default encoding.
with open("Country_Scrape.txt", "w", encoding="utf-8") as fp:
    fp.writelines(c + "\n" for c in countries)