## Web Scraping 20th Century

## 01. Import Libraries

In [1]:
# import libraries


import pandas as pd
import time
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4
from bs4 import BeautifulSoup
import requests

## 02. Set Up ChromeDriver

In [2]:
# Setup chrome options
# unsure why this was done in the exercise, minimal explanation

chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")

In [4]:
# set up driver:
service_object = Service(executable_path=r"C:\Program Files\Google\Chrome\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=service_object)
#service = Service(ChromeDriverManager().install())
#driver = webdriver.Chrome(service=service)

## 03. Scrape Key Events of the 20th Century

In [57]:
# project URL: Key Events of the 20th Century

# (https://en.wikipedia.org/wiki/Key_events_of_the_20th_century)

In [58]:
# will use beautifulsoup / requests to scrape
# this is a better option as we want the entire page instead of a small list / segment

# have already imported beautifulsoup/requests

In [None]:
# get page's contents:

page = requests.get("https://en.wikipedia.org/wiki/Key_events_of_the_20th_century")

In [None]:
# create soup / get title

soup = BeautifulSoup(page.text, 'html.parser')
print(soup.title)

<title>Key events of the 20th century - Wikipedia</title>


In [61]:
# print(soup.text)

# this definitely worked to display the full text
# changing to a markdown to prevent unnecessary scrolling

In [62]:
# creating a new object to store the text:

text = soup.get_text()

In [63]:
# define the text encoding

text = text.encode('utf-8')

In [64]:
# quick google said that utf-8 is now preferred for spatial efficiency
# encoding allows computers to display digital data as letters,numbers,characters

In [65]:
# save file to working directory

with open('20th-Century.txt', 'wb') as f:
       f.write(text)

## 04. Scrape List of Countries

In [66]:
# project URL: List of Countries

# url: ("https://simple.m.wikipedia.org/wiki/List_of_countries"

In [5]:
# as we are creating a list, selenium will suffice here
# get page contents:

list_url = "https://simple.m.wikipedia.org/wiki/List_of_countries"
driver.get(list_url)

In [68]:
# elements of interest will be the countries themselves, 
# after inspecting, the div id appears to be "mw-content-text" class = "mw-body-content"
# another option is "bodyContent"

#div class = "mw-content-ltr mw-parser-output"

# going to try "mw-body-content" first

In [6]:
new_countries_elem = driver.find_elements(By.TAG_NAME, value="a")

In [12]:
# for c in new_countries_elem:
# 	print(c.text)

In [13]:
file_path = "C:\\Users\\Mallika\\workfolder\\20th-century\\countries_list.txt"

In [8]:
# use find element to create a collection of the countries:

countries_elem = driver.find_elements(by = By.CLASS_NAME, value = 'mw-body-content')

In [9]:
# checking to see if it worked:

countries_elem[0].text

"This is a list of sovereign states. Disputed countries are listed at the bottom.\nCountries[change | change source]\nA[change | change source]\n Afghanistan –  Albania –  Algeria –  Andorra –  Angola –  Antigua and Barbuda –  Argentina –  Armenia –  Australia –  Austria –  Azerbaijan\nB[change | change source]\n Bahamas –  Bahrain –  Bangladesh –  Barbados –  Belarus –  Belgium –  Belize –  Benin –  Bhutan –  Bolivia –  Bosnia and Herzegovina –  Botswana –  Brazil –  Brunei –  Bulgaria –  Burkina Faso –  Burundi\nC[change | change source]\n Cabo Verde –  Cambodia –  Cameroon –  Canada –  Central African Republic –  Chad –  Chile –  China –  Colombia –  Comoros –  Costa Rica –  Côte d'Ivoire –  Croatia –  Cuba –  Cyprus –  Czechia\nD[change | change source]\n Democratic Republic of the Congo –  Denmark –  Djibouti –  Dominica –  Dominican Republic\nE[change | change source]\n Ecuador –  Egypt –  El Salvador –  Equatorial Guinea –  Eritrea –  Estonia –  Eswatini –  Ethiopia\nF[change | 

In [14]:
lst = countries_elem[0].text.split()[22:]
lst = [item for item in lst if item[0].isascii() and len(item) > 1 and item != "and"]
#lst

In [21]:
lst

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua',
 'Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'B[change',
 'change',
 'source]',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia',
 'Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina',
 'Faso',
 'Burundi',
 'C[change',
 'change',
 'source]',
 'Cabo',
 'Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central',
 'African',
 'Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Costa',
 'Rica',
 'Côte',
 "d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'D[change',
 'change',
 'source]',
 'Democratic',
 'Republic',
 'of',
 'the',
 'Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican',
 'Republic',
 'E[change',
 'change',
 'source]',
 'Ecuador',
 'Egypt',
 'El',
 'Salvador',
 'Equatorial',
 'Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'F[ch

In [19]:
#my_list = ["item1", "item2", "item3"]
#file_path = "output.txt"

with open(file_path, 'w') as file:
    # Joins the list items into one string with a newline character between each item
    file.write('\n'.join(str(item) for item in lst) + '\n')

In [20]:
with open(file_path, "w", encoding='utf-8') as f:
    for item in lst:
        f.write(f"{item}\n")
        print(f"Successfully saved page source to {file_path}")

Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolder\20th-century\countries_list.txt
Successfully saved page source to C:\Users\Mallika\workfolde

In [23]:
anchors = driver.find_elements(By.CSS_SELECTOR, 'a[title]')

In [24]:
countries_lst = []

In [26]:
for a in anchors:
	# print(a.text)
	if len(a.text) > 0:
		countries_lst.append(a.text)

In [29]:
countries_lst

['Search',
 'Create account',
 'Log in',
 'Page',
 'Talk',
 'View source',
 'View history',
 'sovereign states',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Costa Rica',
 "Côte d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 '

In [19]:
countries_lst.remove('List of states with limited recognition')

In [20]:
countries_lst.index('Afghanistan')

3

In [21]:
countries_lst = countries_lst[countries_lst.index('Afghanistan'):]
countries_lst

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo, Democratic Republic of the',
 'Congo, Republic of the',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'East Timor',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Hait