In [9]:
#imports
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

#display settings: lift the limit on how many rows and columns pandas shows
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

### 1. Visit a website

In [2]:
#address of the page we may want to visit: the CPJ data page filtered to
#status "Killed", years 1992 to 2023, motive confirmed, type "Journalist", grouped by year
url = (
    'https://cpj.org/data/'
    '?status=Killed&start_year=1992&end_year=2023&group_by=year'
    '&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist'
)

In [3]:
#start an automated Chrome browser session and keep its handle in a variable called driver;
#the following cells control the browser through this object
driver = webdriver.Chrome()

In [4]:
#load the url in the automated browser window
driver.get(url)

In [5]:
#ok, now, please go to your automated browser window and "Reject All" cookies; literally, please click on it!!!

### 2. Find elements by class name; let's find the labels of the table pagination

In [6]:
#look up the first element whose class is "page-link" and show its visible text
first_page_link = driver.find_element(By.CLASS_NAME, "page-link")
first_page_link.text

'First'

In [7]:
#collect every pagination item (class "page-item") currently on the page
page_numbers = driver.find_elements(By.CLASS_NAME, 'page-item')

#show the label of each pagination item, one per line
for pagination_item in page_numbers:
    print(pagination_item.text)

First
Previous
1
2
3
4
5
Next
Last


In [8]:
#the pagination list ends with "Next" and "Last", so the last numbered item is third from the end;
#read its text and convert it into an integer
last_numbered_item = page_numbers[-3]
last_number = int(last_numbered_item.text)
last_number

5

### 3. Select and deselect things from a dropdown menu

In [10]:
#say we wish to pick "Female" from the gender dropdown menu of the page

#locate the dropdown by its name attribute and wrap it in Selenium's Select helper;
#the wrapped dropdown is stored in a variable called "select"
gender_dropdown = driver.find_element(By.NAME, 'gender')
select = Select(gender_dropdown)

#choose the option by the text that is visible on the webpage >> "Female"
select.select_by_visible_text('Female')

In [11]:
#undo the previous choice: deselect the option whose visible text is "Female"
select.deselect_by_visible_text('Female')

In [12]:
#we want to select, one by one, every gender offered in the dropdown list;
#genders holds the visible option texts we are interested in
genders = [
    'Female',
    'Male',
    'Nonbinary',
]

In [19]:
#go through the gender labels and have the driver select each one in turn
for gender_label in genders:
    select.select_by_visible_text(gender_label)

    #pause for 3 seconds before moving on
    time.sleep(3)

    #clear this choice before the next label gets selected
    select.deselect_by_visible_text(gender_label)

### 4. Click on a button; click Next

In [20]:
#locate the link whose text is "Next" on the page
button = driver.find_element(by=By.LINK_TEXT, value="Next")

#and click it
button.click()

### 5. Find element by xpath 
(Inspect the element you wish to get, then copy and paste its XPath)

In [23]:
#fetch the first name of the table through its xpath

#xpath of the link inside the first cell (td[1]) of the first table row (tr[1])
name_xpath = '/html/body/div[1]/div/div[2]/div[2]/table/tbody/tr[1]/td[1]/a'

#have the driver locate that element and keep it in a variable called name
name = driver.find_element(by=By.XPATH, value=name_xpath)

#show the text of the element
print(name.text)

Abdirizak Ali Abdi


In [25]:
#a closer look at name_xpath shows that the name sits in a td inside a tr (a table row)
#our table has 20 rows

#the row numbers 1 to 20
numbers = [*range(1, 21)]

#visit the rows one after the other
for n in numbers:
    #same xpath as before, with the tr index replaced by the current row number n
    name_xpath = f"/html/body/div[1]/div/div[2]/div[2]/table/tbody/tr[{n}]/td[1]/a"
    #locate the name element of this row
    name = driver.find_element(by=By.XPATH, value=name_xpath)

    #show the text of each name
    print(name.text)

Abdirizak Ali Abdi
Abdisalan Sheikh Hassan
Abdisatar Daher Sabriye
Abdiwali Ali Hassan
Abdul Aziz Shaheen
Abdul Hakim Shimul
Abdul Hameed al-Yousef
Abdul Haq Baloch
Abdul Manan Arghand
Abdul Nasser Haj Hamdan
Abdul Qadir Hajizai
Abdul Qodus
Abdul Raheem Kour Hassan
Abdul Rahman Ismael Yassin
Abdul Razzak Johra
Abdul Salam Kanaan
Abdul Samad Rohani
Abdul Shariff
Abdul Wahab
Abdul-Rahim Nasrallah al-Shimari


In [27]:
#this time keep the names instead of printing them:
#collect one dictionary per table row in a list, then build a df from that list

#the row numbers 1 to 20
numbers = [*range(1, 21)]

#one {'name': ...} record per table row
mydata = []
for n in numbers:
    #xpath of the name link in row n
    name_xpath = f"/html/body/div[1]/div/div[2]/div[2]/table/tbody/tr[{n}]/td[1]/a"
    #locate the element and store its text as a new record
    name = driver.find_element(by=By.XPATH, value=name_xpath)
    mydata.append({'name': name.text})

#build the df from the collected records and display it
df = pd.DataFrame(data=mydata)
df

Unnamed: 0,name
0,Abdirizak Ali Abdi
1,Abdisalan Sheikh Hassan
2,Abdisatar Daher Sabriye
3,Abdiwali Ali Hassan
4,Abdul Aziz Shaheen
5,Abdul Hakim Shimul
6,Abdul Hameed al-Yousef
7,Abdul Haq Baloch
8,Abdul Manan Arghand
9,Abdul Nasser Haj Hamdan


### 6. Select your currently active page using CSS Selector 


In [28]:
#locate the li element that carries both the "page-item" and the "active" class,
#i.e. the pagination item of the page currently shown
driver.find_element(By.CSS_SELECTOR, "li.page-item.active")

<selenium.webdriver.remote.webelement.WebElement (session="68859f09bc8e179b25cd8d4a94a293f8", element="83ED4603F692C39D849BC7968E6EB29C_element_756")>

In [29]:
#show the visible text of the active pagination item (the current page number, as a string)
driver.find_element(By.CSS_SELECTOR, "li.page-item.active").text

'2'

In [30]:
#read the text of the active pagination item, convert it into an integer
#and store it in a variable called active_page
active_item = driver.find_element(By.CSS_SELECTOR, "li.page-item.active")
active_page = int(active_item.text)
active_page

2

### *** Write a script ***
Ok, now that we have learned all of this, we'll put everything together in a script for the following scenario: we wish to visit the [database of CPJ for killed journalists](https://cpj.org/data/?status=Killed&start_year=1992&end_year=2023&group_by=year&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist), restricted to cases where the motive is confirmed. We want to select females first and then males, and scrape all the data each time (according to the website, there is no data for nonbinary journalists killed).

In [36]:
#define the url to visit
url = 'https://cpj.org/data/?status=Killed&start_year=1992&end_year=2023&group_by=year&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist'

#a browser session started earlier in the notebook may still be open;
#end it first, otherwise rebinding `driver` below would leave its window
#and its chromedriver process running with nothing left to control them
if 'driver' in globals():
    driver.quit()

#activate a fresh browser
driver = webdriver.Chrome()
#make it visit your url
driver.get(url)

#please remember to reject all cookies (literally click on it)

In [37]:
#seconds to wait after every action that makes the page update
#(selecting a gender, jumping to the Last/First page, clicking Next)
PAGE_LOAD_SECONDS = 5


def scrape_visible_names(driver, gender, row_numbers, allow_missing_rows=False):
    """Read the names shown on the table page that is currently displayed.

    Parameters
    ----------
    driver : the Selenium webdriver showing the table
    gender : str
        The gender label currently selected; it is stored with every name.
    row_numbers : iterable of int
        The 1-based table row numbers (tr indexes) to read.
    allow_missing_rows : bool, default False
        When True, a row that cannot be found is skipped; this is meant for
        the last page, which may hold fewer rows than a full page.
        When False, a missing row raises NoSuchElementException.

    Returns
    -------
    list of dict
        One {'name': ..., 'gender': ...} dictionary per row that was found.

    Raises
    ------
    NoSuchElementException
        If a row is missing and allow_missing_rows is False.
    """
    records = []

    for n in row_numbers:
        try:
            #find each different name text
            name = driver.find_element(By.XPATH, f'/html/body/div[1]/div/div[2]/div[2]/table/tbody/tr[{n}]/td[1]/a').text
        except NoSuchElementException:
            #only "this row does not exist" is tolerated, and only when the caller allows it;
            #any other problem is raised instead of being silently swallowed
            if allow_missing_rows:
                continue
            raise

        #check them out by printing them
        print(name, gender)

        #store the name together with the gender
        records.append({'name': name, 'gender': gender})

    return records


#select the dropdown menu you wish to make choices from
select = Select(driver.find_element(By.NAME, 'gender'))

#create your list of visible strings for genders
genders = ['Female', 'Male']

#create a list of numbers from 1 to 20 (the rows of a full table page)
numbers = list(range(1,21))

#create an initially empty list; it will receive one dictionary per scraped name
mydata = []

#loop through your genders
#select each one of those and scrape every table page for it
for gender in genders:

    #select each one
    select.select_by_visible_text(gender)

    #sleep for a while
    time.sleep(PAGE_LOAD_SECONDS)

    #go to your last page
    driver.find_element(By.LINK_TEXT, 'Last').click()
    #wait before reading the pagination, exactly as we do after clicking "Next"
    time.sleep(PAGE_LOAD_SECONDS)

    #get the page numbers that are visible while you are on the last page
    page_numbers = driver.find_elements(By.CLASS_NAME, 'page-item')
    #the pagination ends with "Next" and "Last", so the third item from the end
    #is the number of the last page of the table
    last_table_page = int(page_numbers[-3].text)

    #click on the First page to go back
    driver.find_element(By.LINK_TEXT, 'First').click()
    #wait again, so that we do not scrape before the first page is shown
    time.sleep(PAGE_LOAD_SECONDS)

    #define your currently active page
    active_page = int(driver.find_element(By.CSS_SELECTOR, "li.page-item.active").text)

    #we use a while loop because the number of pages differs per gender:
    #keep going as long as the active page is not the last one
    while active_page < last_table_page:

        #print your currently active page
        print(f"Page: {active_page}")

        #a page before the last one is expected to be full, so a missing row raises an error
        mydata.extend(scrape_visible_names(driver, gender, numbers))

        #find the "Next" button in your page and click it
        driver.find_element(By.LINK_TEXT, "Next").click()

        #redefine your active page
        active_page += 1

        #make your driver sleep before it goes on
        time.sleep(PAGE_LOAD_SECONDS)

    #the loop has ended, so we are on the last page
    print(f'Your currently active page is page number {active_page}. This should be the last page. I am getting the data and then I am done.')

    #the last page most probably holds fewer than 20 names, so missing rows are skipped here
    mydata.extend(scrape_visible_names(driver, gender, numbers, allow_missing_rows=True))

    #deselect your currently selected gender before you go on and start from the beginning again
    select.deselect_by_visible_text(gender)

Page: 1
Alaa Taher Al-Hassanat Female
Alexandra Tuttle Female
Alison Parker Female
Amparo Leonor Jiménez Pallares Female
Anastasiya Baburova Female
Anja Niedringhaus Female
Anna Politkovskaya Female
Asiya Jeelani Female
Atwar Bahjat Female
Audrey Gaid Estrada Female
Ayat Khadoura Female
Ayelet Arnin Female
Aysel Malkac Female
Batoul Mokhles al-Warrar Female
Camille Lepage Female
Cynthia Elbaum Female
Dalia Marko Female
Daphne Caruana Galizia Female
Dilshan Ibash Female
Dolores Guadalupe García Escamilla Female
Page: 2
Duniya Muhyadin Nur Female
Elsa Cayat Female
Farah Omar Female
Flor Alba Núñez Vargas Female
Francisca Sandoval Female
Gabrielle Marian Hulsen Female
Gauri Lankesh Female
Ghislaine Dupont Female
Gina Dela Cruz Female
Habiba Ahmed Abd Elaziz Female
Halima Idris Salim Female
Hind Ismail Female
Hindia Haji Mohamed Female
Ilaria Alpi Female
Isaivizhi Chempiyan Female
Jhannah Villegas Female
Johanne Sutton Female
Karen Fischer Female
Karmela Sojanovic Female
Kate Peyton Female

Page: 12
Assaf Abu Rahal Male
Aswan Ahmed Lutfallah Male
Atallah Bajbouj Male
Athiwat Chaiyanurat Male
Atilano Segundo Pérez Barrios Male
Aung Kyaw Naing, "Par Gyi" Male
Avijit Roy Male
Avinash Jha Male
Awab al-Zubiry Male
Awil Dahir Salad Male
Aye Kyaw Male
Ayham Mostafa Ghazzoul Male
Ayoub Mohamed Male
Ayub Khattak Male
Azamat Ali Bangash Male
Azizullah Haidari Male
Azzedine Saidj Male
Bakhti Benaouda Male
Bala Nadarajah Iyer Male
Bardhyl Ajeti Male
Page: 13
Barkhat Awale Male
Basel Tawfiq Youssef Male
Bashar al-Attar Male
Bashar al-Nuaimi Male
Bashiir Noor Gedi Male
Basil al-Sayed Male
Basil Nabil Ibrahim Faraj Male
Bassel al-Shahade Male
Bassem Fawaz al-Zabi Male
Bayo Ohu Male
Benjamín Flores González Male
Bernabé Cortés Valderrama Male
Bernard Maris Male
Bernard Verlhac (Tignous) Male
Bhola Nath Masoom Male
Bienvenido Legarte Jr. Male
Bienvenido Lemos Male
Bilal Ahmed Bilal Male
Bilal Jadallah Male
Bilal Sharaf al-Deen Male
Page: 14
Birendra Shah Male
Bladimir Antuna García Male
B

Page: 30
Israel Vázquez Rangel Male
Issam Abdallah Male
Issam Obeid Male
Issam Tillawi Male
Ítalo Eduardo Diniz Barros Male
Ivan Darío Pelayo Male
Ivan Safronov Male
Ivo Pukanic Male
Ivo Standeker Male
Izzet Kezer Male
Jacinto Romero Flores Male
Jagadish Babu Male
Jagendra Singh Male
Jaime Garzón Male
Jaime Rengifo Revero Male
Jairo Elías Márquez Gallego Male
Jairo Souza Male
Jalaa al-Abadi Male
Jamal Abdul-Nasser Sami Male
Jamal al-Sharaabi Male
Page: 31
Jamal al-Zubaidi Male
Jamal Farah Adan Male
Jamal Khalifeh Male
Jamal Khashoggi Male
Jamal Uddin Male
James Brolan Male
James Foley Male
James Miller Male
James Ogogo Male
James P. Hunter Male
Jamshed Davliyatmamatov Male
Ján Kuciak Male
Janullah Hashimzada Male
Jarosław Ziętara Male
Jaruek Rangcharoen Male
Jassim al-Batat Male
Javed Ahmed Mir Male
Javed Khan Male
Javed Naseer Rind Male
Javier Valdez Cárdenas Male
Page: 32
Jean Cabut (Cabu) Male
Jean Hélène Male
Jean Léopold Dominique Male
Jean-Claude Jumel Male
Jean-Jacques Ola Bebe 

Page: 48
Musab Mahmood al-Ezawi Male
Mushtaq Ali Male
Mushtaq Khand Male
Mustaf Abdi Noor Male
Mustafa Abada Male
Mustafa Abdul Hassa Male
Mustafa Gaimayani Male
Mustafa Jeha Male
Mustafa Said Male
Mustafa Salamah Male
Muthanna Abdel Hussein Male
MVN Shankar Male
Myles Tierney Male
Mylvaganam Nimalarajan Male
Nabil Hasan al-Quaety Male
Nabil Ibrahim al-Dulaimi Male
Nacer Ouari Male
Nahar Ali Male
Nahúm Palacios Arteaga Male
Naimatullah Zaheer Male
Page: 49
Naimullah Male
Najem Abed Khudair Male
Naji Asaad Male
Naji Jerf Male
Namik Taranci Male
Namir Noor-Eldeen Male
Napoleon Salaysay Male
Narcisse Orédjé Male
Narendra Dabholkar Male
Nasrullah Khan Afridi Male
Nasseredine Lekhal Male
Nasteh Dahir Farah Male
Nathan S. Dabak Male
Nava Raj Sharma Male
Navin Nischal Male
Nazar Abdulwahid al-Radhi Male
Nazih Darwazeh Male
Nazim Babaoglu Male
Néhémie Joseph Male
Nelson Carvajal Carvajal Male
Page: 50
Nevith Condés Jaramillo Male
Nicanor Linhares Batista Male
Nikolai Andrushchenko Male
Niloy N

Page: 67
Vedat Erdemci Male
Veeraboina Yadagiri Male
Vénant Ntawucikayenda Male
Viatcheslav Rudnev Male
Víctor Hernández Martínez Male
Victor Hugo López Escobar Male
Victor Nuñez Male
Vijay Pratap Singh Male
Vikas Ranjan Male
Viktor Dedov Male
Viktor Mikhailov Male
Viktor Nikulin Male
Viktor Pimenov Male
Vincent Francis Male
Vincent Rodriguez Male
Vincent Rwabukwisi Male
Virgilio Fernández Male
Virgilio Maganes Male
Vitas Lingis Male
Vladimir Drobyshev Male
Page: 68
Vladimir Ivanov Male
Vladimir Kirsanov Male
Vladimir Yatsina Male
Vladimir Zhitarenko Male
Vladislav Listyev Male
Volker Handloik Male
Volker Kraemer Male
Vyacheslav Veremiy Male
Wadallah Sarhan Male
Wadih Sa'ad al-Hamdani Male
Wael al-Absi Male
Wael Mikhael Male
Waldemar Milewicz Male
Waleed Khaled Male
Walgney Assis Carvalho Male
Wali Khan Babar Male
Walid Jamil Amira Male
Wasem Aledel Male
Washiqur Rahman Babu Male
Wilguens Louis-Saint Male
Page: 69
William Biggart Male
Wilson Ndayambadje Male
Wissam Ali Ouda Male
Wisut 

In [38]:
#turn the collected list of dictionaries into a df
df = pd.DataFrame(data=mydata)

#show its shape as (number of rows, number of columns)
df.shape

(1521, 2)

In [39]:
#preview the first rows only: display.max_rows is set to None in this notebook,
#so a bare `df` would render every row into the notebook output
df.head(10)

Unnamed: 0,name,gender
0,Alaa Taher Al-Hassanat,Female
1,Alexandra Tuttle,Female
2,Alison Parker,Female
3,Amparo Leonor Jiménez Pallares,Female
4,Anastasiya Baburova,Female
5,Anja Niedringhaus,Female
6,Anna Politkovskaya,Female
7,Asiya Jeelani,Female
8,Atwar Bahjat,Female
9,Audrey Gaid Estrada,Female


In [40]:
#write the df to a local CSV file, leaving out the index column
output_csv = 'CPJ_journalists_killed.csv'
df.to_csv(output_csv, index=False)

In [42]:
#end your browser session before you go
#quit() closes every window of the session and stops the chromedriver process,
#whereas close() would only close the current window
driver.quit()

# Readings

- CSS Selector Reference, χωρίς ημερομηνία. <i> w3schools </i>. Διαθέσιμο σε: https://www.w3schools.com/cssref/css_selectors.php (προσπελάστηκε: 6 Δεκεμβρίου 2023)
- Python While Else, χωρίς ημερομηνία. <i> w3schools </i>. Διαθέσιμο σε:  https://www.w3schools.com/python/gloss_python_while_else.asp (προσπελάστηκε: 6 Δεκεμβρίου 2023)
- Selenium with Python, χωρίς ημερομηνία. <i>Read the Docs</i>. Διαθέσιμο σε: https://selenium-python.readthedocs.io/ (προσπελάστηκε: 6 Δεκεμβρίου 2023)