# journalists-scraper

## Load Libraries

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import timedelta
from datetime import date
import csv
import pandas as pd
import time
import json
import fnmatch
import os
import tabula
from tabula.io import read_pdf
import urllib.request
from bs4 import BeautifulSoup
import re
from geopy.geocoders import Nominatim
# Lift pandas' display limits so DataFrames are rendered in full
# (None removes the cap on the number of columns / rows shown).
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

## Download csv

#### Download the [csv](https://cpj.org/data/killed/?status=Killed&motiveConfirmed%5B%5D=Confirmed&type%5B%5D=Journalist&start_year=1992&end_year=2021&group_by=year) from the CPJ website about journalists who were killed and load it in memory.

In [None]:
# Read the downloaded CPJ export from the working directory into a DataFrame,
# then show it as the cell output.
journalists_raw = pd.read_csv(filepath_or_buffer='journalists_killed.csv')
journalists_raw

## Scraper

#### More information about each journalist can be retrieved from their bio page, whose URL has the format 'https://cpj.org/data/people/< firstname-lastname >'

In [None]:
# Scrape a short description for every journalist from their CPJ bio page and
# store it in a new 'desc' column.
import urllib.error
import urllib.parse

# Text removed from the start of each bio before it is split into paragraphs
# (looks like the labels of the page's share buttons).
_SHARE_WIDGET_TEXT = 'Share this:TwitterFacebookWhatsAppLinkedInEmailTelegram '


def _first_paragraph(article_text):
    """Return the first paragraph of a bio's text, with newlines removed."""
    cleaned = article_text.replace(_SHARE_WIDGET_TEXT, '').strip()
    # A line break right after a full stop is treated as a paragraph boundary:
    # mark those breaks, then keep only what precedes the first marker.
    first = cleaned.replace('.\n', '.<br>').split('<br>')[0]
    return first.replace('\n', '')


def _fetch_description(url):
    """Return the bio description found at *url*, or None.

    None is returned when the page cannot be fetched or when it contains no
    <article class="entry-content"> element, so a missing bio never inherits
    the text scraped for the previous journalist. If a page has several
    matching elements, the last one is used.
    """
    try:
        # The context manager closes the HTTP response once it is parsed
        with urllib.request.urlopen(url) as response:
            page = BeautifulSoup(response, 'html.parser')
    except urllib.error.URLError as err:
        # HTTPError (e.g. 404) is a subclass of URLError. Keep going so that
        # one bad page does not abort the whole scrape.
        print('Could not fetch', url, '-', err)
        return None
    description = None
    for article in page.find_all('article', {"class": "entry-content"}):
        description = _first_paragraph(article.text)
    return description


# Define base URL - the first part of the URL that's constant for every journalist's bio
base_url = "https://cpj.org/data/people/"
# Journalists' names, in row order
names_list = journalists_raw["fullName"].tolist()
# Bio URLs: spaces become dashes and the result is percent-encoded, because
# urlopen cannot send a URL containing non-ASCII characters.
url_list = [
    base_url + urllib.parse.quote(name.replace(' ', '-'))
    for name in names_list
]
# One description per URL (None where nothing could be retrieved), so the list
# always has the same length as the dataframe.
desc_list = []
for link in url_list:
    desc_text = _fetch_description(link)
    # Print for testing
    print(desc_text)
    print('--')
    desc_list.append(desc_text)
# NOTE: plain assignment - journalists_with_desc is the same DataFrame object
# as journalists_raw, so the new column appears on both names.
journalists_with_desc = journalists_raw
# Create a column in our original df, and store these descriptions for each journalist
journalists_with_desc['desc'] = desc_list
# View the df
journalists_with_desc

#### Now, we have to add a new column to our df, which will have both city and country. This will be used to obtain coordinates for that location.

In [None]:
# Add a "city_country" column of the form "<location>, <country>".
# journalists_city_country is assigned without copying, so it refers to the
# same DataFrame object as journalists_with_desc.
journalists_city_country = journalists_with_desc
journalists_city_country["city_country"] = (
    journalists_city_country["location"]
    .add(", ")
    .add(journalists_city_country["country"])
)

journalists_city_country

#### Some locations were listed as "an area outside..." or "an area near...". Remove these prefixes so that only the place name remains.

In [None]:
# Clean the city_country column with string replacement: drop the
# "an area outside " / "an area near " prefixes and write 'Malé' without
# the accent.
journalists_city_country_clean = journalists_city_country
journalists_city_country_clean['city_country'] = (
    journalists_city_country_clean['city_country']
    .str.replace('an area outside ', '')
    .str.replace('an area near ', '')
    .str.replace('Malé', 'Male')
)


# journalists_city_country_clean.sort_values(by = ["city_country"], ascending=True)
journalists_city_country_clean

In [None]:
# Store this in a dataframe called jourcoords.
# Plain assignment: jourcoords is another name for the same DataFrame object
# as journalists_city_country_clean, not a copy.
jourcoords = journalists_city_country_clean
# Show the dataframe as the cell output
jourcoords

#### Now, we use the geopy package to get coordinates. Some locations cannot be geocoded and raise an error; for those we store the placeholder 'pass' and add the coordinates manually later.

In [None]:
# Geocode every city_country value into latitude / longitude lists.
# Places that cannot be resolved get the placeholder 'pass' in both lists so
# they can be filled in by hand afterwards.

# Place names to geocode, in row order
place_list = list(jourcoords["city_country"])

# geopy helpers needed only by this cell
from geopy.exc import GeopyError
from geopy.extra.rate_limiter import RateLimiter

# Placeholder pair stored when a place cannot be geocoded
_NOT_FOUND = ('pass', 'pass')

# Build the geocoder once instead of once per place
geolocator = Nominatim(user_agent="Aadit")
# The public Nominatim service permits at most one request per second;
# RateLimiter inserts the required delay between calls.
_rate_limited_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


def _lookup_coordinates(place):
    """Return (latitude, longitude) for *place*, or ('pass', 'pass')."""
    try:
        location = _rate_limited_geocode(place)
    except (GeopyError, RuntimeError, TypeError, AttributeError) as err:
        # Best effort: report the failure and leave the row for manual fixing
        print('Geocoding failed for', place, '-', err)
        return _NOT_FOUND
    if location is None:
        # The geocoder found no match for this query
        return _NOT_FOUND
    return (location.latitude, location.longitude)


# Results per distinct place, so repeated places are requested only once
_coords_by_place = {}
# Latitude and longitude lists, aligned with place_list
lat_list = []
long_list = []
for item in place_list:
    if not isinstance(item, str):
        # Missing values (e.g. NaN) must not be sent to the geocoder as text
        coords = _NOT_FOUND
    else:
        if item not in _coords_by_place:
            _coords_by_place[item] = _lookup_coordinates(item)
        coords = _coords_by_place[item]
    lat_list.append(coords[0])
    long_list.append(coords[1])
# Print for testing
print(len(long_list))
print(len(lat_list))

In [None]:
# Attach the geocoding results to jourcoords as the 'lat' and 'lon' columns,
# then show the dataframe as the cell output.
for _column, _values in (("lat", lat_list), ("lon", long_list)):
    jourcoords[_column] = _values
jourcoords

## Export csv

#### Add coordinates for the four locations that returned 'pass'

In [None]:
# Write the final dataframe (including its index) to a CSV file in the
# working directory.
jourcoords.to_csv(path_or_buf='final_file_clean.csv')