## Data from Wikipedia

In this part we scrape data from Wikipedia. We want to access the tables that list past terrorist attacks. Several Wikipedia articles (such as https://en.wikipedia.org/wiki/List_of_terrorist_incidents_in_January-June_2011) do exactly that. Every article that we need presents its data as tables.

In [39]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
%matplotlib inline

In [40]:
from datetime import date
import re

In [41]:
# Month name -> month number (1 to 12), built from the names in calendar order
month_to_int = dict(zip(
    [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ],
    range(1, 13),
))

# Month number -> month name (the inverse lookup)
int_to_month = dict(zip(month_to_int.values(), month_to_int.keys()))

In [42]:
# Common start of every article URL; the article-specific end is appended to it
base_url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_terrorist_incidents_in_'
)

Below is the list of all the articles that we are going to scrape for data:

In [43]:
# Maps the article-specific end of each wikipedia URL to the month numbers that article covers
times = {}

# 2011 to 2014: two articles per year, each one covering six months
for year in range(2011, 2015):
    times['January-June_' + str(year)] = list(range(1, 7))
    times['July-December_' + str(year)] = list(range(7, 13))

# 2015 to 2017: one article per month
for year in range(2015, 2018):
    for month, int_ in month_to_int.items():
        times[month + '_' + str(year)] = [int_]

list(times)

['January-June_2011',
 'July-December_2011',
 'January-June_2012',
 'July-December_2012',
 'January-June_2013',
 'July-December_2013',
 'January-June_2014',
 'July-December_2014',
 'January_2015',
 'February_2015',
 'March_2015',
 'April_2015',
 'May_2015',
 'June_2015',
 'July_2015',
 'August_2015',
 'September_2015',
 'October_2015',
 'November_2015',
 'December_2015',
 'January_2016',
 'February_2016',
 'March_2016',
 'April_2016',
 'May_2016',
 'June_2016',
 'July_2016',
 'August_2016',
 'September_2016',
 'October_2016',
 'November_2016',
 'December_2016',
 'January_2017',
 'February_2017',
 'March_2017',
 'April_2017',
 'May_2017',
 'June_2017',
 'July_2017',
 'August_2017',
 'September_2017',
 'October_2017',
 'November_2017',
 'December_2017']

In [44]:
def to_int(s):
    '''Returns the first integer found in s, or NaN if s contains no digit.

    A number written with comma thousands separators (e.g. \'1,234\') is read
    as one integer instead of being cut off at the first comma.
    '''
    # The comma-grouped form is tried first; the lookahead rejects malformed
    # groupings such as '1,2345', which then fall back to the plain digit run.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', s)
    if match is None:
        return float('NaN')
    return int(match.group().replace(',', ''))

In [45]:
def to_date(s, year):
    '''Returns a date from the datetime library from a string like \'January 1\'

    s: month name (a key of month_to_int), then whitespace, then the day of the month.
    year: the year, either as an int or as a string containing it (e.g. \'2011\').

    Raises KeyError for an unknown month name and IndexError when a part is missing.
    '''
    # split() without an argument tolerates leading, repeated or non-breaking
    # spaces, which split(' ') would turn into empty or glued-together parts
    parts = s.split()
    # str() lets callers pass the year as an int as well as a string
    return date(to_int(str(year)), month_to_int[parts[0]], to_int(parts[1]))

In [50]:
def wiki_table_to_df(end_url, month_range, base_url=base_url, timeout=30):
    '''Creates a dataframe from the tables available in the wikipedia page

    end_url: article-specific end of the URL, appended to base_url; its last
        four characters are used as the year of every row.
    month_range: month numbers paired, in order, with the page's
        \'wikitable sortable\' tables (surplus months or tables are ignored).
    base_url: common start of the article URL.
    timeout: seconds to wait for the server before requests gives up.

    Returns a DataFrame with the columns date, type, deaths, injuries and
    location, one row per table row that has at least one <td> cell.
    The HTTP status is not checked: a page without matching tables yields
    an empty DataFrame.
    '''
    print('Scraping for', end_url)
    # Without a timeout an unresponsive server would block the scrape forever
    r = requests.get(base_url + end_url, timeout=timeout) # Get request
    soup = BeautifulSoup(r.text, 'lxml') # Parse HTML
    wiki_tables = soup.find_all('table', {'class': 'wikitable sortable'}) # Get tables from the wikipedia page

    table = []

    for month_int, wiki_table in zip(month_range, wiki_tables):
        month_name = int_to_month[month_int]
        for row in wiki_table.find_all('tr'):
            elems = row.find_all('td')
            if not elems:
                # Rows without <td> cells (presumably header rows) carry no data
                continue
            interesting = [elem.text for elem in elems[:5]]
            # First element is the day of the month, but we add the name of the month as well in front of it
            interesting[0] = month_name + ' ' + interesting[0]
            table.append(interesting)

    df = pd.DataFrame(table, columns=['date', 'type', 'deaths', 'injuries', 'location'])
    year = end_url[-4:] # The year is defined by the end_url arg
    df['date'] = df['date'].apply(lambda s: to_date(s, year)) # Translate the date
    df['deaths'] = df['deaths'].apply(to_int) # Map death number to int
    df['injuries'] = df['injuries'].apply(to_int) # Map injuries number to int

    return df

In [56]:
# Scrape every article listed in `times` (2011 to 2017), one DataFrame per article
dfs = [wiki_table_to_df(end_url, months) for end_url, months in times.items()]

# Stack the per-article frames into one; rows keep the index they had in their own frame
df = pd.concat(dfs)
print('We have {} registered attacks from January 1st, 2011 up to today (November 28th, 2017)'.format(len(df)))

Scraping for January-June_2011
Scraping for July-December_2011
Scraping for January-June_2012
Scraping for July-December_2012
Scraping for January-June_2013
Scraping for July-December_2013
Scraping for January-June_2014
Scraping for July-December_2014
Scraping for January_2015
Scraping for February_2015
Scraping for March_2015
Scraping for April_2015
Scraping for May_2015
Scraping for June_2015
Scraping for July_2015
Scraping for August_2015
Scraping for September_2015
Scraping for October_2015
Scraping for November_2015
Scraping for December_2015
Scraping for January_2016
Scraping for February_2016
Scraping for March_2016
Scraping for April_2016
Scraping for May_2016
Scraping for June_2016
Scraping for July_2016
Scraping for August_2016
Scraping for September_2016
Scraping for October_2016
Scraping for November_2016
Scraping for December_2016
Scraping for January_2017
Scraping for February_2017
Scraping for March_2017
Scraping for April_2017
Scraping for May_2017
Scraping for June_201

In [57]:
# Give the rows one running index; the previous per-article index is kept as an 'index' column
df = df.reset_index(drop=False)

Here is what some of the entries of the final result look like

In [58]:
# Show the first row, two rows further down and the last row, with all columns
df.iloc[[0, 56, 1033, -1], :]

Unnamed: 0,index,date,type,deaths,injuries,location
0,0,2011-01-01,Suicide bombing,21.0,97.0,"Alexandria, Egypt"
56,56,2011-02-13,Raid,7.0,5.0,"Zamboanga, Philippines"
1033,37,2014-11-18,"Shooting, Melee attack",5.0,7.0,"Jerusalem, Israel"
4663,41,2017-11-28,Bombing,8.0,,"Kandahar province, Afghanistan"


In [59]:
# Save the result as CSV (the DataFrame index is written out as the first column)
df.to_csv('attacks.csv', index=True)