In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
%matplotlib inline

In [3]:
from datetime import date
import re

In [4]:
# Month name -> month number (1-12), derived from the names listed in calendar order
month_to_int = {
    name: number
    for number, name in enumerate(
        [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ],
        start=1,
    )
}

# Month number -> month name (inverse lookup of the map above)
int_to_month = dict(zip(month_to_int.values(), month_to_int.keys()))

In [5]:
# Common prefix shared by every article URL; a period-specific suffix
# (e.g. 'January-June_2011' or 'March_2016') is appended to form the full address
base_url = 'https://en.wikipedia.org/wiki/List_of_terrorist_incidents_in_'

In [6]:
# Maps each article's URL suffix to the list of month numbers that the article covers
times = {}

# 2011-2014: two articles per year, each spanning six months
for year in range(2011, 2015):
    for label, first, last in (('January-June', 1, 6), ('July-December', 7, 12)):
        times['{}_{}'.format(label, year)] = list(range(first, last + 1))

# 2015-2017: one article per month
for year in range(2015, 2018):
    for month, month_number in month_to_int.items():
        times['{}_{}'.format(month, year)] = [month_number]

times

{'April_2015': [4],
 'April_2016': [4],
 'April_2017': [4],
 'August_2015': [8],
 'August_2016': [8],
 'August_2017': [8],
 'December_2015': [12],
 'December_2016': [12],
 'December_2017': [12],
 'February_2015': [2],
 'February_2016': [2],
 'February_2017': [2],
 'January-June_2011': [1, 2, 3, 4, 5, 6],
 'January-June_2012': [1, 2, 3, 4, 5, 6],
 'January-June_2013': [1, 2, 3, 4, 5, 6],
 'January-June_2014': [1, 2, 3, 4, 5, 6],
 'January_2015': [1],
 'January_2016': [1],
 'January_2017': [1],
 'July-December_2011': [7, 8, 9, 10, 11, 12],
 'July-December_2012': [7, 8, 9, 10, 11, 12],
 'July-December_2013': [7, 8, 9, 10, 11, 12],
 'July-December_2014': [7, 8, 9, 10, 11, 12],
 'July_2015': [7],
 'July_2016': [7],
 'July_2017': [7],
 'June_2015': [6],
 'June_2016': [6],
 'June_2017': [6],
 'March_2015': [3],
 'March_2016': [3],
 'March_2017': [3],
 'May_2015': [5],
 'May_2016': [5],
 'May_2017': [5],
 'November_2015': [11],
 'November_2016': [11],
 'November_2017': [11],
 'October_2015': [

In [7]:
def to_int(s):
    """Return the first integer found in ``s``.

    A number written with comma thousands separators (e.g. ``'1,200'``) is
    read as a single value instead of being cut off at the first comma.

    Parameters
    ----------
    s : str
        Text to search, e.g. the contents of a table cell.

    Returns
    -------
    int or float
        The first integer in ``s``, or ``float('NaN')`` when ``s`` contains
        no digits.
    """
    # First alternative: a comma-grouped number such as '1,200' or '5,000,000'.
    # The trailing lookahead rejects text like '10,2011', which is not a genuine
    # thousands grouping. Second alternative: a plain run of digits.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', s)
    if match is None:
        return float('NaN')
    return int(match.group().replace(',', ''))

In [8]:
def to_date(s, year):
    """Build a ``datetime.date`` from a month-and-day string such as ``'January 1'``.

    Parameters
    ----------
    s : str
        Month name (a key of ``month_to_int``) followed by whitespace and the
        day of the month. Only the first integer of the day part is used.
    year : str or int
        The year, either as a number or as text containing it (e.g. ``'2011'``).

    Returns
    -------
    datetime.date

    Raises
    ------
    KeyError
        If the first word of ``s`` is not a key of ``month_to_int``.
    IndexError
        If ``s`` has nothing after the month name.
    TypeError
        If the day part or ``year`` contains no digits, because ``to_int``
        then yields NaN, which ``date`` does not accept.
    """
    # split() without an argument collapses runs of whitespace, so an extra space
    # or a newline between month and day does not produce an empty day field
    parts = s.split()
    # str() allows the year to be passed as an int as well as a string
    return date(to_int(str(year)), month_to_int[parts[0]], to_int(parts[1]))

In [9]:
def wiki_table_to_df(end_url, month_range, base_url=base_url, timeout=30):
    """Download one Wikipedia article and return its incident tables as a DataFrame.

    Parameters
    ----------
    end_url : str
        Article-specific URL suffix appended to ``base_url``. Its last four
        characters are used as the year of every row.
    month_range : iterable of int
        Month numbers, paired in order with the tables found on the page: the
        first table is dated with the first month, and so on. Pairing stops at
        the shorter of the two sequences.
    base_url : str, optional
        URL prefix shared by all articles.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (``datetime.date``), ``type``, ``deaths``,
        ``injuries`` and ``location``. ``deaths`` and ``injuries`` hold the
        first integer found in the cell, or NaN when there is none.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; otherwise an error page
        would silently be parsed into an empty frame.
    """
    print(end_url)  # progress trace: one line per downloaded article
    response = requests.get(base_url + end_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')  # Parse HTML
    # Tables on the page carrying the 'wikitable sortable' class
    wiki_tables = soup.find_all('table', {'class': 'wikitable sortable'})

    year = end_url[-4:]
    table = []

    for month_int, wiki_table in zip(month_range, wiki_tables):
        month_name = int_to_month[month_int]
        for row in wiki_table.find_all('tr'):
            elems = row.find_all('td')
            if not elems:
                # Rows without any <td> cell (presumably header rows) carry no data
                continue
            # Keep at most the first five cells of the row.
            # NOTE(review): rows with fewer than five cells are not handled
            # specially here - confirm the pages never produce them.
            interesting = [elem.text for elem in elems[:5]]
            # The first cell is the day of the month; prefix it with the month name
            interesting[0] = month_name + ' ' + interesting[0]
            table.append(interesting)

    df = pd.DataFrame(table, columns=['date', 'type', 'deaths', 'injuries', 'location'])
    # Bracket assignment is used rather than attribute assignment so that the
    # columns are always the target of the write
    df['date'] = df['date'].apply(lambda s: to_date(s, year))
    df['deaths'] = df['deaths'].apply(to_int)
    df['injuries'] = df['injuries'].apply(to_int)

    return df

In [10]:
# Scrape every article from 2011 to 2017, producing one DataFrame per page
dfs = [wiki_table_to_df(end_url, months) for end_url, months in times.items()]

# Stack the per-article frames into a single DataFrame
df = pd.concat(dfs)

January-June_2011
July-December_2011
January-June_2012
July-December_2012
January-June_2013
July-December_2013
January-June_2014
July-December_2014
January_2015
February_2015
March_2015
April_2015
May_2015
June_2015
July_2015
August_2015
September_2015
October_2015
November_2015
December_2015
January_2016
February_2016
March_2016
April_2016
May_2016
June_2016
July_2016
August_2016
September_2016
October_2016
November_2016
December_2016
January_2017
February_2017
March_2017
April_2017
May_2017
June_2017
July_2017
August_2017
September_2017
October_2017
November_2017
December_2017


In [11]:
# Give the combined frame a fresh index (the previous row labels are kept as a
# column by reset_index), then write the result to disk
attacks = df.reset_index()
attacks.to_csv('attacks.csv')

In [29]:
# Scratch example: append each value of `b` to the list stored under the same key in `a`
a = {1: ['a', 'b'], 2: ['c', 'd']}
b = {1: 'e', 2: 'f'}

for key in b:
    a[key].append(b[key])

In [30]:
# Display `a` after the values of `b` were appended to its lists
a

{1: ['a', 'b', 'e'], 2: ['c', 'd', 'f']}

In [32]:
# Scratch example: add a brand-new key holding a one-element list
a.update({3: ['r']})

In [33]:
# Display `a` again, now including the newly added key 3
a

{1: ['a', 'b', 'e'], 2: ['c', 'd', 'f'], 3: ['r']}

In [36]:
# Reset both scratch names to fresh, independent empty dictionaries
a = {}
b = {}