In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
%matplotlib inline

In [2]:
from datetime import date
import re

In [36]:
# English month name -> calendar month number (1-12), built by numbering the
# names in calendar order.
month_to_int = {
    name: number
    for number, name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}

# Inverse lookup: calendar month number -> English month name.
int_to_month = dict(zip(month_to_int.values(), month_to_int.keys()))

In [37]:
# Common prefix of every Wikipedia page that gets scraped; a page-specific
# suffix such as 'January-June_2011' is appended to it.
base_url = 'https://en.wikipedia.org/wiki/List_of_terrorist_incidents_in_'

# Page suffixes for the half-year pages of 2011 through 2014, in
# chronological order.
times = [
    half + '_' + str(year)
    for year in (2011, 2012, 2013, 2014)
    for half in ('January-June', 'July-December')
]

In [38]:
# The original call used the undefined name `months_to_int` and raised
# NameError on a fresh kernel. `dict.update()` with no arguments changes
# nothing, so this cell is a harmless no-op placeholder for extra entries.
month_to_int.update()

In [39]:
# Map each Wikipedia page suffix to the list of month numbers that page covers.
times = {}

# 2011-2014: two pages per year, each covering six months.
for year in range(2011, 2015):
    times['January-June_%d' % year] = list(range(1, 7))
    times['July-December_%d' % year] = list(range(7, 13))

# 2015-2017: one page per calendar month.
for year in range(2015, 2018):
    for month, int_ in month_to_int.items():
        times['%s_%d' % (month, year)] = [int_]

times

{'April_2015': [4],
 'April_2016': [4],
 'April_2017': [4],
 'August_2015': [8],
 'August_2016': [8],
 'August_2017': [8],
 'December_2015': [12],
 'December_2016': [12],
 'December_2017': [12],
 'February_2015': [2],
 'February_2016': [2],
 'February_2017': [2],
 'January-June_2011': [1, 2, 3, 4, 5, 6],
 'January-June_2012': [1, 2, 3, 4, 5, 6],
 'January-June_2013': [1, 2, 3, 4, 5, 6],
 'January-June_2014': [1, 2, 3, 4, 5, 6],
 'January_2015': [1],
 'January_2016': [1],
 'January_2017': [1],
 'July-December_2011': [7, 8, 9, 10, 11, 12],
 'July-December_2012': [7, 8, 9, 10, 11, 12],
 'July-December_2013': [7, 8, 9, 10, 11, 12],
 'July-December_2014': [7, 8, 9, 10, 11, 12],
 'July_2015': [7],
 'July_2016': [7],
 'July_2017': [7],
 'June_2015': [6],
 'June_2016': [6],
 'June_2017': [6],
 'March_2015': [3],
 'March_2016': [3],
 'March_2017': [3],
 'May_2015': [5],
 'May_2016': [5],
 'May_2017': [5],
 'November_2015': [11],
 'November_2016': [11],
 'November_2017': [11],
 'October_2015': [

In [40]:
def to_int(s):
    """Return the first integer that appears in the string ``s``.

    A number written with well-formed thousands separators (``"1,200"``) is
    read as one value (1200). Any other run of digits is read as-is, so
    ``"12[3]"`` gives 12 and ``"3, 100"`` gives 3.

    Parameters
    ----------
    s : str
        Text to search, e.g. the contents of a table cell.

    Returns
    -------
    int or float
        The parsed integer, or ``float('NaN')`` when ``s`` has no digits.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # The first alternative accepts groups such as 1,200 or 12,345,678 and
    # refuses to stop in the middle of a longer digit run; the second is the
    # plain "first run of digits" fallback.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', s)
    if match is None:
        return float('NaN')
    return int(match.group().replace(',', ''))

In [41]:
def to_date(s, year):
    """Build a ``datetime.date`` from a "<Month> <day>" string and a year.

    Parameters
    ----------
    s : str
        Text whose first space-separated token is an English month name
        (a key of ``month_to_int``) and whose second token contains the
        day number, e.g. ``"January 4"``.
    year : str
        Text containing the year digits, e.g. ``"2011"``.

    Returns
    -------
    datetime.date
    """
    parts = s.split(' ')
    # Only the first integer found in each piece is used (see to_int).
    return date(
        year=to_int(year),
        month=month_to_int[parts[0]],
        day=to_int(parts[1]),
    )

In [57]:
def wiki_table_to_df(end_url, month_range, base_url=base_url, timeout=30):
    """Scrape one Wikipedia incident-list page into a DataFrame.

    The page at ``base_url + end_url`` is downloaded and each
    ``<table class="wikitable sortable">`` on it is paired, in document
    order, with the month numbers in ``month_range``. This assumes the page
    holds one such table per month, in order -- TODO confirm against the
    live page layout; surplus tables or months are silently ignored by
    ``zip``.

    Parameters
    ----------
    end_url : str
        Page suffix such as ``'January-June_2011'``. Its last four
        characters are used as the year of every row.
    month_range : iterable of int
        Month numbers (keys of ``int_to_month``), one per table.
    base_url : str, optional
        URL prefix; defaults to the module-level ``base_url`` as it was when
        this function was defined.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (``datetime.date``), ``type``, ``deaths``,
        ``injuries`` and ``location``. ``deaths`` and ``injuries`` go
        through ``to_int`` and are NaN where the cell has no digits.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    print(end_url)  # progress trace, one line per page
    r = requests.get(base_url + end_url, timeout=timeout)
    # Fail loudly rather than parsing an error page into an empty frame.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    wiki_tables = soup.find_all('table', {'class': 'wikitable sortable'})

    year = end_url[-4:]
    table = []

    for month_int, wiki_table in zip(month_range, wiki_tables):
        month_name = int_to_month[month_int]
        for row in wiki_table.find_all('tr'):
            elems = row.find_all('td')
            if not elems:
                # Rows without any <td> cell (e.g. header rows) carry no data.
                continue
            # Keep the first five cells, trimmed of surrounding whitespace
            # and newlines left over from the HTML.
            interesting = [elem.text.strip() for elem in elems[:5]]
            # The date cell holds only the day; prefix the month name so
            # to_date can parse it.
            interesting[0] = month_name + ' ' + interesting[0]
            table.append(interesting)

    df = pd.DataFrame(table, columns=['date', 'type', 'deaths', 'injuries', 'location'])
    df['date'] = df['date'].apply(lambda s: to_date(s, year))
    df['deaths'] = df['deaths'].apply(to_int)
    df['injuries'] = df['injuries'].apply(to_int)

    return df

In [64]:
# Scrape every page listed in `times` (one HTTP request per page) and stack
# the per-page frames into a single DataFrame. Each frame keeps its own row
# index here.
dfs = [
    wiki_table_to_df(time, month_range)
    for time, month_range in times.items()
]

df = pd.concat(dfs)

January-June_2011
July-December_2011
January-June_2012
July-December_2012
January-June_2013
July-December_2013
January-June_2014
July-December_2014
January_2015
February_2015
March_2015
April_2015
May_2015
June_2015
July_2015
August_2015
September_2015
October_2015
November_2015
December_2015
January_2016
February_2016
March_2016
April_2016
May_2016
June_2016
July_2016
August_2016
September_2016
October_2016
November_2016
December_2016
January_2017
February_2017
March_2017
April_2017
May_2017
June_2017
July_2017
August_2017
September_2017
October_2017
November_2017
December_2017


Unnamed: 0,index,date,type,deaths,injuries,location
0,0,2011-01-01,Suicide bombing,21.0,97.0,"Alexandria, Egypt"
1,1,2011-01-04,Assassination,1.0,0.0,"Islamabad, Pakistan"
2,2,2011-01-04,Bombing,4.0,26.0,"Abuja, Nigeria"
3,3,2011-01-07,Kidnapping,9.0,,Niger
4,4,2011-01-07,Suicide bombing,17.0,23.0,"Spin Boldak, Afghanistan"
5,5,2011-01-08,Ambush,16.0,14.0,"Lawdar, Lahij Yemen"
6,6,2011-01-11,Ambush,10.0,18.0,"South Kordofan, Sudan"
7,7,2011-01-11,"Violence, fighting",1.0,4.0,"Abidjan, Ivory Coast"
8,8,2011-01-11,"Shooting, riot",1.0,4.0,"Samalut, Egypt"
9,9,2011-01-12,Bombing,2.0,7.0,"Peshawar, Pakistan"


In [65]:
# Write the combined table to disk. The old per-page row index is kept as an
# 'index' column and the fresh running index is written as the unnamed
# first column.
df.reset_index(drop=False).to_csv(path_or_buf='attacks.csv', index=True)