In [1]:
import pandas as pd
import json
import folium
import numpy as np
import datetime

In [2]:
import requests
from bs4 import BeautifulSoup
%matplotlib inline

In [3]:
from datetime import date
import re

In [4]:
# English month names in calendar order; both lookup tables below derive from it
_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]

# Month name -> month number (1-12)
month_to_int = {name: number for number, name in enumerate(_MONTH_NAMES, start=1)}

# Month number (1-12) -> month name
int_to_month = dict(enumerate(_MONTH_NAMES, start=1))

In [5]:
# Common prefix of every scraped Wikipedia article URL; the article-specific
# suffix (a key of `times`, e.g. 'January_2015') is appended to it when scraping
base_url = 'https://en.wikipedia.org/wiki/List_of_terrorist_incidents_in_'

In [6]:
# Maps each article-specific URL suffix to the month numbers that article covers
times = {}

# 2011 to 2014: one article per half-year
for year in range(2011, 2015):
    times['January-June_{}'.format(year)] = list(range(1, 7))
    times['July-December_{}'.format(year)] = list(range(7, 13))

# 2015 to 2017: one article per month
for year in range(2015, 2018):
    for month, int_ in month_to_int.items():
        times['{}_{}'.format(month, year)] = [int_]


In [7]:
def to_int(s):
    '''Returns the first integer found in s, or NaN when s contains no digit.

    A number written with comma thousands separators is read as a whole,
    so '1,095' gives 1095 rather than 1. Anything after the first number
    (ranges, footnote markers, trailing text) is ignored.
    '''
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # The comma-grouped form is tried first, then a plain run of digits.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', s)
    if match is None:
        return float('NaN')
    return int(match.group().replace(',', ''))

In [8]:
def to_date(s, year):
    '''Builds a datetime.date from a "<month name> <day>" string and a year.

    s is split on single spaces: the first piece is looked up in month_to_int
    and the first integer of the second piece is the day. year goes through
    to_int as well, so it may be given as a string such as "2015".
    '''
    pieces = s.split(' ')
    year_number = to_int(year)
    month_number = month_to_int[pieces[0]]
    day_number = to_int(pieces[1])
    return date(year_number, month_number, day_number)

In [9]:
def wiki_table_to_df(end_url, month_range, base_url=base_url):
    '''Creates a dataframe from the tables available in the wikipedia page.

    Parameters:
        end_url: article-specific URL suffix appended to base_url; its last
            four characters are used as the year of every row.
        month_range: month numbers paired, in order, with the
            'wikitable sortable' tables found in the page.
        base_url: common URL prefix of the articles.

    Returns:
        A DataFrame with the columns date, type, deaths, injuries and location.
        It is empty when the page cannot be fetched or holds no usable row.
    '''
    print('Scraping for', end_url)
    # A timeout keeps one unresponsive request from hanging the whole notebook
    r = requests.get(base_url + end_url, timeout=30)

    if r.ok:
        soup = BeautifulSoup(r.text, 'lxml') # Parse HTML
        wiki_tables = soup.find_all('table', {'class': 'wikitable sortable'}) # Get tables from the wikipedia page
    else:
        # Say so explicitly instead of silently parsing an error page
        print('  HTTP status', r.status_code, '- no rows scraped for', end_url)
        wiki_tables = []

    table = []

    for month_int, wiki_table in zip(month_range, wiki_tables):
        for row in wiki_table.find_all('tr'):
            elems = row.find_all('td')
            # Header rows have no <td>; rows with fewer than five cells cannot
            # fill the five columns and would leave None values behind
            if len(elems) < 5:
                continue
            interesting = [elem.text for elem in elems[:5]]
            # First element is the day of the month, but we add the name of the month as well in front of it
            interesting[0] = int_to_month[month_int] + ' ' + interesting[0]
            table.append(interesting)

    df = pd.DataFrame(table, columns=['date', 'type', 'deaths', 'injuries', 'location'])
    year = end_url[-4:] # The year is the end of the end_url arg
    df['date'] = df['date'].apply(lambda s: to_date(s, year))
    df['deaths'] = df['deaths'].apply(to_int) # Map death number to int
    df['injuries'] = df['injuries'].apply(to_int) # Map injuries number to int

    return df

In [39]:
# Get a DataFrame for every article from 2011 to 2017
dfs = [wiki_table_to_df(end_url, month_range) for end_url, month_range in times.items()]

# ignore_index gives every attack its own row label; without it each article
# restarts at 0 and the combined frame carries duplicate index values
df = pd.concat(dfs, ignore_index=True)

# Read the clock once so day, month and year always belong to the same date
today = date.today()
print('We have {} registered attacks from 1-1-2011 up to today ({}-{}-{})'.format(df.shape[0], today.day, today.month, today.year))

Scraping for January-June_2011
Scraping for July-December_2011
Scraping for January-June_2012
Scraping for July-December_2012
Scraping for January-June_2013
Scraping for July-December_2013
Scraping for January-June_2014
Scraping for July-December_2014
Scraping for January_2015
Scraping for February_2015
Scraping for March_2015
Scraping for April_2015
Scraping for May_2015
Scraping for June_2015
Scraping for July_2015
Scraping for August_2015
Scraping for September_2015
Scraping for October_2015
Scraping for November_2015
Scraping for December_2015
Scraping for January_2016
Scraping for February_2016
Scraping for March_2016
Scraping for April_2016
Scraping for May_2016
Scraping for June_2016
Scraping for July_2016
Scraping for August_2016
Scraping for September_2016
Scraping for October_2016
Scraping for November_2016
Scraping for December_2016
Scraping for January_2017
Scraping for February_2017
Scraping for March_2017
Scraping for April_2017
Scraping for May_2017
Scraping for June_201

In [11]:
# Words and phrases that remove_useless strips from a city name.
# Entries are applied in list order, so a phrase such as 'south of' must come
# before the shorter word it contains ('south').
useless_words_cities = [
    'near',
    'far',
    'across',
    'outskirts of',
    'south of',
    'north of',  # this comma was missing, which fused the entry with 'south' into 'north ofsouth'
    'south',
    'north',
    'east',
    'west',
    'various places in',
    'border crossing',
    'industrial zone',
    'region',
    'province',
    'district'
]

# Qualifiers that special_cases strips from a comma-less location, in this order
useless_words_countries = [
    ' and ',
    'northern', 'southern', 'eastern', 'northwestern',
    'east', 'across', 'central', 'northwest',
    'border region', 'prospective'
]

def remove_useless(s, useless_words=None):
    '''Strips filler words from a city description and returns it title-cased.

    Parameters:
        s: raw city text.
        useless_words: words or phrases to remove, applied in order;
            defaults to useless_words_cities.

    Only whole words are removed, so 'far' no longer eats the start of
    'Farah'. When several places are joined by ' and ' or '/', only the
    first one is kept.
    '''
    if useless_words is None:
        useless_words = useless_words_cities

    s = s.lower()
    for uw in useless_words:
        # \b anchors the match on word boundaries; the optional space swallows
        # the separator that followed the removed word
        s = re.sub(r'\b' + re.escape(uw) + r'\b ?', '', s)

    if ' and ' in s:
        s = s.split(' and ')[0]

    if '/' in s:
        s = s.split('/')[0]

    return s.title()

def clean_country(country):
    '''Normalises the country part of a location.

    Every (marker, name) pair below is checked in order; whenever the marker
    occurs in the current value, the value is replaced by the name. The result
    then goes through clean_extremities.
    '''
    replacements = (
        ('Nigeria', 'Nigeria'),
        ('Jerusalem', 'Israel'),
        ('West Bank', 'Palestine'),
        ('Congo', 'Congo'),
        ('Dagestan', 'Russia'),
        ('Abkhazia', 'Russia'),
        ('Balochistan', 'Pakistan'),
        ('Hebron', 'Palestine'),
        ('Burma', 'Myanmar'),
        ('Northern Ireland', 'United Kingdom'),
    )

    for marker, name in replacements:
        if marker in country:
            country = name

    return clean_extremities(country)

def clean_extremities(s):
    '''Returns s without its leading and trailing non-letter characters.

    Characters inside the string are left alone; a string that holds no
    letter at all comes back empty.
    '''
    start, stop = 0, len(s)

    # Move in from the left up to the first letter
    while start < stop and not s[start].isalpha():
        start += 1

    # Move in from the right down to the last letter
    while stop > start and not s[stop - 1].isalpha():
        stop -= 1

    return s[start:stop]

def special_cases(location):
    '''Handles a location given without a comma: returns ('Unknown', country).

    The text is lower-cased, every entry of useless_words_countries is removed
    (first with a trailing space, then bare), Jerusalem and the West Bank are
    mapped to Israel and Palestine, and the result is title-cased.
    '''
    text = location.lower()

    for word in useless_words_countries:
        text = text.replace(word + ' ', '').replace(word, '')

    if 'jerusalem' in text:
        text = 'Israel'

    if 'west bank' in text:
        text = 'Palestine'

    return 'Unknown', text.title()

# Locations that the generic parsing gets wrong, mapped to their (city, country).
# Built once here instead of on every call of city_country_from_location.
_REALLY_SPECIAL_LOCATIONS = {
    'Zliten Libya': ('Zliten', 'Libya'),
    'Alau.Nigeria': ('Alau', 'Nigeria'),
    'Damascus Syria': ('Damascus', 'Syria'),
    'Kibirizi Democratic Republic of Congo': ('Kibirizi', 'Congo'),
    'Belfast Northern Ireland': ('Belfast', 'United Kingdom'),
    'Oberhausen Germany': ('Oberhausen', 'Germany'),
    'Lahj Governorate Yemen': ('Unknown', 'Yemen'),
    'El-Baraf Somalia': ('Unknown', 'Somalia'),
    'Baghdad Iraq': ('Baghdad', 'Iraq'),
    'Luqa Malta': ('Luqa', 'Malta'),
    'Muradiye Turkey': ('Muradiye', 'Turkey'),
    'Concepción Paraguay': ('Concepción', 'Paraguay'),
    'Deir Ez-Zor Syria': ('Deir Ez-Zor', 'Syria'),
    'Deir ez-Zor Syria': ('Deir Ez-Zor', 'Syria'),
    'Indian Ocean': ('Unknown', 'Indian Ocean'),
    'Qamishli Syria': ('Qamishli', 'Syria'),
    'Oignies, Pas-de-Calais': ('Oignies', 'France'),
    'Khost Province': ('Unknown', 'Afghanistan'),
    'Ratchaprasong Intersection, Bangkok': ('Bangkok', 'Thailand'),
    'Beit Hanun, Gaza Strip': ('Beit Hanun', 'Palestine'),
    'Crimea': ('Unknown', 'Russia'),
    'Moyen-Cavally, Côte d\'Ivoire': ('Moyen-Cavally', 'Ivory Coast'),
    'Lawdar, Lahij Yemen': ('Lawdar', 'Yemen'),
    'Atizapán de Zaragoza, México': ('Atizapán de Zaragoza', 'Mexico'),
    'Queens, New York': ('New York City', 'United States'),
    'Jonglei, Southern Sudan': ('Jonglei', 'South Sudan'),
    'Austin, Texas': ('Austin', 'United States'),
    'Karabudakhkent, Dagestan, Russian Federation': ('Karabudakhkent', 'Russia')
}

def city_country_from_location(location):
    '''Splits a raw location string into a (city, country) tuple.

    The string is first trimmed with clean_extremities. A location listed in
    _REALLY_SPECIAL_LOCATIONS gets its hand-written answer. Otherwise, when
    the text has a comma, the part before the first comma is the city and the
    part after the last comma is the country; without a comma the whole text
    is handed to special_cases. An empty city becomes 'Unknown'.
    '''
    location = clean_extremities(location)

    if location in _REALLY_SPECIAL_LOCATIONS:
        return _REALLY_SPECIAL_LOCATIONS[location]

    if ',' not in location:
        return special_cases(location)

    parts = location.split(',')
    city = clean_extremities(remove_useless(parts[0]))
    country = clean_country(parts[-1])

    if len(city) == 0:
        city = 'Unknown'

    return city, country

In [12]:
# One (city, country) tuple per attack, laid out as a two-column frame in a
# single step; .apply(pd.Series) would build a separate Series for every row
cities_countries = pd.DataFrame(
    df.location.map(city_country_from_location).tolist(),
    index=df.index,
    columns=[0, 1]
)
df['city'] = cities_countries[0]
df['country'] = cities_countries[1]

In [13]:
# df = df.drop('location', axis=1)

In [14]:
# Save the cleaned attacks to disk, without the row index
df.to_csv(
    'attacks.csv',
    index=False,
    encoding='utf-8'
)

-----------------------

In [15]:
# Number of attacks per country: count the rows of each country, keep the
# count of the 'date' column and call it 'number'
attacks_per_country = (
    df.groupby(by=['country'], as_index=False)
      .count()
      .loc[:, ['country', 'date']]
      .sort_values(by=['country'])
      .rename(columns={'date': 'number'})
)

In [16]:
# World country shapes; the with-block closes the file once it has been parsed
with open('custom.geo.json', encoding='utf-8') as geojson_file:
    geojson_world = json.load(geojson_file)

In [17]:
# The 'name_sort' property of every feature of the GeoJSON, in file order
world_countries = [
    feature['properties']['name_sort']
    for feature in geojson_world['features']
]

In [18]:
# Country names as they appear in the attacks data, mapped to the name used
# for the same country in the 'name_sort' property of the GeoJSON
map_ = {
    'Bahamas':               'Bahamas, The',
    'Congo':                 'Congo, Dem. Rep.',
    'Egypt':                 'Egypt, Arab Rep.',
    'Iran':                  'Iran, Islamic Rep.',
    'Ivory Coast':           'Côte d\'Ivoire',
    'Kyrgyzstan':            'Kyrgyz Republic',
    'Laos':                  'Lao PDR',
    'Palestine':             'Palestine (West Bank and Gaza)',
    'Republic of Ireland':   'Ireland',
    'Republic of Macedonia': 'Macedonia, FYR',
    'Russia':                'Russian Federation',
    'South Korea':           'Korea, Rep.',
    'Syria':                 'Syrian Arab Republic',
    'United States':         'United States of America',
    'Venezuela':             'Venezuela, RB',
    'Yemen':                 'Yemen, Rep.',
    'Indian Ocean':          'Indian Ocean Territories'
}

In [19]:
# Switch to the GeoJSON spelling where map_ has one; other names stay as they are
attacks_per_country['country'] = attacks_per_country['country'].map(lambda c: map_.get(c, c))

In [20]:
# Report every country of the data that has no counterpart in the GeoJSON
for c in attacks_per_country.country:
    if c in world_countries:
        continue
    print('NO ', c)

In [21]:
def map_number_color(number, max_number=1071):
    '''Returns the '#rrggbb' colour of a country from its number of attacks.

    Parameters:
        number: number of attacks.
        max_number: count that maps to the darkest colour; must be greater
            than 1. The default is the value previously hard-coded here.

    Red is fixed at 0xec and blue at 0x00. Green falls from 202 (one attack)
    to 0 (max_number attacks) along a square-root scale. Counts beyond
    max_number are clamped to the darkest colour instead of producing a
    negative channel and meaningless hex digits.
    '''
    scaled = np.sqrt(number)
    scaled_max = np.sqrt(max_number)
    green = int(202 - (scaled - 1) / (scaled_max - 1) * 202)
    green = max(0, min(255, green))  # keep the channel inside one byte
    return '#ec{:02x}00'.format(green)

In [22]:
def color(name):
    '''Returns the fill colour of the country called name.

    A country found in attacks_per_country gets the colour computed by
    map_number_color from its number of attacks; any other name gets the
    grey '#aeae9e'.
    '''
    if name not in list(attacks_per_country.country):
        return '#aeae9e'

    rows = attacks_per_country[attacks_per_country.country == name]
    return map_number_color(rows['number'].values[0])

In [23]:
# Base map the markers are drawn on
world = folium.Map(
    location=[30, 5],
    tiles='cartodbpositron',
    zoom_start=2
)

In [24]:
# folium.GeoJson(
#     geojson_world,
#     style_function=lambda feature: {
#             'fillColor': color(feature['properties']['name_sort']),
#             'color': '#151515',
#             'fillOpacity': 0.5,
#             'weight': 1
#     }
# ).add_to(world)

In [25]:
from folium.plugins import MarkerCluster

In [26]:
def radius(country):
    '''Returns the circle radius for country: 30000 times the integer part of
    the square root of its number of attacks.

    Raises IndexError when country is not listed in attacks_per_country.
    '''
    counts = attacks_per_country.loc[attacks_per_country.country == country, 'number']
    return 30000 * int(np.sqrt(counts.values[0]))

In [27]:
# Per-country records read by add_markers; the with-block closes the file, and
# the explicit encoding matches the way the GeoJSON file is opened
with open('clean.json', encoding='utf-8') as clean_file:
    clean = json.load(clean_file)

In [28]:
def add_markers(map_):
    '''Adds one translucent red circle per entry of clean to map_.

    Each entry supplies the centre of the circle ('lat', 'lon') and the
    country name ('name') from which radius computes its size.
    '''
    for country in clean:
        centre = [float(country['lat']), float(country['lon'])]
        circle = folium.Circle(
            location=centre,
            radius=radius(country['name']),
            fill=True,
            fill_color='#ff0000',
            fill_opacity=0.2,
            opacity=0
        )
        circle.add_to(map_)

In [29]:
# Draw the per-country circles on the world map
add_markers(world)

In [30]:
# Write the finished map to an HTML file
world.save('test.html')