This notebook performs country identification, cleaning, and feature engineering for the pirate attack data.

In [1]:
import numpy as np
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import time

In [2]:
# Load the raw anti-shipping activity messages and keep only rows with no missing values.
df = pd.read_csv('./datasets/Anti-Shipping_Activity_Messages.csv').dropna()

In [3]:
# Turn both coordinate columns into strings so they can be joined into one query string.
df['Y'] = df['Y'].map(str)
df['X'] = df['X'].map(str)

In [4]:
# "Y,X" string; this is the value handed to closest_country for reverse geocoding.
df['coords'] = df['Y'].str.cat(df['X'], sep = ',')

In [5]:
# Shared Nominatim client used by closest_country.
# - user_agent: Nominatim's usage policy asks for an agent string that identifies the application.
# - timeout: geopy's default is 1 second; slow responses would raise and then be recorded as
#   'International Waters' by closest_country, so allow the service more time to answer.
gloc = Nominatim(user_agent = 'pirate_attack_country_lookup', timeout = 10)

In [6]:
def closest_country(c, delay = 1, geocoder = None):
    """Reverse-geocode a coordinate string and return the country name in English.

    Parameters
    ----------
    c : str
        Query passed to the geocoder's ``reverse`` method (the notebook passes the
        "Y,X" strings stored in ``df['coords']``).
    delay : float, optional
        Seconds to sleep before the request. Defaults to 1, which keeps a row-by-row
        ``apply`` at no more than one request per second.
    geocoder : object, optional
        Object exposing ``reverse(query, language=...)``. Defaults to the notebook-level
        ``gloc`` client.

    Returns
    -------
    str
        The ``country`` entry of the result's address, or ``'International Waters'``
        when the lookup returns nothing, has no country, or raises an error.
    """
    if delay:
        time.sleep(delay)
    if geocoder is None:
        # Resolved outside the try block so a missing client fails loudly instead of
        # labelling every row as international waters.
        geocoder = gloc
    try:
        location = geocoder.reverse(c, language = 'en')
        return location.raw['address']['country']
    except Exception:
        # Best-effort fallback: no result (None), no 'country' key, or a geocoder error.
        # KeyboardInterrupt/SystemExit are not caught, so a long run can still be stopped.
        return 'International Waters'

#https://www.geeksforgeeks.org/get-the-city-state-and-country-names-from-latitude-and-longitude-using-python/

In [7]:
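# One-off reverse-geocoding run, left commented out: closest_country sleeps one second per row
# and calls the Nominatim service, so re-running it takes a long time.
# NOTE: the step that combines these slices into a df['country'] column before saving is not
# shown here; the file loaded in the next cell already contains that column.
# df_locations_1_250 = df['coords'][:250].apply(lambda x: closest_country(x))
# df_locations_250_500 = df['coords'][250:500].apply(lambda x: closest_country(x))
# df_locations_500_750 = df['coords'][500:750].apply(lambda x: closest_country(x))
# df_locations_750_1000 = df['coords'][750:1000].apply(lambda x: closest_country(x))
# df_locations_1000_1500 = df['coords'][1000:1500].apply(lambda x: closest_country(x))
# df_locations_1500_2500 = df['coords'][1500:2500].apply(lambda x: closest_country(x))
# df_locations_2500_end = df['coords'][2500:].apply(lambda x: closest_country(x))
# df.to_csv('./datasets/esri_w_country_columns')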

In [8]:
# Reload the previously saved, geocoded data; the first CSV column is the saved index.
df = pd.read_csv(filepath_or_buffer = './datasets/esri_w_country_columns', index_col=[0])

In [9]:
#Breaking into month, year, day and time.
# Fixed-position slices of the dateofocc string; assumes a layout like
# 'YYYY-MM-DD HH:MM:SS+00' (the time part is later compared with '00:00:00+00').
df['year'] = df.dateofocc.map(lambda stamp: stamp[:4])
df['month'] = df.dateofocc.map(lambda stamp: stamp[5:7])
df['day'] = df.dateofocc.map(lambda stamp: stamp[8:10])
df['time'] = df.dateofocc.map(lambda stamp: stamp[11:])

In [10]:
#most rows are missing 'time' (they only carry the '00:00:00+00' placeholder)
df.loc[df['time'].eq('00:00:00+00')].count()

X                  7792
Y                  7792
OBJECTID           7792
reference          7792
dateofocc          7792
subreg             7792
hostility_d        7792
victim_d           7792
description        7792
hostilitytype_l    7792
victim_l           7792
navarea            7792
coords             7792
country            7792
year               7792
month              7792
day                7792
time               7792
dtype: int64

In [11]:
#Dropping time column (almost always the midnight placeholder, so it carries no information)
df = df.drop(columns = ['time'])

In [12]:
#Dropping original date of occurrence column (already split into year, month and day)
df = df.drop(columns = ['dateofocc'])

In [13]:
#Dropping 'OBJECTID'
df = df.drop(columns = ['OBJECTID'])

In [14]:
#Lowercase all text
# Only object-dtype columns are touched; numeric columns are left as they are.
for col_name, col_dtype in df.dtypes.items():
    if col_dtype == 'O':
        df[col_name] = df[col_name].str.lower()

In [15]:
# Frequency of each numeric hostility code before it is replaced by a text label.
df['hostilitytype_l'].value_counts()

1.0    6971
3.0     537
4.0     163
6.0     124
2.0      47
5.0      40
9.0       6
7.0       1
Name: hostilitytype_l, dtype: int64

In [16]:
# Text labels for the numeric hostility codes: position k holds the label for code k + 1.
# NOTE(review): 'navel_engagement' looks like a typo for 'naval_engagement'; it is kept
# because success_dict uses the same spelling as a key.
host_lst = [
    'pirate_assaults', 'navel_engagement', 'suspicious_approach',
    'kidnapping', 'unknown', 'other',
    'hijacking', 'no entries', 'attempted_boarding',
]

# Replace each code (stored as a float such as 1.0) with its label.
df['hostilitytype_l'] = df['hostilitytype_l'].map(lambda code: host_lst[int(code) - 1])

In [17]:
# Text labels for the numeric victim codes: position k holds the label for code k + 1.
victim_lst = [
    'anchored_vessel', 'barge', 'cargo_ship', 'fishing_vessel',
    'merchant_vessel', 'offshore_vessel', 'passenger_ship', 'sailing_vessel',
    'tanker', 'tugboat', 'vessel', 'unknown',
    'other',
]

# Replace each code with its label.
df['victim_l'] = df['victim_l'].map(lambda code: victim_lst[int(code) - 1])

In [18]:
#Dummy all categorical columns
to_dummy = ['subreg', 'hostilitytype_l', 'victim_l', 'navarea', 'month']

# Append one block of indicator columns per categorical column, in list order.
# The source columns are not dropped here; hostilitytype_l is read again later
# to build pirate_success.
df = pd.concat(
    [df] + [pd.get_dummies(df[col], prefix = col) for col in to_dummy],
    axis=1,
)

In [19]:
# Binary target derived from the hostility label: 1 for the types treated as a
# successful attack (pirate_assaults, kidnapping, hijacking), 0 for everything else.
# Every label in host_lst needs an entry here; a label without one would be mapped to
# NaN by Series.map. 'no entries' was previously missing and is counted as 0, like the
# other non-attack labels.
success_dict = {
    'pirate_assaults':1,
    'suspicious_approach':0,
    'kidnapping':1,
    'other':0,
    'navel_engagement':0,
    'unknown':0,
    'attempted_boarding':0,
    'hijacking':1,
    'no entries':0
}

df['pirate_success'] = df.hostilitytype_l.map(success_dict)

In [20]:
# year was sliced out of a string; convert it to int so it can be compared numerically.
df = df.astype({'year': int})

In [21]:
#Incorporate only observations from 2010 on.
df = df.loc[df['year'].ge(2010)]

In [22]:
# Remove the reference and description-style columns; they are not used in any later cell.
df = df.drop(columns = ['reference', 'hostility_d', 'victim_d', 'description'])

In [23]:
# Place names to match against df['country']; the order must stay in step with the
# numeric codes in `lis`. 'Visayas' and 'Mindanao' are listed separately and share the
# Philippines' code there.
countries = [
    'The Bahamas', 'Indonesia', 'International Waters', 'Eritrea', 'India', 'Brazil',
    'Somalia', 'Ecuador', 'Philippines', 'Malta', 'China', 'Cameroon', 'Sri Lanka', 'Nicaragua',
    'Senegal', 'Bangladesh', 'Vietnam', 'Malaysia', 'Mozambique', 'Guyana', 'Algeria', 'Tanzania',
    'Lebanon', 'Visayas', 'Colombia', 'Nigeria', 'Egypt', 'Thailand', 'Russia', 'Guinea',
    'Morocco', "Côte d'Ivoire", 'Portugal', 'Japan', 'Myanmar', 'Mindanao',
    'Dominican Republic', 'Iran', 'Venezuela', 'Ghana', 'Angola', 'Sierra Leone',
    'Democratic Republic of the Congo', 'Madagascar', 'Turkey', 'Peru', 'Italy', 'Oman', 'Djibouti',
    'North Korea', 'Greece', 'Yemen', 'Taiwan', 'Comoros', 'Papua New Guinea', 'Jamaica', 'Saudi Arabia',
    'Netherlands', 'Panama', 'Singapore', 'Kenya', 'France', 'Pakistan', 'United States', 'Gabon',
    'Congo-Brazzaville', 'Belgium', 'Brunei', 'Cyprus', 'Haiti', 'Liberia', 'Belize', 'Qatar', 'Solomon Islands',
    'Equatorial Guinea', 'Guatemala', 'Fiji', 'South Africa', 'Tunisia', 'Mauritania', 'United Arab Emirates',
    'Germany', 'Mexico', 'Montenegro', 'Togo', 'Honduras', 'United Kingdom', 'Benin', 'Trinidad and Tobago',
    'Bulgaria', 'Georgia', 'Cuba', 'Iraq', 'Suriname', 'Australia', 'El Salvador', 'Romania', 'Saint Lucia',
    'Uruguay', 'British Virgin Islands', 'Saint Vincent and the Grenadines', 'Sudan', 'Dominica',
    'Spain', 'Costa Rica', 'Antigua and Barbuda', 'Grenada', 'South Korea', 'Cape Verde', 'Seychelles',
    'Libya', 'Cayman Islands', 'Saint Kitts and Nevis',
]

# The text columns of df were lowercased earlier, so the lookup names must be lowercase too.
countries = list(map(str.lower, countries))

In [24]:
# Numeric country codes, position-for-position with `countries` (one row here per row there).
# They appear to be ISO 3166-1 numeric codes; 1 is a placeholder for 'International Waters'.
# NOTE(review): Taiwan is given 156 (the same code as China) rather than 158 - confirm intended.
lis = [
    44, 360, 1, 232, 356, 76,                      # The Bahamas ... Brazil
    706, 218, 608, 470, 156, 120, 144, 558,        # Somalia ... Nicaragua
    686, 50, 704, 458, 508, 328, 12, 834,          # Senegal ... Tanzania
    422, 608, 170, 566, 818, 764, 643, 324,        # Lebanon ... Guinea
    504, 384, 620, 392, 104, 608,                  # Morocco ... Mindanao
    214, 364, 862, 288, 24, 694,                   # Dominican Republic ... Sierra Leone
    180, 450, 792, 604, 380, 512, 262,             # Democratic Republic of the Congo ... Djibouti
    408, 300, 887, 156, 174, 598, 388, 682,        # North Korea ... Saudi Arabia
    528, 591, 702, 404, 250, 586, 840, 266,        # Netherlands ... Gabon
    178, 56, 96, 196, 332, 430, 84, 634, 90,       # Congo-Brazzaville ... Solomon Islands
    226, 320, 242, 710, 788, 478, 784,             # Equatorial Guinea ... United Arab Emirates
    276, 484, 499, 768, 340, 826, 204, 780,        # Germany ... Trinidad and Tobago
    100, 268, 192, 368, 740, 36, 222, 642, 662,    # Bulgaria ... Saint Lucia
    858, 92, 670, 729, 212,                        # Uruguay ... Dominica
    724, 188, 28, 308, 410, 132, 690,              # Spain ... Seychelles
    434, 136, 659,                                 # Libya ... Saint Kitts and Nevis
]

In [25]:
# Lookup from lowercase country name to numeric code, paired by position.
# The two lists are maintained by hand, so fail loudly if they drift apart instead of
# silently ignoring surplus codes or building a misaligned lookup.
if len(countries) != len(lis):
    raise ValueError(
        'countries and lis must have the same length, got %d and %d'
        % (len(countries), len(lis))
    )
dic = dict(zip(countries, lis))

In [26]:
# Attach the numeric code for each row's country; names missing from dic become NaN.
df = df.assign(country_code = df['country'].map(dic))

In [27]:
# (country_code, year) tuple per row, presumably used as a key when joining other data.
df['join_key'] = [(code, yr) for code, yr in zip(df['country_code'], df['year'])]

In [28]:
# Persist the cleaned frame (index included). The join_key tuples are written as text,
# so they come back as strings when this file is read again.
df.to_csv(path_or_buf = './datasets/cleaned_pirate_activity_eda.csv')