In [1]:
import pandas as pd
import numpy as np
import os

# download terror dataset from google drive link (this is much faster than uploading it yourself)
if not os.path.exists('gtdDataSet.csv'):
    !gdown --id 1VqQs1vz2TZdzR6NUo9klkgl1eI4LFpgW

In [2]:
# load the OECD leading-indicator data from its CSV file and display it
df = pd.read_csv(filepath_or_buffer='OECD_Leading_DataSet.csv')
df

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-02,100.2482,
1,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-03,100.2449,
2,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-04,100.2153,
3,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-05,100.1706,
4,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-06,100.1293,
...,...,...,...,...,...,...,...,...
1395,CAN,CLI,AMPLITUD,LTRENDIDX,M,2021-05,100.5329,
1396,CAN,CLI,AMPLITUD,LTRENDIDX,M,2021-06,100.7005,
1397,CAN,CLI,AMPLITUD,LTRENDIDX,M,2021-07,100.8202,
1398,CAN,CLI,AMPLITUD,LTRENDIDX,M,2021-08,100.8623,


In [3]:
# Make sure the row labels are a plain, unique 0..n-1 range; the assignment below
# aligns on these labels.
df = df.reset_index(drop=True)
# Per-country change of `Value` relative to the observation 3 rows earlier.
# The rows are ordered by (LOCATION, TIME) only for the calculation, so "3 rows
# before" means "3 observations earlier in time" even if the CSV is not stored
# chronologically. TIME is a 'YYYY-MM' string, so lexical order is chronological.
# The result is assigned back by index label; the row order of `df` is unchanged.
# NOTE(review): this still assumes no month is missing within a country - confirm.
df['diffs'] = (
    df.sort_values(['LOCATION', 'TIME'])
    .groupby('LOCATION')['Value']
    .diff(3)
)
# show the updated dataframe
df.head(10)

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes,diffs
0,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-02,100.2482,,
1,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-03,100.2449,,
2,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-04,100.2153,,
3,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-05,100.1706,,-0.0776
4,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-06,100.1293,,-0.1156
5,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-07,100.0971,,-0.1182
6,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-08,100.0654,,-0.1052
7,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-09,100.0354,,-0.0939
8,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-10,100.023,,-0.0741
9,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-11,100.0548,,-0.0106


In [4]:
# Preview: for the first 3 distinct locations (in order of appearance), list the
# first 5 (Value, diffs) rows each, to show that a delta was calculated.
(
    df.loc[lambda frame: frame['LOCATION'].isin(frame['LOCATION'].unique()[:3])]
    .set_index(['LOCATION', 'TIME'])
    [['Value', 'diffs']]
    .groupby(level=0)
    .head(5)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,diffs
LOCATION,TIME,Unnamed: 2_level_1,Unnamed: 3_level_1
GBR,2005-02,100.2482,
GBR,2005-03,100.2449,
GBR,2005-04,100.2153,
GBR,2005-05,100.1706,-0.0776
GBR,2005-06,100.1293,-0.1156
ITA,2005-02,99.84733,
ITA,2005-03,99.65719,
ITA,2005-04,99.44277,
ITA,2005-05,99.25113,-0.5962
ITA,2005-06,99.13202,-0.52517


In [5]:
# read the terror dataset
# low_memory=False makes pandas infer each column's dtype from the whole file instead
# of chunk by chunk, so columns get one consistent dtype.
# NOTE(review): the warning this cell emitted is presumably pandas' mixed-type
# DtypeWarning, which this option addresses - confirm on the next run.
terror_df = pd.read_csv('gtdDataSet.csv', low_memory=False)
terror_df.head(5)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


In [6]:
# Build a 'YYYY-MM' key from the year and month columns, with the month zero-padded
# to two characters. Vectorised string operations give the same text as formatting
# row by row, without constructing a Series object for every row of the wide frame.
# NOTE: rows whose imonth is 0 produce keys such as '1970-00'.
terror_df['year-month'] = (
    terror_df['iyear'].astype(str)
    + '-'
    + terror_df['imonth'].astype(str).str.zfill(2)
)

In [7]:
# hand-written lookup table: country name -> three-letter country code
country_codes = pd.DataFrame(
    {
        'country_name': [
            'United Kingdom',
            'Italy',
            'Japan',
            'France',
            'United States',
            'Germany',
            'Canada',
        ],
        'country_code': ['GBR', 'ITA', 'JPN', 'FRA', 'USA', 'DEU', 'CAN'],
    }
)
country_codes

Unnamed: 0,country_name,country_code
0,United Kingdom,GBR
1,Italy,ITA
2,Japan,JPN
3,France,FRA
4,United States,USA
5,Germany,DEU
6,Canada,CAN


In [8]:
# Add a `country_code` column (needed to merge with the OECD dataframe) by looking up
# each event's country name in the conversion table; names without an entry get NaN.
# A lookup via `map` is used instead of `merge` so the cell can be re-run safely:
# merging a second time would produce `country_code_x` / `country_code_y` columns,
# whereas `assign` simply overwrites the existing column.
terror_df = terror_df.assign(
    country_code=terror_df['country_txt'].map(
        country_codes.set_index('country_name')['country_code']
    )
)
terror_df.head(10)

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related,year-month,country_code
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,PGIS,0,0,0,0,,1970-07,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,PGIS,0,1,1,1,,1970-00,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,PGIS,-9,-9,1,1,,1970-01,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,PGIS,-9,-9,1,1,,1970-01,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,PGIS,-9,-9,1,1,,1970-01,JPN
5,197001010002,1970,1,1,,0,,217,United States,1,...,"""Cairo Police Chief Quits; Decries Local 'Mili...","Christopher Hewitt, ""Political Violence and Te...",Hewitt Project,-9,-9,0,-9,,1970-01,USA
6,197001020001,1970,1,2,,0,,218,Uruguay,3,...,,,PGIS,0,0,0,0,,1970-01,
7,197001020002,1970,1,2,,0,,217,United States,1,...,"Christopher Hewitt, ""Political Violence and Te...",,Hewitt Project,-9,-9,0,-9,,1970-01,USA
8,197001020003,1970,1,2,,0,,217,United States,1,...,"David Newman, Sandra Sutherland, and Jon Stewa...","The Wisconsin Cartographers' Guild, ""Wisconsin...",Hewitt Project,0,0,0,0,,1970-01,USA
9,197001030001,1970,1,3,,0,,217,United States,1,...,"Tom Bates, ""Rads: The 1970 Bombing of the Army...","David Newman, Sandra Sutherland, and Jon Stewa...",Hewitt Project,0,0,0,0,,1970-01,USA


In [9]:
# distinct country names that did not receive a country code
terror_df.loc[terror_df['country_code'].isna(), 'country_txt'].unique()

array(['Dominican Republic', 'Mexico', 'Philippines', 'Greece', 'Uruguay',
       'East Germany (GDR)', 'Ethiopia', 'Guatemala', 'Venezuela',
       'West Germany (FRG)', 'Switzerland', 'Jordan', 'Spain', 'Brazil',
       'Egypt', 'Argentina', 'Lebanon', 'Ireland', 'Turkey', 'Paraguay',
       'Iran', 'Colombia', 'Bolivia', 'Nicaragua', 'Netherlands',
       'Belgium', 'Australia', 'Pakistan', 'Zambia', 'Sweden',
       'Costa Rica', 'South Yemen', 'Cambodia', 'Israel', 'Poland',
       'Taiwan', 'Panama', 'Kuwait', 'West Bank and Gaza Strip',
       'Austria', 'Czechoslovakia', 'India', 'South Vietnam', 'Brunei',
       'Zaire', "People's Republic of the Congo", 'Portugal', 'Algeria',
       'El Salvador', 'Thailand', 'Haiti', 'Sudan', 'Morocco', 'Cyprus',
       'Myanmar', 'Afghanistan', 'Peru', 'Chile', 'Honduras',
       'Yugoslavia', 'Ecuador', 'New Zealand', 'Malaysia', 'Singapore',
       'Botswana', 'Jamaica', 'Chad', 'North Yemen', 'Andorra', 'Syria',
       'South Korea', 'Un

In [10]:
# keep only the rows that have a country code
terror_df = terror_df[terror_df['country_code'].notna()]

In [11]:
# Join every terror event to the OECD row with the same country code and month
# (default inner join: events without a matching OECD row are not kept).
merged_df = pd.merge(
    terror_df,
    df,
    left_on=['country_code', 'year-month'],
    right_on=['LOCATION', 'TIME'],
)
# preview the first 20 merged rows that have a computed delta
merged_df[merged_df['diffs'].notna()].head(20)

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,country_code,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes,diffs
33,200505020002,2005,5,2,,0,,603,United Kingdom,8,...,GBR,GBR,CLI,AMPLITUD,LTRENDIDX,M,2005-05,100.1706,,-0.0776
34,200505050002,2005,5,5,,0,,217,United States,1,...,USA,USA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,100.3353,,-0.1563
35,200505100012,2005,5,10,,0,,69,France,8,...,FRA,FRA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,99.61779,,-0.13482
36,200505150015,2005,5,15,,0,,69,France,8,...,FRA,FRA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,99.61779,,-0.13482
37,200505230008,2005,5,23,,0,,69,France,8,...,FRA,FRA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,99.61779,,-0.13482
38,200505230009,2005,5,23,,0,,69,France,8,...,FRA,FRA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,99.61779,,-0.13482
39,200505290001,2005,5,29,,0,,69,France,8,...,FRA,FRA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,99.61779,,-0.13482
40,200505290002,2005,5,29,,0,,69,France,8,...,FRA,FRA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,99.61779,,-0.13482
41,200505290003,2005,5,29,,0,,69,France,8,...,FRA,FRA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,99.61779,,-0.13482
42,200505290004,2005,5,29,,0,,69,France,8,...,FRA,FRA,CLI,AMPLITUD,LTRENDIDX,M,2005-05,99.61779,,-0.13482


In [16]:
# number of events per value of the `weaptype1` column, most frequent first
terror_df['weaptype1'].value_counts()

6     6675
5     3145
8     3049
13     551
9      388
2       67
10      44
12      37
11      33
1       28
3       12
7        6
Name: weaptype1, dtype: int64

In [12]:
# list the column labels of the terror dataframe
terror_df.columns

Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
       'resolution', 'country', 'country_txt', 'region',
       ...
       'scite2', 'scite3', 'dbsource', 'INT_LOG', 'INT_IDEO', 'INT_MISC',
       'INT_ANY', 'related', 'year-month', 'country_code'],
      dtype='object', length=137)