# Data Loading

In [56]:
import pandas as pd

The first dataset that we will be using comes from the University of Maryland and is called the Global Terrorism Database (GTD). It is a dataset of global terrorism events from 1970 to 2020 with 135 columns.
We don't plan on using all of these columns; a few examples of the ones we will use are Year, Region, TargetType, Motive, and WeaponType.

https://www.start.umd.edu/gtd/

In [57]:
# Load the raw GTD export. The file is decoded as latin-1, and low_memory=False
# makes pandas infer each column's dtype from the whole file rather than chunk by chunk.
terror_csv_path = "GlobalTerrorism/GlobalTerrorismDataset.csv"
terror_df = pd.read_csv(terror_csv_path, encoding="latin-1", low_memory=False)
terror_df.head()

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


Terrorism_df
Select the columns of potential interest to cut down the number of columns in the dataset, then rename them for consistency.

In [58]:
# Single source of truth: original GTD column name -> project column name.
# The insertion order of this mapping is the column order of the trimmed frame.
# Columns numbered 2/3 (attack type, weapon type, ...) presumably repeat the
# coding of their "1" counterpart for additional recorded values.
column_renames = {
    "iyear": "year",  # year
    "extended": "isExtendedIncident",  # incident extended more than 24 hours (binary: 0=no, 1=yes)
    # NOTE(review): "countyID" looks like a typo for "countryID"; kept as-is so
    # anything that already references this column name keeps working.
    "country": "countyID",  # country id
    "country_txt": "country",  # country name (as of the year the event occurred)
    "region": "regionID",  # region id
    "region_txt": "region",  # region the attack occurred in
    "crit1": "criterion1",  # criterion 1: political, economic, religious, or social goal (binary: 0=no, 1=yes)
    "crit2": "criterion2",  # criterion 2: intention to coerce, intimidate or publicize to larger audience(s) (binary: 0=no, 1=yes)
    "crit3": "criterion3",  # criterion 3: outside international humanitarian law (binary: 0=no, 1=yes)
    "multiple": "isMultipleIncident",  # part of multiple incidents (binary: 0=no, 1=yes)
    "success": "isSuccessfulAttack",  # were there tangible effects of the attack (binary: 0=no, 1=yes)
    "suicide": "isSuicideAttack",  # was it a suicide attack (binary: 0=no, 1=yes)
    "attacktype1": "attackType1ID",  # attack type id
    "attacktype1_txt": "attackType1",  # name of attack type
    "attacktype2": "attackType2ID",
    "attacktype2_txt": "attackType2",
    "attacktype3": "attackType3ID",
    "attacktype3_txt": "attackType3",
    "targtype1": "targetTypeID",  # target type id
    "targtype1_txt": "targetType",  # target type name
    "targsubtype1": "targetSubTypeID",  # target subtype id
    "targsubtype1_txt": "targetSubType",  # target subtype name
    "natlty1": "targetNationalityID",  # nationality of target id
    "natlty1_txt": "targetNationality",  # nationality of target name
    "gname": "attackerGroup",  # main perpetrator group name
    "motive": "attackerMotive",  # specific motive for attack (text)
    "guncertain1": "isGroupSuspected",  # main perpetrator group confirmed (binary: 0=unconfirmed, 1=suspected)
    "claimed": "isAttackClaimed",  # group claimed responsibility for attack (binary: 0=no, 1=yes)
    "weaptype1": "weaponType1ID",  # weapon type id
    "weaptype1_txt": "weaponType1",  # weapon type name
    "weapsubtype1": "weaponSubType1ID",  # weapon subtype id
    "weapsubtype1_txt": "weaponSubType1",  # weapon subtype name
    "weaptype2": "weaponType2ID",
    "weaptype2_txt": "weaponType2",
    "weapsubtype2": "weaponSubType2ID",
    "weapsubtype2_txt": "weaponSubType2",
    "nkill": "totalFatalities",  # total number of fatalities (includes victims and attackers)
    "nkillter": "attackerFatalities",  # number of perpetrator fatalities
    "nwound": "totalWounded",  # total number of injured
    "nwoundte": "attackersWounded",  # number of perpetrators injured
    "property": "propertyDamage",  # property damage occurred (binary: 0=no, 1=yes)
    # NOTE(review): the GTD codebook appears to define propextent as a categorical
    # "extent of damage" code rather than a monetary cost -- confirm before analysing.
    "propextent": "propertyDamageCost",  # cost of property damage that occurred
    "ishostkid": "isHostage/Kidnap",  # did hostages or kidnapping occur (0=no, 1=yes, -9=unknown)
    "ransom": "ransomDemanded",  # was ransom demanded (0=no, 1=yes, -9=unknown, null=N/A)
    "ransomamt": "ransomAmount",  # ransom amount demanded
    "hostkidoutcome": "hostage/KidnapOutcome",  # hostage/kidnapping outcome
    "INT_LOG": "internationalLogistical",  # international-logistical: whether a group crossed a border to carry out the attack
    "INT_IDEO": "internationalIdeological",  # international-ideological: whether a group attacked a target of a different nationality
}

# Original GTD names of the retained columns, in order.
cols_of_interest = list(column_renames)

# Trim to the retained columns, then apply the project naming scheme in one chain.
terror_df = terror_df.loc[:, cols_of_interest].rename(columns=column_renames)

Set datatypes of columns

In [59]:
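# TODO: dtype conversion is disabled until we decide how to handle missing values.
# Several of these columns contain nulls (see the isna() check further down) and some
# use -9 for "unknown", so a plain "int32" cast would raise on NaN and a plain "bool"
# cast would mislabel NaN / -9 as True. pandas' nullable dtypes ("Int32", "boolean",
# "category") are a likely fit -- to be confirmed.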

# terror_df = terror_df.astype({
#     "year": "int32",
#     "isExtendedIncident": "bool",
#     "countyID": "int32",
#     "country": "category",
#     "regionID": "int32",
#     "region": "category",
#     "criterion1": "bool",
#     "criterion2": "bool",
#     "criterion3": "bool",
#     "isMultipleIncident": "bool",
#     "isSuccessfulAttack": "bool",
#     "isSuicideAttack": "bool",
#     "attackType1ID": "int32",
#     "attackType1": "category",
#     "attackType2ID": "int32",
#     "attackType2": "category",
#     "attackType3ID": "int32",
#     "attackType3": "category",
#     "targetTypeID": "int32",
#     "targetType": "category",
#     "targetSubTypeID": "int32",
#     "targetSubType": "category",
#     "targetNationalityID": "int32",
#     "targetNationality": "category",
#     "attackerGroup": "object",
#     "attackerMotive": "object",
#     "isGroupSuspected": "bool",
#     "isAttackClaimed": "bool",
#     "weaponType1ID": "int32",
#     "weaponType1": "category",
#     "weaponSubType1ID": "int32",
#     "weaponSubType1": "category",
#     "weaponType2ID": "int32",
#     "weaponType2": "category",
#     "weaponSubType2ID": "int32",
#     "weaponSubType2": "category",
#     "totalFatalities": "int32",
#     "attackerFatalities": "int32",
#     "totalWounded": "int32",
#     "attackersWounded": "int32",
#     "propertyDamage": "category",
#     "propertyDamageCost": "category",
#     "isHostage/Kidnap": "category",
#     "ransomDemanded": "category",
#     "ransomAmount": "int32",
#     "hostage/KidnapOutcome": "category",
#     "internationalLogistical": "bool",
#     "internationalIdeological": "bool"
# })

This dataset comes from the World Bank Group and is a breakdown of Life Expectancy at Birth for countries, regions, and socio-economic classes. It has data from 1960-2022.

https://data.worldbank.org/indicator/SP.DYN.LE00.IN

In [60]:
# Load the World Bank life-expectancy export (one row per country/aggregate,
# one column per year), decoded as UTF-8.
life_csv_path = "LifeExpectancy/LifeExpectancyAtBirth.csv"
life_df = pd.read_csv(life_csv_path, encoding="utf-8")
life_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,Unnamed: 68
0,Aruba,ABW,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,64.152,64.537,64.752,65.132,65.294,65.502,...,75.683,75.617,75.903,76.072,76.248,75.723,74.626,74.992,,
1,Africa Eastern and Southern,AFE,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,44.085552,44.386697,44.752182,44.913159,45.479043,45.498338,...,61.856458,62.44405,62.92239,63.365863,63.755678,63.31386,62.45459,62.899031,,
2,Afghanistan,AFG,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,32.535,33.068,33.547,34.016,34.494,34.953,...,62.659,63.136,63.016,63.081,63.565,62.575,61.982,62.879,,
3,Africa Western and Central,AFW,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,37.845152,38.16495,38.735102,39.063715,39.33536,39.618038,...,56.195872,56.581678,56.888446,57.189139,57.555796,57.226373,56.988657,57.626176,,
4,Angola,AGO,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,38.211,37.267,37.539,37.824,38.131,38.495,...,60.655,61.092,61.68,62.144,62.448,62.261,61.643,61.929,,


Since the table came in wide (pivoted) format, with one column per year, we have to melt it into a long table with one row per country and year.

In [61]:
# Wide -> long: the four descriptor columns stay fixed as identifiers; every
# other column header becomes a "Year" entry with its cell in "Value".
descriptor_cols = ["Country Name", "Country Code", "Indicator Name", "Indicator Code"]
life_df = life_df.melt(id_vars=descriptor_cols, var_name="Year", value_name="Value")
life_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Value
0,Aruba,ABW,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,64.152
1,Africa Eastern and Southern,AFE,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,44.085552
2,Afghanistan,AFG,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,32.535
3,Africa Western and Central,AFW,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,37.845152
4,Angola,AGO,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,38.211


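Get rid of the "Unnamed: 68" entries. The empty "Unnamed: 68" column is already present right after `read_csv` (see the `head()` output above; it is most likely caused by a trailing delimiter at the end of each CSV row), and melting turned that column into rows whose year is "Unnamed: 68".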

In [62]:
# Drop the rows that melt produced from pandas' auto-named placeholder column.
# Matching on the "Unnamed" prefix instead of the exact label "Unnamed: 68" keeps
# this working if the number of year columns in the CSV changes (the numeric
# suffix is the column's position), so the later int cast of the year does not
# break. na=False treats any missing label as "not a placeholder".
# (The previous bare `life_df["Year"].unique()` was removed: its result was
# discarded because it was not the last expression in the cell.)
is_placeholder_year = life_df["Year"].str.startswith("Unnamed", na=False)
life_df = life_df[~is_placeholder_year]

Rename columns for consistency

In [63]:
# Keep only country / year / value and switch to the project's column names.
life_renames = {
    "Country Name": "country",
    "Year": "year",
    "Value": "lifeExpectancy",
}
life_df = (
    life_df
    .drop(columns=["Country Code", "Indicator Name", "Indicator Code"])
    .rename(columns=life_renames)
)

Retype columns to appropriate values

In [64]:
# Target dtype per column: categorical country labels, integer years,
# single-precision life expectancy.
life_dtypes = {
    "country": "category",
    "year": "int32",
    "lifeExpectancy": "float32",
}
life_df = life_df.astype(life_dtypes)

# Data Cleaning

Filter life_df and terror_df so that they cover the same range of years.

In [65]:
# Report missing years first: min()/max() skip NaN and the range filter below
# would drop such rows, so they should be visible before that happens.
print(f"life_df null year values: {life_df['year'].isna().sum()}")
print(f"terror_df null year values: {terror_df['year'].isna().sum()}")

# Year span of each dataset, captured before either frame is filtered.
life_yr_min, life_yr_max = life_df["year"].min(), life_df["year"].max()
terror_yr_min, terror_yr_max = terror_df["year"].min(), terror_df["year"].max()

# Restrict each frame to the other's span (both bounds inclusive), which leaves
# both frames on the years the two datasets have in common.
terror_df = terror_df[terror_df["year"].between(life_yr_min, life_yr_max)]
life_df = life_df[life_df["year"].between(terror_yr_min, terror_yr_max)]

life_df null year values: 0
terror_df null year values: 0


Check for nulls in terror_df

In [68]:
# Missing-value count for each terror_df column.
terror_null_counts = terror_df.isna().sum()
terror_null_counts


year                             0
isExtendedIncident               0
countyID                         0
country                          0
regionID                         0
region                           0
criterion1                       0
criterion2                       0
criterion3                       0
isMultipleIncident               1
isSuccessfulAttack               0
isSuicideAttack                  0
attackType1ID                    0
attackType1                      0
attackType2ID               201272
attackType2                 201272
attackType3ID               209048
attackType3                 209048
targetTypeID                     0
targetType                       0
targetSubTypeID              11839
targetSubType                11839
targetNationalityID           2014
targetNationality             2014
attackerGroup                    0
attackerMotive              154648
isGroupSuspected               380
isAttackClaimed              66093
weaponType1ID       

Check for nulls in life_df

In [67]:

# Missing-value count for each life_df column.
life_null_counts = life_df.isna().sum()
life_null_counts

country             0
year                0
lifeExpectancy    483
dtype: int64

# Visualization