## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw suicide statistics. `df1` is the notebook-level frame that the
# suicide preprocessing pipeline further down reads.
df1 = pd.read_csv(filepath_or_buffer='./data/suicide.csv')
print(df1.shape)  # (rows, columns) of the raw file
df1.head(n=3)

(27820, 12)


Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X


In [3]:
# Load the raw conflict events workbook. `df` is the notebook-level frame that
# the conflicts preprocessing pipeline in the next cell reads.
df = pd.read_excel(io='./data/conflicts.xlsx')
print(df.shape)  # (rows, columns) of the raw file
df.head(n=3)

(225385, 49)


Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob
0,244657,IRQ-2017-1-524-322,2017,1,Clear,1,259,259,Iraq: Government,524,...,2017-07-31,0,4,0,2,6,6,6,645,
1,132140,AFG-1989-1-411-2,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,1989-01-13,6,0,0,0,6,6,6,700,
2,130364,AFG-1989-1-411-37,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,1989-01-18,0,0,0,4,4,4,0,700,


In [4]:
### Creating functions for preprocessing conflicts data 

# Deleting columns
def _drop_columns(df):
    df = df.drop(['id', 'relid', 'active_year', 'code_status'
              , 'type_of_violence','conflict_dset_id', 'conflict_new_id'
              , 'conflict_name', 'dyad_dset_id','dyad_new_id', 'dyad_name'
              , 'side_a_dset_id', 'side_a_new_id', 'side_a','side_b_dset_id'
              , 'side_b_new_id', 'side_b', 'number_of_sources','source_article'
              , 'source_office', 'source_date', 'source_headline','source_original'
              , 'where_prec', 'where_coordinates','where_description', 'adm_1', 'adm_2'
              , 'latitude', 'longitude','geom_wkt', 'priogrid_gid', 'country_id'
              ,'event_clarity', 'date_prec', 'date_start', 'date_end', 'best', 'high'
              , 'low','gwnoa', 'gwnob'], axis = 1)
    return df


# Filtering by year
def _filter_year(df):
    df = df[df.year > 2001]
    return df


# Adding new column
def _total_deaths(df):
    df['total_deaths'] = df['deaths_a'] + df['deaths_b'] + df['deaths_civilians'] + df['deaths_unknown']
    return df


# Calling all functions
def _main():
    """Run the conflicts preprocessing steps on the notebook-level ``df``.

    Applies, in order, ``_drop_columns``, ``_filter_year`` and
    ``_total_deaths`` and returns the resulting frame.
    """
    trimmed = _drop_columns(df)
    recent = _filter_year(trimmed)
    with_totals = _total_deaths(recent)
    return with_totals


# Build the preprocessed conflicts frame and preview its first rows.
# NOTE: `_main`, `_drop_columns` and `_filter_year` are redefined by the suicide
# preprocessing cell further down, so this call is only correct when the cells
# are run top to bottom.
df_conflicts = _main()
df_conflicts.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_deaths'] = df['deaths_a'] + df['deaths_b'] + df['deaths_civilians'] + df['deaths_unknown']


Unnamed: 0,year,country,region,deaths_a,deaths_b,deaths_civilians,deaths_unknown,total_deaths
0,2017,Afghanistan,Asia,0,4,0,2,6
277,2002,Afghanistan,Asia,0,0,30,0,30
278,2003,Afghanistan,Asia,1,18,0,0,19
279,2003,Afghanistan,Asia,0,3,0,0,3
280,2003,Afghanistan,Asia,0,0,0,0,0


In [5]:
### Creating functions for preprocessing suicide data 

# Deleting columns
def _drop_columns(df1):
    df1 = df1.drop(['HDI for year', 'country-year'], axis = 1)
    return df1

# Filtering year
def _filter_year(df1):
    df1 = df1[df1.year > 2001]
    return df1

# Calling all functions
def _main():
    """Run the suicide preprocessing steps on the notebook-level ``df1``.

    Applies ``_drop_columns`` and then ``_filter_year`` and returns the
    resulting frame. This definition replaces the conflicts ``_main`` defined
    in the previous cell.
    """
    trimmed = _drop_columns(df1)
    return _filter_year(trimmed)
    
# Build the preprocessed suicide frame and show its (rows, columns) shape.
# `_main` here is the suicide version defined just above, not the conflicts one.
df_suicide = _main()
df_suicide.shape

(14080, 10)

In [6]:
# Lists of countries per continent

# Country names mapped to the 'Europe' region. The strings are used as exact
# lookup keys against the `country` column of the suicide data.
EUROPE = [
    'Albania',
    'Austria',
    'Azerbaijan',
    'Belarus',
    'Belgium',
    'Bosnia and Herzegovina',
    'Bulgaria',
    'Croatia',
    'Cyprus',
    'Czech Republic',
    'Denmark',
    'Estonia',
    'Finland',
    'France',
    'Georgia',
    'Germany',
    'Greece',
    'Hungary',
    'Iceland',
    'Ireland',
    'Italy',
    'Latvia',
    'Lithuania',
    'Luxembourg',
    'Malta',
    'Montenegro',
    'Netherlands',
    'Norway',
    'Poland',
    'Portugal',
    'Romania',
    'Russian Federation',
    'San Marino',
    'Serbia',
    'Slovakia',
    'Slovenia',
    'Spain',
    'Sweden',
    'Switzerland',
    'Ukraine',
    'United Kingdom',
]


# Country names mapped to the 'Asia' region. The strings are used as exact
# lookup keys against the `country` column of the suicide data.
ASIA = [
    'Armenia',
    'Bahrain',
    'Israel',
    'Japan',
    'Kazakhstan',
    'Kuwait',
    'Kyrgyzstan',
    'Macau',
    'Maldives',
    'Mongolia',
    'Oman',
    'Philippines',
    'Qatar',
    'Republic of Korea',
    'Singapore',
    'Sri Lanka',
    'Thailand',
    'Turkey',
    'Turkmenistan',
    'United Arab Emirates',
    'Uzbekistan',
]


# Country names mapped to the 'Americas' region. The strings are used as exact
# lookup keys against the `country` column of the suicide data. The original
# ordering (two alphabetical runs) is kept unchanged.
AMERICAS = [
    'Antigua and Barbuda',
    'Bahamas',
    'Barbados',
    'Belize',
    'Canada',
    'Costa Rica',
    'Cuba',
    'Dominica',
    'El Salvador',
    'Grenada',
    'Guatemala',
    'Jamaica',
    'Mexico',
    'Nicaragua',
    'Panama',
    'Puerto Rico',
    'Saint Kitts and Nevis',
    'Saint Lucia',
    'Saint Vincent and Grenadines',
    'United States',
    'Argentina',
    'Aruba',
    'Brazil',
    'Chile',
    'Colombia',
    'Ecuador',
    'Guyana',
    'Paraguay',
    'Suriname',
    'Trinidad and Tobago',
    'Uruguay',
]


# Country names mapped to the 'Africa' region (exact lookup keys).
AFRICA = ['Cabo Verde', 'Mauritius', 'Seychelles', 'South Africa']

# Country names mapped to the 'Oceania' region (exact lookup keys).
OCEANIA = ['Australia', 'Fiji', 'Kiribati', 'New Zealand']


# Create a dictionary of continents

# Country -> region lookup. The groups are visited in the order Asia, Europe,
# Africa, Americas, Oceania, so if a country were ever listed in two groups the
# later group wins.
# NOTE(review): the conflicts totals shown later in this notebook include a
# 'Middle East' region and no 'Oceania', whereas this lookup has no
# 'Middle East' group (Israel, Kuwait, Qatar, ... are filed under 'Asia').
# Confirm the two region schemes are meant to be compared directly.
region = {
    country: region_name
    for region_name, countries in (
        ('Asia', ASIA),
        ('Europe', EUROPE),
        ('Africa', AFRICA),
        ('Americas', AMERICAS),
        ('Oceania', OCEANIA),
    )
    for country in countries
}

df_suicide['region'] = df_suicide['country'].map(region)

# Series.map leaves NaN for any country missing from the lookup, and groupby
# drops NaN keys by default, so such rows would silently vanish from the
# per-region totals. Report them instead of losing them unnoticed.
unmapped_countries = sorted(
    df_suicide.loc[df_suicide['region'].isna(), 'country'].dropna().unique()
)
if unmapped_countries:
    print(f'WARNING: no region assigned for: {unmapped_countries}')

print(df_suicide.shape)
df_suicide.head(3)

(14080, 11)


Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,gdp_for_year ($),gdp_per_capita ($),generation,region
156,Albania,2002,male,75+ years,4,31007,12.9,4435078648,1573,Silent,Europe
157,Albania,2002,male,25-34 years,23,206286,11.15,4435078648,1573,Generation X,Europe
158,Albania,2002,male,35-54 years,35,382139,9.16,4435078648,1573,Boomers,Europe


In [7]:
# Per-region totals for each dataset, as one-column frames ordered from the
# largest total to the smallest.
suicide = (
    df_suicide
    .groupby('region')['suicides_no']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
conflicts = (
    df_conflicts
    .groupby('region')['total_deaths']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)

In [8]:
# Display the suicide counts summed per region (sorted in descending order).
suicide

Unnamed: 0_level_0,suicides_no
region,Unnamed: 1_level_1
Europe,1554027
Americas,930658
Asia,773563
Oceania,37938
Africa,7699


In [9]:
# Display the conflict deaths summed per region (sorted in descending order).
conflicts

Unnamed: 0_level_0,total_deaths
region,Unnamed: 1_level_1
Middle East,499230
Asia,315266
Africa,254463
Americas,60157
Europe,16430


In [11]:
# Save the preprocessed data frames as CSV (disabled; uncomment the lines below to write the files)
#df_suicide.to_csv('suicide.csv', encoding='utf-8')
#df_conflicts.to_csv('conflicts.csv', encoding='utf-8')