In [16]:
## Dependencies
import os
import pandas as pd

## Suicide Rate of Change (pct_change)

In [17]:
## Set input file
filename = 'extract_owid_and_wb.csv'
path = os.path.join('clean', filename)

In [18]:
## Load CSV as DataFrame
df = pd.read_csv(path, low_memory=False)

In [19]:
df.head()

Unnamed: 0,country,year,country_code,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,health_pct,population,unemployment
0,Afghanistan,1990,Afghanistan-1990,10.318504,17.553463,50.331,48.88,15.241,,,12412311.0,
1,Afghanistan,1991,Afghanistan-1991,10.32701,17.837032,50.999,48.763,14.783,,,13299016.0,11.38
2,Afghanistan,1992,Afghanistan-1992,10.271411,18.092542,51.641,48.709,14.362,,,14485543.0,11.46
3,Afghanistan,1993,Afghanistan-1993,10.376123,18.294931,52.256,48.717,13.974,,,15816601.0,11.61
4,Afghanistan,1994,Afghanistan-1994,10.575915,18.428908,52.842,48.77,13.616,,,17075728.0,11.65


In [20]:
## Number of Countries & Shape
print("Countries:", df['country'].nunique())
print("Shape:", df.shape)

Countries: 172
Shape: (4748, 12)


In [21]:
## Explore data
df.dtypes

country                        object
year                            int64
country_code                   object
suicide_rate_owid             float64
mental_substance_disorders    float64
life_expectancy               float64
birth_rate                    float64
death_rate                    float64
gdp                           float64
health_pct                    float64
population                    float64
unemployment                  float64
dtype: object

In [22]:
## Drop NaN
df = df.dropna()

## Number of Countries & Shape
print("Countries:", df['country'].nunique())
print("Shape:", df.shape)

Countries: 159
Shape: (2834, 12)


In [23]:
## Drop Zeros
zero_loc = (df != 0).any(axis=1)
# zero_loc.value_counts()

df = df.loc[zero_loc]

## Number of Countries & Shape
print("Countries:", df['country'].nunique())
print("Shape:", df.shape)

Countries: 159
Shape: (2834, 12)


In [24]:
df = df.sort_values(by=['country', 'year'])
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,country,year,country_code,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,health_pct,population,unemployment
0,Afghanistan,2002,Afghanistan-2002,11.054472,18.137951,56.784,46.901,11.048,179.426579,9.44339,22600774.0,11.68
1,Afghanistan,2003,Afghanistan-2003,10.931093,17.958849,57.271,46.231,10.704,190.683814,8.941258,23680871.0,11.68
2,Afghanistan,2004,Afghanistan-2004,10.83979,17.788825,57.772,45.507,10.356,211.382074,9.808474,24726689.0,11.61
3,Afghanistan,2005,Afghanistan-2005,10.655626,17.647911,58.29,44.723,10.003,242.031313,9.94829,25654274.0,11.52
4,Afghanistan,2006,Afghanistan-2006,10.538475,17.539914,58.826,43.87,9.645,263.733602,10.622766,26433058.0,11.34


In [25]:
df.columns

Index(['country', 'year', 'country_code', 'suicide_rate_owid',
       'mental_substance_disorders', 'life_expectancy', 'birth_rate',
       'death_rate', 'gdp', 'health_pct', 'population', 'unemployment'],
      dtype='object')

In [26]:
## Select columns with data (target & features)
data_cols = ['suicide_rate_owid',
             'mental_substance_disorders',
             'life_expectancy',
             'birth_rate',
             'death_rate',
             'gdp',
             'population',
             'unemployment',
             'health_pct']

## Pct_change by country

In [27]:
country_list = df['country'].unique()
len(country_list)

159

In [41]:
## Create empty DataFrame
roc_df = pd.DataFrame(columns=data_cols)

## ROC by country
for country in country_list:
    country_df = df.loc[df['country'] == country]
    country_roc = country_df[data_cols].pct_change().dropna()
    roc_df = roc_df.append(country_roc, ignore_index=True)

In [42]:
roc_df

Unnamed: 0,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,population,unemployment,health_pct
0,-0.011161,-0.009874,0.008576,-0.014285,-0.031137,0.062740,0.047790,0.000000,-0.053173
1,-0.008353,-0.009467,0.008748,-0.015660,-0.032511,0.108548,0.044163,-0.005993,0.096990
2,-0.016990,-0.007921,0.008966,-0.017228,-0.034087,0.144995,0.037514,-0.007752,0.014255
3,-0.010994,-0.006120,0.009195,-0.019073,-0.035789,0.089667,0.030357,-0.015625,0.067798
4,-0.013982,-0.005450,0.009333,-0.021108,-0.037118,0.363850,0.025252,-0.014109,-0.067599
...,...,...,...,...,...,...,...,...,...
2670,-0.033555,0.000965,0.033889,-0.014438,-0.094206,0.095811,0.017936,0.000000,0.027723
2671,-0.026644,0.001184,0.026592,-0.022087,-0.082847,0.003425,0.017702,-0.009294,0.143932
2672,-0.014797,0.001209,0.019243,-0.028504,-0.065107,0.007090,0.016776,-0.007505,-0.083784
2673,-0.013144,0.001584,0.012766,-0.032871,-0.044621,0.013507,0.015614,-0.009452,0.026242


In [44]:
roc_df.describe()

Unnamed: 0,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,population,unemployment,health_pct
count,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0,2675.0
mean,-0.010692,-0.000709,0.005471,-0.009637,-0.010181,0.072694,0.015518,0.004607,0.015221
std,0.029556,0.001799,0.006091,0.021118,0.024079,0.136647,0.016554,0.189547,0.108067
min,-0.169668,-0.010424,-0.035607,-0.138686,-0.133333,-0.640223,-0.037746,-0.805556,-0.443989
25%,-0.023397,-0.001476,0.002238,-0.018552,-0.024478,0.006146,0.005234,-0.059596,-0.030245
50%,-0.009074,-0.000387,0.003973,-0.011367,-0.009213,0.063959,0.013566,-0.007792,0.006485
75%,0.001347,0.000308,0.007101,-0.003252,0.003859,0.138676,0.024548,0.037356,0.048239
max,0.413126,0.007115,0.04455,0.144928,0.098361,1.884354,0.191392,5.8125,1.478131


In [40]:
roc_df = pd.DataFrame(columns=data_cols)
roc_df

Unnamed: 0,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,population,unemployment,health_pct


In [33]:
country = 'Afghanistan'
country_df = df.loc[df['country'] == country]
country_roc = country_df[data_cols].pct_change().dropna()
country_roc

Unnamed: 0,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,population,unemployment,health_pct
1,-0.011161,-0.009874,0.008576,-0.014285,-0.031137,0.06274,0.04779,0.0,-0.053173
2,-0.008353,-0.009467,0.008748,-0.01566,-0.032511,0.108548,0.044163,-0.005993,0.09699
3,-0.01699,-0.007921,0.008966,-0.017228,-0.034087,0.144995,0.037514,-0.007752,0.014255
4,-0.010994,-0.00612,0.009195,-0.019073,-0.035789,0.089667,0.030357,-0.015625,0.067798
5,-0.013982,-0.00545,0.009333,-0.021108,-0.037118,0.36385,0.025252,-0.014109,-0.067599
6,-0.01655,-0.004999,0.009347,-0.02317,-0.038225,0.01381,0.022942,-0.006261,0.035521
7,-0.017858,-0.004709,0.009244,-0.024935,-0.038961,0.201325,0.024259,0.031503,-0.042705
8,-0.016068,-0.004163,0.008994,-0.026257,-0.03891,0.240202,0.027847,0.005236,-0.12719
9,-0.015546,-0.003903,0.008603,-0.027091,-0.038061,0.088091,0.03193,-0.000868,-0.000906
10,-0.015554,-0.003692,0.008139,-0.027355,-0.036668,0.085778,0.034663,0.000869,-0.077639


In [36]:
roc_df = roc_df.concat(country_roc, ignore_index=True)

AttributeError: 'DataFrame' object has no attribute 'concat'

In [37]:
roc_df

Unnamed: 0,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,population,unemployment,health_pct


In [11]:
## Compute year-to-year rate of change (%)
roc_df = df[data_cols].pct_change().dropna()
roc_df

Unnamed: 0,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,population,unemployment,health_pct
1,-0.011161,-0.009874,0.008576,-0.014285,-0.031137,0.062740,0.047790,0.000000,-0.053173
2,-0.008353,-0.009467,0.008748,-0.015660,-0.032511,0.108548,0.044163,-0.005993,0.096990
3,-0.016990,-0.007921,0.008966,-0.017228,-0.034087,0.144995,0.037514,-0.007752,0.014255
4,-0.010994,-0.006120,0.009195,-0.019073,-0.035789,0.089667,0.030357,-0.015625,0.067798
5,-0.013982,-0.005450,0.009333,-0.021108,-0.037118,0.363850,0.025252,-0.014109,-0.067599
...,...,...,...,...,...,...,...,...,...
2811,-0.033555,0.000965,0.033889,-0.014438,-0.094206,0.095811,0.017936,0.000000,0.027723
2812,-0.026644,0.001184,0.026592,-0.022087,-0.082847,0.003425,0.017702,-0.009294,0.143932
2813,-0.014797,0.001209,0.019243,-0.028504,-0.065107,0.007090,0.016776,-0.007505,-0.083784
2814,-0.013144,0.001584,0.012766,-0.032871,-0.044621,0.013507,0.015614,-0.009452,0.026242


In [13]:
## Shape & Summary
print("Shape:", roc_df.shape)
roc_df.describe()

Shape: (2815, 9)


Unnamed: 0,suicide_rate_owid,mental_substance_disorders,life_expectancy,birth_rate,death_rate,gdp,population,unemployment,health_pct
count,2815.0,2815.0,2815.0,2815.0,2815.0,2815.0,2815.0,2815.0,2815.0
mean,0.028349,0.00089,0.001252,0.017843,0.017424,0.127778,1.548401,0.096066,0.014358
std,0.429687,0.047205,0.045777,0.267122,0.311297,1.237604,58.365208,1.686726,0.176845
min,-0.813706,-0.417155,-0.477873,-0.799728,-0.871103,-0.998339,-0.998613,-0.935738,-0.798837
25%,-0.023841,-0.001542,0.00209,-0.018868,-0.024926,-0.002425,0.004939,-0.063785,-0.034625
50%,-0.008764,-0.000382,0.003853,-0.011111,-0.008547,0.06013,0.013589,-0.007557,0.005553
75%,0.002898,0.000352,0.007128,-0.002077,0.005685,0.138901,0.025241,0.042288,0.049307
max,10.278132,0.707883,0.446353,4.139222,8.751925,46.763788,3075.80707,53.285714,2.262684


In [14]:
roc_df.to_csv("output/suicide_roc.csv", index=False)