In [1]:
# for presentation purposes
import warnings
warnings.filterwarnings("ignore")

# wrangle
import os

# transform
import numpy as np
import pandas as pd

# visualize 
import matplotlib.pyplot as plt
import seaborn as sns

# working with dates
from datetime import datetime

# modeling
import statsmodels.api as sm
from statsmodels.tsa.api import Holt, ExponentialSmoothing

# evaluate
from sklearn.metrics import mean_squared_error
from math import sqrt 

import wrangle as w

In [2]:
# Load the combined raw export and inspect its schema before it is split up.
df = w.get_mental_health_data()
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
# Last expression of the cell: rendered as a rich table preview.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108553 entries, 0 to 108552
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   index                      108553 non-null  int64  
 1   Entity                     108553 non-null  object 
 2   Code                       103141 non-null  object 
 3   Year                       108553 non-null  object 
 4   Schizophrenia (%)          25875 non-null   object 
 5   Bipolar disorder (%)       19406 non-null   object 
 6   Eating disorders (%)       100236 non-null  object 
 7   Anxiety disorders (%)      6468 non-null    float64
 8   Drug use disorders (%)     6468 non-null    float64
 9   Depression (%)             6468 non-null    float64
 10  Alcohol use disorders (%)  6468 non-null    float64
dtypes: float64(4), int64(1), object(6)
memory usage: 9.1+ MB
None


Unnamed: 0,index,Entity,Code,Year,Schizophrenia (%),Bipolar disorder (%),Eating disorders (%),Anxiety disorders (%),Drug use disorders (%),Depression (%),Alcohol use disorders (%)
0,0,Afghanistan,AFG,1990,0.16056,0.697779,0.101855,4.82883,1.677082,4.071831,0.672404
1,1,Afghanistan,AFG,1991,0.160312,0.697961,0.099313,4.82974,1.684746,4.079531,0.671768
2,2,Afghanistan,AFG,1992,0.160135,0.698107,0.096692,4.831108,1.694334,4.088358,0.670644
3,3,Afghanistan,AFG,1993,0.160037,0.698257,0.094336,4.830864,1.70532,4.09619,0.669738
4,4,Afghanistan,AFG,1994,0.160022,0.698469,0.092439,4.829423,1.716069,4.099582,0.66926


In [3]:
# Split the raw export into its four datasets. The frames shown further down
# keep their original row labels (population starts at 6469, rates at 54277,
# depressive rates at 102085), so the export appears to be four tables stacked
# vertically under one shared set of column labels.
# NOTE(review): the call takes no arguments, so it does not use the `df`
# loaded in In [2] — presumably it reloads the data itself; confirm in wrangle.
mental_health_df, population_df, rates_df, depressive_rates_df = w.separate_data()

In [4]:
# Clean the mental-health slice; the call also prints a per-column null report.
# The report lists 980 missing `code` values, yet the frame inspected in In [5]
# keeps all 6468 rows with `code` fully populated — presumably the cleaner
# fills missing codes rather than dropping rows (confirm in wrangle).
# NOTE(review): rebinding the input name makes this cell non-idempotent —
# re-running it feeds the already-cleaned frame back into the cleaner.
mental_health_df = w.clean_mental_health_data(mental_health_df)

There are 0, 0.0%, null values in index
There are 0, 0.0%, null values in entity
There are 980, 15.15%, null values in code
There are 0, 0.0%, null values in year
There are 0, 0.0%, null values in schizophrenia
There are 0, 0.0%, null values in bipolar_disorder
There are 0, 0.0%, null values in eating_disorders
There are 0, 0.0%, null values in anxiety_disorders
There are 0, 0.0%, null values in drug_use_disorders
There are 0, 0.0%, null values in depression
There are 0, 0.0%, null values in alcohol_use_disorders


In [5]:
# Verify the cleaning result: index, dtypes and non-null counts per column.
mental_health_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6468 entries, 1990 to 2017
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   entity                 6468 non-null   object 
 1   code                   6468 non-null   object 
 2   schizophrenia          6468 non-null   float64
 3   bipolar_disorder       6468 non-null   float64
 4   eating_disorders       6468 non-null   float64
 5   anxiety_disorders      6468 non-null   float64
 6   drug_use_disorders     6468 non-null   float64
 7   depression             6468 non-null   float64
 8   alcohol_use_disorders  6468 non-null   float64
dtypes: float64(7), object(2)
memory usage: 505.3+ KB


In [6]:
# Preview the first cleaned rows (the frame is now indexed by year).
mental_health_df.head()

Unnamed: 0_level_0,entity,code,schizophrenia,bipolar_disorder,eating_disorders,anxiety_disorders,drug_use_disorders,depression,alcohol_use_disorders
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990,Afghanistan,AFG,0.16056,0.697779,0.101855,4.82883,1.677082,4.071831,0.672404
1991,Afghanistan,AFG,0.160312,0.697961,0.099313,4.82974,1.684746,4.079531,0.671768
1992,Afghanistan,AFG,0.160135,0.698107,0.096692,4.831108,1.694334,4.088358,0.670644
1993,Afghanistan,AFG,0.160037,0.698257,0.094336,4.830864,1.70532,4.09619,0.669738
1994,Afghanistan,AFG,0.160022,0.698469,0.092439,4.829423,1.716069,4.099582,0.66926


In [7]:
# Inspect the raw population slice. It still carries the mental-health column
# labels from the stacked export: e.g. `eating_disorders` holds head counts
# such as 3280000 here, not a percentage.
population_df

Unnamed: 0,index,entity,code,year,schizophrenia,bipolar_disorder,eating_disorders,anxiety_disorders,drug_use_disorders,depression,alcohol_use_disorders
6469,6469,Afghanistan,AFG,1800,,,3280000,,,,
6470,6470,Afghanistan,AFG,1801,,,3280000,,,,
6471,6471,Afghanistan,AFG,1802,,,3280000,,,,
6472,6472,Afghanistan,AFG,1803,,,3280000,,,,
6473,6473,Afghanistan,AFG,1804,,,3280000,,,,
...,...,...,...,...,...,...,...,...,...,...,...
54271,54271,Zimbabwe,ZWE,2015,2.789152,3.455323,13815000.000000,,,,
54272,54272,Zimbabwe,ZWE,2016,2.799308,3.479071,14030000.000000,,,,
54273,54273,Zimbabwe,ZWE,2017,2.812022,3.50086,14237000.000000,,,,
54274,54274,Zimbabwe,ZWE,2018,,,14439000.000000,,,,


In [8]:
# Check dtypes and null counts of the raw population slice before cleaning.
population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47807 entries, 6469 to 54275
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  47807 non-null  int64  
 1   entity                 47807 non-null  object 
 2   code                   46081 non-null  object 
 3   year                   47807 non-null  object 
 4   schizophrenia          6468 non-null   object 
 5   bipolar_disorder       6468 non-null   object 
 6   eating_disorders       46883 non-null  object 
 7   anxiety_disorders      0 non-null      float64
 8   drug_use_disorders     0 non-null      float64
 9   depression             0 non-null      float64
 10  alcohol_use_disorders  0 non-null      float64
dtypes: float64(4), int64(1), object(6)
memory usage: 4.0+ MB


In [9]:
# Clean the population slice; the call prints a per-column null report for the
# 47807 raw rows. The frame inspected in In [10]–In [11] has 5488 rows and the
# columns prevalence_males, prevalance_female and effected_population.
# NOTE(review): rebinding the input name makes this cell non-idempotent —
# re-running it feeds the already-cleaned frame back into the cleaner.
population_df = w.clean_population_data(population_df)

There are 0, 0.0%, null values in index
There are 0, 0.0%, null values in entity
There are 1726, 3.61%, null values in code
There are 0, 0.0%, null values in year
There are 41339, 86.47%, null values in schizophrenia
There are 41339, 86.47%, null values in bipolar_disorder
There are 924, 1.93%, null values in eating_disorders
There are 47807, 100.0%, null values in anxiety_disorders
There are 47807, 100.0%, null values in drug_use_disorders
There are 47807, 100.0%, null values in depression
There are 47807, 100.0%, null values in alcohol_use_disorders


In [10]:
# Inspect the cleaned population frame (year index, renamed value columns).
population_df

Unnamed: 0_level_0,entity,code,prevalence_males,prevalance_female,effected_population
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990,Afghanistan,AFG,3.499982,4.647815,12412000.0
1991,Afghanistan,AFG,3.503947,4.655772,13299000.0
1992,Afghanistan,AFG,3.508912,4.662066,14486000.0
1993,Afghanistan,AFG,3.513429,4.669012,15817000.0
1994,Afghanistan,AFG,3.515578,4.673050,17076000.0
...,...,...,...,...,...
2013,Zimbabwe,ZWE,2.769193,3.424106,13350000.0
2014,Zimbabwe,ZWE,2.778101,3.437674,13587000.0
2015,Zimbabwe,ZWE,2.789152,3.455323,13815000.0
2016,Zimbabwe,ZWE,2.799308,3.479071,14030000.0


In [11]:
# Confirm dtypes and that no nulls remain after cleaning.
population_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5488 entries, 1990 to 2017
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   entity               5488 non-null   object 
 1   code                 5488 non-null   object 
 2   prevalence_males     5488 non-null   float64
 3   prevalance_female    5488 non-null   float64
 4   effected_population  5488 non-null   float64
dtypes: float64(3), object(2)
memory usage: 257.2+ KB


In [12]:
# Inspect the raw rates slice. As with the population slice, the values sit
# under the shared mental-health column labels until the cleaner renames them.
rates_df

Unnamed: 0,index,entity,code,year,schizophrenia,bipolar_disorder,eating_disorders,anxiety_disorders,drug_use_disorders,depression,alcohol_use_disorders
54277,54277,Afghanistan,AFG,1800,,,3280000,,,,
54278,54278,Afghanistan,AFG,1801,,,3280000,,,,
54279,54279,Afghanistan,AFG,1802,,,3280000,,,,
54280,54280,Afghanistan,AFG,1803,,,3280000,,,,
54281,54281,Afghanistan,AFG,1804,,,3280000,,,,
...,...,...,...,...,...,...,...,...,...,...,...
102079,102079,Zimbabwe,ZWE,2015,27.197061,3068.250731,13815000.0,,,,
102080,102080,Zimbabwe,ZWE,2016,26.839591,3081.782858,14030000.0,,,,
102081,102081,Zimbabwe,ZWE,2017,26.391769,3094.795065,14237000.0,,,,
102082,102082,Zimbabwe,ZWE,2018,,,14439000.0,,,,


In [13]:
# Clean the rates slice; the call prints a null report using the renamed
# columns (suicide_rates_per_100k, depressive_disorder_rates_per_100k,
# population). The frame inspected in In [14] has 5488 rows and additionally
# carries percentage_* and num_* columns.
# NOTE(review): rebinding the input name makes this cell non-idempotent —
# re-running it feeds the already-cleaned frame back into the cleaner.
rates_df = w.clean_rates_data(rates_df)

There are 0, 0.0%, null values in index
There are 0, 0.0%, null values in entity
There are 1126, 14.61%, null values in code
There are 1239, 16.08%, null values in suicide_rates_per_100k
There are 1239, 16.08%, null values in depressive_disorder_rates_per_100k
There are 924, 11.99%, null values in population


In [14]:
# Confirm dtypes and non-null counts of the cleaned rates frame.
rates_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5488 entries, 1990 to 2017
Data columns (total 9 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   entity                              5488 non-null   object 
 1   code                                5488 non-null   object 
 2   suicide_rates_per_100k              5488 non-null   float64
 3   depressive_disorder_rates_per_100k  5488 non-null   float64
 4   population                          5488 non-null   float64
 5   percentage_suicide                  5488 non-null   float64
 6   percentage_depressive_disorder      5488 non-null   float64
 7   num_suicide                         5488 non-null   float64
 8   num_depressed                       5488 non-null   float64
dtypes: float64(7), object(2)
memory usage: 428.8+ KB


In [15]:
# Inspect the cleaned rates frame, including the derived count columns.
# NOTE(review): in the displayed rows num_depressed exceeds population (e.g.
# 50,141,449 vs 12,412,000 for Afghanistan 1990). The values match
# population * percentage without dividing by 100, so num_suicide and
# num_depressed look 100x too large — confirm in wrangle.clean_rates_data.
rates_df

Unnamed: 0_level_0,entity,code,suicide_rates_per_100k,depressive_disorder_rates_per_100k,population,percentage_suicide,percentage_depressive_disorder,num_suicide,num_depressed
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990,Afghanistan,AFG,10.318504,4039.755763,12412000.0,0.010319,4.039756,128073.0,50141449.0
1991,Afghanistan,AFG,10.327010,4046.256034,13299000.0,0.010327,4.046256,137339.0,53811159.0
1992,Afghanistan,AFG,10.271411,4053.709902,14486000.0,0.010271,4.053710,148792.0,58722042.0
1993,Afghanistan,AFG,10.376123,4060.203474,15817000.0,0.010376,4.060203,164119.0,64220238.0
1994,Afghanistan,AFG,10.575915,4062.290365,17076000.0,0.010576,4.062290,180594.0,69367670.0
...,...,...,...,...,...,...,...,...,...
2013,Zimbabwe,ZWE,28.361200,3048.264249,13350000.0,0.028361,3.048264,378622.0,40694328.0
2014,Zimbabwe,ZWE,27.605547,3056.996704,13587000.0,0.027606,3.056997,375077.0,41535414.0
2015,Zimbabwe,ZWE,27.197061,3068.250731,13815000.0,0.027197,3.068251,375727.0,42387884.0
2016,Zimbabwe,ZWE,26.839591,3081.782858,14030000.0,0.026840,3.081783,376559.0,43237413.0


In [16]:
# Inspect the raw depressive-disorder slice: only the first value column
# (still labelled `schizophrenia`) is populated in the rows shown.
depressive_rates_df

Unnamed: 0,index,entity,code,year,schizophrenia,bipolar_disorder,eating_disorders,anxiety_disorders,drug_use_disorders,depression,alcohol_use_disorders
102085,102085,Afghanistan,AFG,1990,318435.81367,,,,,,
102086,102086,Afghanistan,AFG,1991,329044.773956,,,,,,
102087,102087,Afghanistan,AFG,1992,382544.572895,,,,,,
102088,102088,Afghanistan,AFG,1993,440381.507393,,,,,,
102089,102089,Afghanistan,AFG,1994,456916.645489,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
108548,108548,Zimbabwe,ZWE,2013,303564.60359,,,,,,
108549,108549,Zimbabwe,ZWE,2014,311665.769283,,,,,,
108550,108550,Zimbabwe,ZWE,2015,320638.507158,,,,,,
108551,108551,Zimbabwe,ZWE,2016,330437.353798,,,,,,


In [17]:
# Clean the depressive-disorder slice. The frame inspected in In [18] has 6468
# rows with entity, code and prevelance_depressive_disorder, indexed by year.
# NOTE(review): rebinding the input name makes this cell non-idempotent —
# re-running it feeds the already-cleaned frame back into the cleaner.
depressive_rates_df = w.clean_depressive_rates_data(depressive_rates_df)

In [18]:
# Confirm dtypes and non-null counts of the cleaned depressive-rates frame.
depressive_rates_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6468 entries, 1990 to 2017
Data columns (total 3 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   entity                          6468 non-null   object 
 1   code                            6468 non-null   object 
 2   prevelance_depressive_disorder  6468 non-null   float64
dtypes: float64(1), object(2)
memory usage: 202.1+ KB


In [19]:
# Inspect the cleaned depressive-rates frame.
depressive_rates_df

Unnamed: 0_level_0,entity,code,prevelance_depressive_disorder
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1990,Afghanistan,AFG,318435.813670
1991,Afghanistan,AFG,329044.773956
1992,Afghanistan,AFG,382544.572895
1993,Afghanistan,AFG,440381.507393
1994,Afghanistan,AFG,456916.645489
...,...,...,...
2013,Zimbabwe,ZWE,303564.603590
2014,Zimbabwe,ZWE,311665.769283
2015,Zimbabwe,ZWE,320638.507158
2016,Zimbabwe,ZWE,330437.353798


In [20]:
# Row/column count of the cleaned mental-health frame, for comparison with the
# other three frames before they are combined.
mental_health_df.shape

(6468, 9)

In [21]:
# Row/column count of the cleaned population frame; it has fewer rows (5488)
# than the mental-health frame (6468).
population_df.shape

(5488, 5)

In [22]:
# Row/column count of the cleaned rates frame.
rates_df.shape

(5488, 9)

In [23]:
# Row/column count of the cleaned depressive-rates frame.
depressive_rates_df.shape

(6468, 3)

In [24]:
# Fetch four per-year aggregate frames. The call takes no arguments and
# reprints the cleaning null reports, so it appears to re-run the load/clean
# steps itself rather than reuse the frames cleaned above.
# NOTE(review): the numbered names hide which dataset each frame holds; only
# df_4 is displayed (In [27], rates columns). Rename only together with
# In [25]–In [27], which reference these names.
df_1, df_2, df_3, df_4 = w.yearly_aggregation()

There are 0, 0.0%, null values in index
There are 0, 0.0%, null values in entity
There are 980, 15.15%, null values in code
There are 0, 0.0%, null values in year
There are 0, 0.0%, null values in schizophrenia
There are 0, 0.0%, null values in bipolar_disorder
There are 0, 0.0%, null values in eating_disorders
There are 0, 0.0%, null values in anxiety_disorders
There are 0, 0.0%, null values in drug_use_disorders
There are 0, 0.0%, null values in depression
There are 0, 0.0%, null values in alcohol_use_disorders
There are 0, 0.0%, null values in index
There are 0, 0.0%, null values in entity
There are 1726, 3.61%, null values in code
There are 0, 0.0%, null values in year
There are 41339, 86.47%, null values in schizophrenia
There are 41339, 86.47%, null values in bipolar_disorder
There are 924, 1.93%, null values in eating_disorders
There are 47807, 100.0%, null values in anxiety_disorders
There are 47807, 100.0%, null values in drug_use_disorders
There are 47807, 100.0%, null values

In [25]:
# Start the combined yearly table from the first two aggregates. The join is
# index-aligned and keeps the left frame's rows (the default, made explicit).
df = df_1.join(df_2, how="left")

In [26]:
# Extend the combined table with the third aggregate, again as an
# index-aligned left join (the default, made explicit).
df = df.join(df_3, how="left")

In [27]:
# Inspect the fourth aggregate (per-year rates columns); it is not joined into
# `df` by the two join cells above.
df_4

Unnamed: 0_level_0,suicide_rates_per_100k,depressive_disorder_rates_per_100k,population,percentage_suicide,percentage_depressive_disorder,num_suicide,num_depressed
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990,12.686986,3359.130759,54313110.0,0.012687,3.359131,826901.193878,187913000.0
1991,12.710841,3362.407064,55200780.0,0.012711,3.362407,845021.352041,191190600.0
1992,12.805068,3365.061522,56063720.0,0.012805,3.365062,870116.811224,194346100.0
1993,12.999077,3367.120386,56906810.0,0.012999,3.36712,893601.653061,197364300.0
1994,13.149959,3368.752615,57738270.0,0.01315,3.368753,910276.153061,200263700.0
1995,13.193023,3369.597188,58564680.0,0.013193,3.369597,914498.459184,203030200.0
1996,13.052057,3369.549509,59387130.0,0.013052,3.36955,900249.030612,205739200.0
1997,12.974895,3368.35191,60204090.0,0.012975,3.368352,892415.397959,208445400.0
1998,12.911718,3366.353267,61016930.0,0.012912,3.366353,885597.44898,211135500.0
1999,12.794362,3363.817854,61826720.0,0.012794,3.363818,884252.637755,213829000.0


In [28]:
# Build the merged yearly table in one call. The helper reprints the cleaning
# null reports, so it appears to re-run the whole pipeline. This rebinding
# replaces the `df` assembled by hand in In [25]–In [26].
# NOTE(review): in the displayed rows prevalence_males / prevalance_female are
# ~546 / ~815 although per-entity values are ~3-5, and effected_population is
# ~1e10 while `population` is ~5e7 — the columns look aggregated differently
# (sum vs. mean?); confirm in wrangle before modeling on them.
df = w.merge_yearly_aggregation()
df

There are 0, 0.0%, null values in index
There are 0, 0.0%, null values in entity
There are 980, 15.15%, null values in code
There are 0, 0.0%, null values in year
There are 0, 0.0%, null values in schizophrenia
There are 0, 0.0%, null values in bipolar_disorder
There are 0, 0.0%, null values in eating_disorders
There are 0, 0.0%, null values in anxiety_disorders
There are 0, 0.0%, null values in drug_use_disorders
There are 0, 0.0%, null values in depression
There are 0, 0.0%, null values in alcohol_use_disorders
There are 0, 0.0%, null values in index
There are 0, 0.0%, null values in entity
There are 1726, 3.61%, null values in code
There are 0, 0.0%, null values in year
There are 41339, 86.47%, null values in schizophrenia
There are 41339, 86.47%, null values in bipolar_disorder
There are 924, 1.93%, null values in eating_disorders
There are 47807, 100.0%, null values in anxiety_disorders
There are 47807, 100.0%, null values in drug_use_disorders
There are 47807, 100.0%, null values

Unnamed: 0_level_0,schizophrenia,bipolar_disorder,eating_disorders,anxiety_disorders,drug_use_disorders,depression,alcohol_use_disorders,prevalence_males,prevalance_female,effected_population,prevelance_depressive_disorder,suicide_rates_per_100k,depressive_disorder_rates_per_100k,population,percentage_suicide,percentage_depressive_disorder,num_suicide,num_depressed
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1990,0.209548,0.715392,0.221774,3.957269,0.808283,3.506288,1.546533,546.006065,813.774802,10645370000.0,3490000.0,12.686986,3359.130759,54313110.0,0.012687,3.359131,826901.193878,187913000.0
1991,0.209586,0.71574,0.222076,3.960009,0.813466,3.510948,1.5535,547.002837,814.699209,10819350000.0,3556168.0,12.710841,3362.407064,55200780.0,0.012711,3.362407,845021.352041,191190600.0
1992,0.209634,0.716091,0.222481,3.962778,0.818692,3.515033,1.559927,547.899575,815.450894,10988490000.0,3622025.0,12.805068,3365.061522,56063720.0,0.012805,3.365062,870116.811224,194346100.0
1993,0.20969,0.71643,0.223033,3.965405,0.82378,3.518531,1.565611,548.682179,816.055314,11153740000.0,3686046.0,12.999077,3367.120386,56906810.0,0.012999,3.36712,893601.653061,197364300.0
1994,0.209751,0.716755,0.22371,3.967976,0.828389,3.521437,1.570127,549.323818,816.514307,11316700000.0,3749624.0,13.149959,3368.752615,57738270.0,0.01315,3.368753,910276.153061,200263700.0
1995,0.20981,0.717037,0.224514,3.970051,0.832391,3.523328,1.573532,549.840916,816.695117,11478680000.0,3812622.0,13.193023,3369.597188,58564680.0,0.013193,3.369597,914498.459184,203030200.0
1996,0.209877,0.717293,0.225581,3.972869,0.837124,3.524659,1.576549,550.20511,816.698938,11639880000.0,3876798.0,13.052057,3369.549509,59387130.0,0.013052,3.36955,900249.030612,205739200.0
1997,0.209966,0.717564,0.22698,3.977437,0.843217,3.525602,1.579598,550.435902,816.424037,11800000000.0,3943264.0,12.974895,3368.35191,60204090.0,0.012975,3.368352,892415.397959,208445400.0
1998,0.210075,0.717843,0.228581,3.982739,0.849664,3.525968,1.582334,550.527123,815.983055,11959320000.0,4010870.0,12.911718,3366.353267,61016930.0,0.012912,3.366353,885597.44898,211135500.0
1999,0.2102,0.718117,0.230264,3.987662,0.855211,3.525664,1.58445,550.553431,815.394137,12118040000.0,4080575.0,12.794362,3363.817854,61826720.0,0.012794,3.363818,884252.637755,213829000.0
