Analysis of Depression Dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets
from ipywidgets import interactive
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets, interactive

Reading the required CSV files

In [2]:
def read_file(file_name):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name
        Anything ``pd.read_csv`` accepts as its first argument; it is passed
        through unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, read with ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(file_name)
    return frame

Analysis 1: Creating a scatter plot to replicate the article's visualization

In [3]:
# Interactive plot
def plot_scatter(continent):
    """Scatter-plot male against female depression prevalence for 2017.

    Reads ``prevalence_male_females.csv`` and ``continents2.csv``, keeps the
    2017 rows, attaches each row's ``region`` from the continents file and
    draws the (optionally region-filtered) scatter plot.

    Parameters
    ----------
    continent : str
        ``'All'`` to plot every row; otherwise only rows whose ``region``
        equals this value are plotted. Rows that get no region from the merge
        are labelled ``'Others'``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    df1 = read_file("prevalence_male_females.csv")
    df_continents = read_file("continents2.csv")
    # Renaming the columns
    df1 = df1.rename(columns={'Prevalence - Depressive disorders - Sex: Male - Age: Age-standardized (Percent)':'Male',
                              'Prevalence - Depressive disorders - Sex: Female - Age: Age-standardized (Percent)':'Female'})
    # dropna() returns a new frame, so the result has to be kept for the call
    # to have any effect. Only the two plotted columns are checked so that rows
    # are not discarded because of gaps in unrelated columns.
    df1 = df1.dropna(subset=['Male', 'Female'])
    # Filtering the data for one year as mentioned in the article
    df_2017 = df1[df1["Year"] == 2017]

    # Merging df_2017 with df_continents to attach the continent for each row.
    # No `on=` is given, so pandas joins on the columns the two frames share.
    result = pd.merge(df_2017, df_continents, how="left")
    result['region'] = result['region'].fillna('Others')

    # 'All' keeps every row; any other choice narrows the frame to one region.
    if continent != 'All':
        result = result[result['region'] == continent]

    result.plot(kind='scatter', x='Female', y='Male', ylim=(0, 8), xlim=(0, 8),
                figsize=(20, 10))  # scatter plot
    plt.xlabel('Prevalence in Female')
    plt.ylabel('Prevalence in Male')
    plt.title('Prevalence in gender')
    plt.show()

In [4]:
# Build the continent selector first, then hand it to `interactive` together
# with plot_scatter so the dropdown supplies the `continent` argument.
continent_dropdown = ipywidgets.Dropdown(
    options=['All', 'Asia', 'Others', 'Europe', 'Africa', 'Oceania', 'Americas'],
    value='All',
    description='Continent',
)
plot1 = interactive(plot_scatter, continent=continent_dropdown)
plot1

interactive(children=(Dropdown(description='Continent', options=('All', 'Asia', 'Others', 'Europe', 'Africa', …

Reading another dataset to cross-check the above conclusion

In [6]:
df2 = pd.read_csv("GHD_male_female.csv")
# DataFrame.dropna() returns a new frame instead of modifying df2, so the
# result must be assigned back. Only the columns the pivot relies on are
# checked, so gaps in unrelated columns do not remove rows.
df2 = df2.dropna(subset=['sex', 'val'])
# Creates a pivot table dataframe: the total of 'val' for each sex.
# The aggregation is named by string because passing np.sum is deprecated
# in recent pandas releases.
table = pd.pivot_table(df2, values='val', columns=['sex'], aggfunc='sum')
print(table)

sex    Female      Male
val  0.907279  0.546125


Filtering the data for required years

In [10]:
# Keep only the rows for 2017, 2018 and 2019, then average 'val' per sex.
year_mask = df2["year"].isin([2017, 2018, 2019])
df2_2017 = df2.loc[year_mask]
val1 = df2_2017.groupby(['sex'])['val'].agg('mean')
val1

sex
Female    0.030603
Male      0.018774
Name: val, dtype: float64

In [11]:
# List the distinct values of the 'location' column.
df2.loc[:, 'location'].unique()

array(['Global'], dtype=object)

Since the only location is 'Global', we cannot analyse this dataset country-wise.

*** ANALYSIS RELATING TO SECOND HYPOTHESIS(Supriya Jayadev Hiremath)

There is no standard rule for which age ranges count as youth, adults or seniors, so the following source is used as the standard: https://www.statcan.gc.ca/en/concepts/definitions/age2. The function 'age' defined below categorizes the age ranges present in the original dataset into age groups as follows:

1) youth:'15 to 19', '20 to 24' 2) adults: '25 to 29', '30 to 34', '35 to 39','40 to 44', '45 to 49', '50 to 54', '55 to 59', '60 to 64' 3) Seniors: '65 to 69', '70 to 74', '75 to 79', '80 to 84', '85 to 89','90 to 94', '95 plus'


In [None]:
# Age ranges belonging to each category. Seniors start at '65 to 69', matching
# the grouping this notebook states it follows (youth 15-24, adults 25-64,
# seniors 65 and over).
_AGE_GROUPS = {
    'youth': ('15 to 19', '20 to 24'),
    'adults': ('25 to 29', '30 to 34', '35 to 39', '40 to 44', '45 to 49', '50 to 54', '55 to 59',
               '60 to 64'),
    'seniors': ('65 to 69', '70 to 74', '75 to 79', '80 to 84', '85 to 89', '90 to 94', '95 plus'),
}

# Reverse lookup: age range -> category, built once instead of on every row.
_AGE_TO_CATEGORY = {
    age_range: category
    for category, age_ranges in _AGE_GROUPS.items()
    for age_range in age_ranges
}


def age(row: pd.Series) -> str:
    """Map a row's ``age`` range onto 'youth', 'adults' or 'seniors'.

    Parameters
    ----------
    row : pd.Series
        A record with an ``'age'`` entry such as ``'15 to 19'`` or ``'95 plus'``.

    Returns
    -------
    str
        ``'youth'``, ``'adults'`` or ``'seniors'``.

    Raises
    ------
    ValueError
        If ``row['age']`` is not one of the known age ranges. The previous
        version tried to catch ``UnboundLocalError`` inside a ``try`` block
        that could never raise it, so an unknown range crashed at ``return``
        without the intended message.
    """
    category = _AGE_TO_CATEGORY.get(row['age'])
    if category is None:
        raise ValueError(
            'Data set has age group that is not required for computation, '
            'please re-check the dataset. Unexpected age group: {!r}'.format(row['age'])
        )
    return category

This function takes in main_df dataframe which has data grouped by year, location, cause and age_category.

It plots graphs displaying Prevalence of any mental health condition over a period of time for a set of regions/ set of age groups passed as list.

In [None]:
# Reference from DATAPANE site https://docs.datapane.com/examples-and-tutorials/interactive-filters#plotly-1
def interactive_graph(maindf, list_to_display, cause_to_plot, region_or_category):
    """Plot prevalence over time as one line per item, with a dropdown to isolate a line.

    The function works in one of two modes:

    * compare locations - ``list_to_display`` holds locations and
      ``region_or_category`` is the age category they are compared within;
    * compare age groups - ``list_to_display`` holds age categories and
      ``region_or_category`` is the location they are compared within.

    The mode is chosen from the data itself: if ``region_or_category`` is one
    of the values in ``maindf['age_categories']`` the items are treated as
    locations, otherwise as age categories. This replaces a comparison against
    the notebook-global ``list_of_countries``, which is only defined in a later
    cell.

    Parameters
    ----------
    maindf : pandas.DataFrame
        Must contain the columns ``year``, ``location``, ``cause``,
        ``age_categories`` and ``val``.
    list_to_display : list
        Locations or age categories to draw. The list is not modified; traces
        and dropdown entries are created in sorted order.
    cause_to_plot : str
        Value of the ``cause`` column to plot.
    region_or_category : str
        The location or age category held fixed (see the modes above).

    Returns
    -------
    plotly.graph_objects.Figure
        The figure with one scatter trace per item and a dropdown menu.
    """
    final_plot = go.Figure()

    # Sort a copy so the caller's list is left untouched.
    items_to_display = sorted(list_to_display)

    # Work out which column the displayed items live in and which is fixed.
    compare_locations = region_or_category in set(maindf['age_categories'])
    if compare_locations:
        item_column, fixed_column = 'location', 'age_categories'
    else:
        item_column, fixed_column = 'age_categories', 'location'

    # The cause and the fixed column are the same for every trace, so filter once.
    cause_rows = maindf[(maindf['cause'] == cause_to_plot)
                        & (maindf[fixed_column] == region_or_category)]

    for item in items_to_display:
        item_rows = cause_rows[cause_rows[item_column] == item]
        final_plot.add_trace(
            go.Scatter(
                x=item_rows['year'],
                y=item_rows['val'],
                # Label each trace with the item it shows.
                name=item, visible=True
            )
        )

    # One dropdown entry per trace; selecting it shows only that trace.
    buttons = []
    for i, each_val in enumerate(items_to_display):
        visibility = [False] * len(items_to_display)
        visibility[i] = True

        buttons.append(dict(label=each_val,
                            method="restyle",
                            args=[{"visible": visibility}]))

    # The layout is applied once, after all buttons exist, so it is also set
    # when there is nothing to display.
    final_plot.update_layout(
        updatemenus=[dict(
            active=0,
            type="dropdown",
            buttons=buttons,
            x=0.0,
            y=1.0,
            xanchor='right',
            yanchor='top'
        )],
        title="PREVALENCE OF " + cause_to_plot.upper() + " IN " + region_or_category.upper(),
        title_x=0.6,
        xaxis_title="YEAR",
        yaxis_title="PREVALENCE",
        autosize=False,
        width=1000,
        height=800,
        xaxis1_rangeslider_visible=True
    )

    return final_plot

In [None]:
def compute(df: pd.DataFrame):
    """Categorise ages, rescale ``val`` and average it per group.

    When ``df`` has no ``age_categories`` column yet, the frame is modified in
    place: ``age_categories`` is added by applying ``age`` to every row, and
    ``val`` is multiplied by 100 (per the original author's note, the
    downloaded values had been divided by 100). When the column already
    exists, a message is printed and the frame is left as it is.

    In both cases the function then groups by year, location, cause and age
    category and returns the mean ``val`` of each group, so every category has
    a single average value per disorder.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``year``, ``location``, ``cause`` and ``val`` columns, plus
        whatever ``age`` reads when ``age_categories`` is still missing.

    Returns
    -------
    pd.DataFrame
        One row per (year, location, cause, age_categories) with the mean ``val``.
    """
    already_categorised = 'age_categories' in df.columns
    if already_categorised:
        print('Computation has already been performed please check or import the data file again')
    else:
        df['age_categories'] = df.apply(age, axis=1)
        df['val'] *= 100

    group_keys = ['year', 'location', 'cause', 'age_categories']
    return df.groupby(by=group_keys, as_index=False)['val'].mean()


In [None]:
def max_val(df_max, country, identifier):
    """Return the row(s) with the highest ``val`` for one location and cause.

    The four per-disorder branches of the earlier version applied the same
    filter, and any other ``identifier`` left ``country_df`` unassigned. A
    single filter now handles every cause.

    Parameters
    ----------
    df_max : pandas.DataFrame
        Frame with ``location``, ``cause`` and ``val`` columns.
    country : str
        Value of ``location`` to select.
    identifier : str
        Value of ``cause`` to select.

    Returns
    -------
    pandas.DataFrame
        The rows of the selection whose ``val`` equals its maximum (several
        rows on ties); empty when nothing matches ``country`` and ``identifier``.
    """
    selected = (df_max['location'] == country) & (df_max['cause'] == identifier)
    country_df = df_max[selected]

    values = country_df[country_df['val'] == country_df['val'].max()]
    return values

In [None]:
# Columns kept from the global and five-region extracts.
gbd_columns = ['location', 'sex', 'age', 'cause', 'year', 'val']

df_global = pd.read_csv('IHME-GBD_2019_DATA_global.csv', usecols=gbd_columns)
df_five_region = pd.read_csv('IHME-GBD_2019_DATA_allregion_allage.csv', usecols=gbd_columns)
# These two files are read with every column.
df_SD_use = pd.read_csv('IHME-GBD_2019_DATA_sub_use_drug_use_all_regions.csv')
df_socialmedia = pd.read_csv('social media usage.csv')
df_five_region.head()

In [None]:
# Count missing cells over the whole of each frame: isna() flags them, the
# first sum() totals each column and the second sum() totals the columns.
missing_in_regions = df_five_region.isna().sum().sum()
missing_in_global = df_global.isna().sum().sum()
print('There are {} missing values in our five region dataframe'.format(missing_in_regions))
print('There are {} missing values in our global dataframe'.format(missing_in_global))

In [None]:
# Stack the global rows on top of the five-region rows in a single frame.
to_concat = [
    df_global,
    df_five_region,
]
df_noagecat = pd.concat(objs=to_concat)

In [None]:
# Run `compute` on each raw frame, in the same order as before:
# combined global + regions, substance/drug use, five regions only.
main_df, df_substance_and_drug_use, df_five_regions = [
    compute(raw_frame) for raw_frame in (df_noagecat, df_SD_use, df_five_region)
]

Creating lists of countries, all the mental health conditions and the different age categories

In [None]:
# Distinct values of each column, in order of first appearance in main_df.
list_of_countries, list_of_condition, list_of_age_groups = (
    list(main_df[column].unique())
    for column in ('location', 'cause', 'age_categories')
)

In [None]:
# Axis labels shared by both bar charts.
bar_labels = {"val": "Prevalence(c%)", "location": "Region"}

# Chart 1: mean 'val' per region for major depressive disorder only.
df_depression = df_five_regions.loc[df_five_regions['cause'] == 'Major depressive disorder']
main_df_depression = df_depression.groupby('location', as_index=False)['val'].mean()

fig_depression = px.bar(
    main_df_depression,
    x='location',
    y='val',
    title="PREVALENCE OF DEPRESSION IN FIVE MAIN REGIONS",
    labels=dict(bar_labels),
)
fig_depression.show()


# Chart 2: mean 'val' per region across every cause in the frame.
main_df_all = df_five_regions.groupby('location', as_index=False)['val'].mean()

fig_all_disorders = px.bar(
    main_df_all,
    x='location',
    y='val',
    title="PREVALENCE OF MENTAL HEALTH DISORDERS IN FIVE MAIN REGIONS",
    labels=dict(bar_labels),
)
fig_all_disorders.show()

The plots above show the prevalence of major depressive disorder, and of all mental health disorders combined, in the five main regions, i.e., Africa, America, Asia, Europe and Oceania.

In [None]:
# Major depressive disorder by age group, location fixed to the United States of America.
interactive_graph(
    maindf=main_df,
    list_to_display=list_of_age_groups,
    cause_to_plot='Major depressive disorder',
    region_or_category='United States of America',
)

In [None]:
# Major depressive disorder by age group, location fixed to Oceania.
interactive_graph(
    maindf=main_df,
    list_to_display=list_of_age_groups,
    cause_to_plot='Major depressive disorder',
    region_or_category='Oceania',
)

In [None]:
# Major depressive disorder by age group, location fixed to Asia.
interactive_graph(
    maindf=main_df,
    list_to_display=list_of_age_groups,
    cause_to_plot='Major depressive disorder',
    region_or_category='Asia',
)

In [None]:
# Major depressive disorder by age group, location fixed to Global.
interactive_graph(
    maindf=main_df,
    list_to_display=list_of_age_groups,
    cause_to_plot='Major depressive disorder',
    region_or_category='Global',
)

Displaying maximum prevalence value for all kinds of mental disorders in every region for reference.

In [None]:
# Print, for every location and condition, the age group with the highest
# prevalence value.
for each_country in list_of_countries:
    for condition in list_of_condition:
        value_df = max_val(main_df, each_country, condition)
        # A location may have no rows for a given condition; without this
        # guard the [0] lookups below raise IndexError on the empty result.
        if value_df.empty:
            continue
        country = (list(value_df['location']))
        age_group = (list(value_df['age_categories']))
        cause = (list(value_df['cause']))
        max_value = (list(value_df['val']))
        # On ties max_val returns several rows; only the first is reported.
        print('{} : Maximum percent of prevalance for {} condition in {} age group is {:.2f}'.format(country[0], cause[0], age_group[0], max_value[0]),'\n')

In [None]:
# Substance use disorders by age group, location fixed to the United States of America.
interactive_graph(
    maindf=df_substance_and_drug_use,
    list_to_display=list_of_age_groups,
    cause_to_plot='Substance use disorders',
    region_or_category='United States of America',
)

In [None]:
# Drug use disorders by age group, location fixed to the United States of America.
interactive_graph(
    maindf=df_substance_and_drug_use,
    list_to_display=list_of_age_groups,
    cause_to_plot='Drug use disorders',
    region_or_category='United States of America',
)

In [None]:
# Substance use disorders by age group, location fixed to Oceania.
interactive_graph(
    maindf=df_substance_and_drug_use,
    list_to_display=list_of_age_groups,
    cause_to_plot='Substance use disorders',
    region_or_category='Oceania',
)

In [None]:
# Drug use disorders by age group, location fixed to Oceania.
interactive_graph(
    maindf=df_substance_and_drug_use,
    list_to_display=list_of_age_groups,
    cause_to_plot='Drug use disorders',
    region_or_category='Oceania',
)

In [None]:
# Substance use disorders across locations, age category fixed to youth.
interactive_graph(
    maindf=df_substance_and_drug_use,
    list_to_display=list_of_countries,
    cause_to_plot='Substance use disorders',
    region_or_category='youth',
)

In [None]:
# Drug use disorders across locations, age category fixed to youth.
interactive_graph(
    maindf=df_substance_and_drug_use,
    list_to_display=list_of_countries,
    cause_to_plot='Drug use disorders',
    region_or_category='youth',
)

In [None]:
# Read the survey file through the same relative path the earlier loading
# cell uses; the absolute path that was here only exists on one machine.
df_socialmedia = pd.read_csv('social media usage.csv')
# The unnamed first column holds the survey date as month/day/two-digit-year.
df_socialmedia['date'] = pd.to_datetime(df_socialmedia['Unnamed: 0'], format='%m/%d/%y')
df_socialmedia['year'] = df_socialmedia['date'].dt.year
df_social_media = df_socialmedia[['year', '18-29', '30-49', '50-64', '65+']]

# Reshape from one column per age bracket to one row per (year, age bracket).
# Referenced from https://pandas.pydata.org/docs/reference/api/pandas.melt.html
df_new = pd.melt(df_social_media, id_vars="year", var_name="age_groups", value_name="percent_usage")
# Values look like '7%': strip the sign and convert to integers in one step.
df_new['percent_usage'] = df_new['percent_usage'].str.rstrip('%').astype(int)
trend_plot = sns.relplot(data=df_new, x="year", y="percent_usage", hue="age_groups", kind="line", height=10, aspect=2)
