In [None]:
# Dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats

In [None]:
# Load the cleaned dataset and drop the "country-year" column in one step
data = pd.read_csv("data/sub_dataset.csv").drop(columns="country-year")
data.head()

In [None]:
# Number of rows recorded for each age group
data.loc[:, "age"].value_counts()

In [None]:
# Number of rows recorded for each sex
data.loc[:, "sex"].value_counts()

In [None]:
# Grouping: total number of suicides for every (age, sex) combination

analysis_columns = ["country", "year", "sex", "age", "suicides_no"]
sub_data = data.loc[:, analysis_columns]
group_data = sub_data.groupby(by=["age", "sex"])

# One-column frame ("suicides_no") indexed by (age, sex)
suicide_sum = group_data["suicides_no"].sum().to_frame()

In [None]:
# DataFrame Overview
#
# rename() and the sort methods return new objects instead of modifying the
# frame, so they are chained here to actually affect what is displayed.
# suicide_sum itself is deliberately left untouched: the following cells still
# select its "suicides_no" column.
(
    suicide_sum
    .rename(columns={"suicides_no": "suicides_sum"})
    .sort_index(level="age")
    .head()
)

In [None]:
# Unstack the DataFrame to create bar chart
#
# The wide frame is rebuilt from group_data instead of from suicide_sum itself,
# so re-running this cell always yields the same result (unstacking an already
# unstacked frame would collapse it into a Series).
suicide_sum = group_data["suicides_no"].sum().to_frame().unstack(level="sex")
suicide_sum

In [None]:
# Sort the age groups numerically instead of alphabetically.
#
# The order is derived from the first number in each age label (e.g. "5-14 years"
# -> 5, "75+ years" -> 75), so it does not depend on how many age groups exist
# or on their position in the frame.

# Drop the outer "suicides_no" column level; the guard keeps the cell re-runnable
# once the columns have already been flattened to female / male.
if isinstance(suicide_sum.columns, pd.MultiIndex):
    suicide_sum = suicide_sum["suicides_no"]

age_lower_bound = suicide_sum.index.str.extract(r"(\d+)", expand=False).astype(int)
suicide_sum = suicide_sum.iloc[age_lower_bound.argsort()]
suicide_sum

In [None]:
# Grouped bar chart: one bar per sex within every age group
bar_colors = ["orangered", "steelblue"]
ax = suicide_sum.plot.bar(rot=0, figsize=(10, 4), color=bar_colors,
                          fontsize=12, alpha=0.8)

# NOTE(review): legend labels assume the columns are ordered female, male - confirm
ax.legend(["Female", "Male"], title="Sex")
ax.set_title("Number of Suicides within Each Age Group")
ax.set_xlabel("Age Group")
ax.set_ylabel("Count")
ax.grid()
ax.get_figure().savefig("Images/age_group_bar.png")

plt.show()

In [None]:
# Total suicide count per sex, used for the second bar chart
sex_group = sub_data.groupby(by=["sex"])
sum_by_sex = sex_group["suicides_no"].sum().to_frame()
sum_by_sex

In [None]:
# Create the figure and axes explicitly and hand the axes to pandas.
# DataFrame.plot opens its own figure when no `ax` is given, so a bare
# plt.figure() beforehand would only leave an extra, empty figure behind.
fig, ax = plt.subplots()

sum_by_sex.plot(kind="bar", color="steelblue", width=0.3, alpha=0.8, rot=0,
                fontsize=12, ax=ax)
ax.set_title("Number of Suicides by Sex")
ax.set_ylabel("Count")
ax.legend(["Suicides Num"], loc="best")
fig.savefig("Images/sex_group_bar.png")
plt.show()

In [None]:
# Yearly suicide totals for each sex, shown as context for the two-sample
# t-test on female and male data in the next cell.
# (scipy.stats is imported with the other dependencies at the top of the notebook.)
df3 = sub_data.groupby(["sex", "year"])
df3_sum = df3["suicides_no"].sum().to_frame()
df3_sum

In [None]:
# Split the rows by sex, then compare the per-row suicide counts of the two
# groups; equal_var=False selects Welch's t-test (no equal-variance assumption).
is_female = sub_data["sex"] == "female"
is_male = sub_data["sex"] == "male"
df_female = sub_data[is_female]
df_male = sub_data[is_male]

stats.ttest_ind(
    df_female["suicides_no"],
    df_male["suicides_no"],
    equal_var=False,
)

In [None]:
# Male-to-female ratio of total suicides for each age group.
# True division keeps the fractional part of the ratio; floor division (//)
# would truncate it (e.g. 3.9 -> 3) and understate the difference.
suicide_sum["male_to_female_ratio"] = suicide_sum["male"] / suicide_sum["female"]
suicide_sum