In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
# import folium as folium

df = pd.read_csv('../data/all_deaths_clean.csv', encoding='utf-8')

The following analysis is intended to uncover statistically-significant data regarding female vs. male inmate death trends. The scipy library is leveraged, specifically by using the t-test function: `stats.ttest_ind_from_stats` 

In [42]:
# SHOULD NA BE EXCLUDED? MAYBE NOT....
df_filt = df[df["cause_detail_group"].notna()]
df_f = df_filt[df_filt['gender']=='f']
df_m = df_filt[df_filt['gender']=='m']

# Get number of female and male inmates, respectively
n_f = df_f.shape[0]
n_m = df_m.shape[0]

## Male vs. Female inmates dying by drug-related cause

In [44]:
# Filter for drug-related deaths
df_f_drug = df_f[df_f["cause_detail_group"] == "drug"]
df_m_drug = df_m[df_m["cause_detail_group"] == "drug"]

# Get mu for female and male drug-related deaths, respectively
mu_f = df_f_drug.shape[0] / n_f
mu_m = df_m_drug.shape[0] / n_m

# Get standard deviations for female and male drug-related deaths, respectively
std_f = np.std(np.where(df_f["cause_detail_group"] == "drug", 1, 0))
std_m = np.std(np.where(df_m["cause_detail_group"] == "drug", 1, 0))

# Calculate p-value
pval = stats.ttest_ind_from_stats(mu_f, std_f, n_f, mu_m, std_m, n_m)

# Print result of p-value (stat significant or not?)
if pval[1] < 0.05:
    print(f"P-value IS statistically significant (p-value = {pval[1]})")
else:
    print(f"P-value IS NOT statistically significant (p-value = {pval[1]})")

P-value IS statistically significant (p-value = 5.075926751904812e-13)


Filtering for further analysis on `cause_short` identifier.

In [40]:
# F and separate our dataframes
df_filt = df[df['cause_short'].notna()]
df_f = df_filt[df_filt['gender']=='f']
df_m = df_filt[df_filt['gender']=='m']

# Get number of female and male inmates, respectively
n_f = df_f.shape[0]
n_m = df_m.shape[0]

## Male vs. Female inmates dying by medical complication

In [45]:
# Get mu of female and male medical complication deaths, respectively
mu_f = df_f[df_f['cause_short']=='m'].shape[0] / n_f
mu_m = df_m[df_m['cause_short']=='m'].shape[0] / n_m

# Get standard deviations for female and male medical complication deaths, respectively
std_f = np.std(np.where(df_f['cause_short']=='m', 1, 0))
std_m = np.std(np.where(df_m['cause_short']=='m', 1, 0))

# Calculate p-value
pval = stats.ttest_ind_from_stats(mu_f, std_f, n_f, mu_m, std_m, n_m)

# Print result of p-value (stat significant or not?)
if pval[1] < 0.05:
    print(f"P-value IS statistically significant (p-value = {pval[1]})")
else:
    print(f"P-value IS NOT statistically significant (p-value = {pval[1]})")

P-value IS NOT statistically significant (p-value = 0.09196638997698743)


First, I filtered the dataframe to include only rows where 'cause_short' is not NA. Then I make separate dataframes for male vs. female inmates. After, I calculated the relevant sample statistics (sample n, sample mean, sample standard deviation). Finally, I conducted an independent t-test. 

$H_0$: The proportion of incarcerated males who died in custody by medical complications is the same as the proportion of incarcerated females who died in custody of medical complications. 

$H_A$: The proportion of incarcerated males who died in custody by medical complications is **NOT** the same as the proportion of incarcerated females who died in custody of medical complications.

Our calculated p-value was 0.162; using a significance level of 0.05, this means that we fail to reject the null hypothesis. There was **insufficient** evidence to claim that there was a significant difference in the proportion of incarcerated males who died in custody by medical complications vs. the proportion of incarcerated females who died in custody by medical complication. 

## Male vs. Female dying by suicide

In [22]:
# Get mu of female and male suicide deaths, respectively
mu_f_s = df_f[df_f['cause_short']=='s'].shape[0] / n_f
mu_m_s = df_m[df_m['cause_short']=='s'].shape[0] / n_m

# Get standard deviations for female and male suicide deaths, respectively
std_f_s = np.std(np.where(df_f['cause_short']=='s', 1, 0))
std_m_s = np.std(np.where(df_m['cause_short']=='s', 1, 0))

# Calculate p-value
pval = stats.ttest_ind_from_stats(mu_f_s, std_f_s, n_f, mu_m_s, std_m_s, n_m)

# Print result of p-value (stat significant or not?)
if pval[1] < 0.05:
    print(f"P-value IS statistically significant (p-value = {pval[1]})")
else:
    print(f"P-value IS NOT statistically significant (p-value = {pval[1]})")

P-value IS statistically significant (p-value = 1.0912077967916176e-08)


$H_0$: The proportion of incarcerated males who died in custody by suicide is the same as the proportion of incarcerated females who died in custody by suicide. 

$H_A$: The proportion of incarcerated males who died in custody by suicide is **NOT** the same as the proportion of incarcerated females who died in custody by suicide. 

Our calculated p-value was 1.091e-08; using a significance level of 0.05, this means that we CAN reject the null hypothesis. There was **sufficient** evidence to claim that there **IS** a significant difference in the proportion of incarcerated males who died in custody by suicide vs. the proportion of incarcerated females who died in custody by suicide. 

MISSING CF BLOCK FROM ALI'S FILE

# Male vs. Female dying by homicide 

In [23]:
# Get mu of female and male homicide deaths, respectively
mu_f_h = df_f[df_f['cause_short']=='h'].shape[0] / n_f
mu_m_h = df_m[df_m['cause_short']=='h'].shape[0] / n_m

# Get standard deviations for female and male homicide deaths, respectively
std_f_h = np.std(np.where(df_f['cause_short']=='h', 1, 0))
std_m_h = np.std(np.where(df_m['cause_short']=='h', 1, 0))

# Calculate p-value
pval = stats.ttest_ind_from_stats(mu_f_h, std_f_h, n_f, mu_m_h, std_m_h, n_m)

# Print result of p-value (stat significant or not?)
if pval[1] < 0.05:
    print(f"P-value IS statistically significant (p-value = {pval[1]})")
else:
    print(f"P-value IS NOT statistically significant (p-value = {pval[1]})")

P-value IS statistically significant (p-value = 5.561132766859688e-07)


$H_0$: The proportion of incarcerated males who died in custody by homicide is the same as the proportion of incarcerated females who died in custody by homicide. 

$H_A$: The proportion of incarcerated males who died in custody by homicide is **NOT** the same as the proportion of incarcerated females who died in custody by homicide. 

Our calculated p-value was 5.561e-07; using a significance level of 0.05, this means that we CAN reject the null hypothesis. There was **sufficient** evidence to claim that there **IS** a significant difference in the proportion of incarcerated males who died in custody by homicide vs. the proportion of incarcerated females who died in custody by homicide. 