# Project 1: Does the number of mental health facilities in a given area affect the suicide rate?
----

### Data Analysis
#### * Found and downloaded mental health facility locations from the samhsa.gov website. 

---


In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import datetime
from pprint import pprint

# Import API key
#from api_keys import api_key


# Incorporated citipy to determine city based on latitude and longitude
from citipy import citipy

# Output files (CSV).
# These names are consumed by the export cells further down (the to_csv calls),
# so they must be defined here for the notebook to run top to bottom on a
# fresh kernel.
output_facility_data_file = "../data/cleandata/facilities.csv"
output_population_data_file = "../data/cleandata/population.csv"
output_expenditure_data_file = "../data/Expenditure.csv"

## Data Gathering
### * Behavioral Health Treatment Facilities (from Substance Abuse and Mental Health Services Administration)


In [2]:
# Load the raw listing of behavioral health treatment facilities.
facilities_file = "../data/rawdata/Behavioral_Health_Treatment_Facility_listing_2019_03_22_215606.csv"
facilities_df = pd.read_csv(facilities_file)

# Columns needed downstream. Note the raw header for the first name column
# carries four leading spaces.
wanted_columns = [
    '    name1',
    'name2',
    'city',
    'state',
    'zip',
    'latitude',
    'longitude',
]
facilities_df_sub = facilities_df.loc[:, wanted_columns]

# Normalise the padded header so the column can be addressed as "name1".
facilities_df_final = facilities_df_sub.rename(columns={"    name1": "name1"})

# Non-null count per column, as a quick consistency check.
facilities_df_final.count()

name1        9686
name2        5430
city         9686
state        9686
zip          9686
latitude     9685
longitude    9685
dtype: int64

In [3]:
# Drop the row with no latitude/longitude data.
# The name is rebound to the filtered frame rather than using inplace=True,
# so no existing object is mutated and the cell reads as a plain
# "input -> output" step.
facilities_df_final = facilities_df_final.dropna(subset=['latitude', 'longitude'])

# Non-null count per column after the drop.
facilities_df_final.count()

name1        9685
name2        5429
city         9685
state        9685
zip          9685
latitude     9685
longitude    9685
dtype: int64

In [4]:
# Exclude US territories: GU (Guam), AS (American Samoa), PR (Puerto Rico),
# VI (Virgin Islands).
# Note: Washington DC is kept in the data, so the total group of "states" is 51.
territory_codes = ['GU', 'AS', 'PR', 'VI']
in_territory = facilities_df_final.state.isin(territory_codes)
us_facilities_df = facilities_df_final[~in_territory]

# Show per-column counts and the distinct state codes that remain.
print(us_facilities_df.count())
print(us_facilities_df.state.unique())

name1        9614
name2        5379
city         9614
state        9614
zip          9614
latitude     9614
longitude    9614
dtype: int64
['TX' 'LA' 'AR' 'MS' 'OK' 'AL' 'MO' 'TN' 'KS' 'NM' 'FL' 'IL' 'KY' 'GA'
 'CO' 'NE' 'IN' 'IA' 'NC' 'SC' 'AZ' 'VA' 'OH' 'SD' 'WI' 'WY' 'WV' 'MN'
 'UT' 'MI' 'PA' 'CA' 'MD' 'ND' 'NV' 'MT' 'DC' 'ID' 'NY' 'DE' 'NJ' 'CT'
 'MA' 'VT' 'OR' 'RI' 'NH' 'WA' 'ME' 'AK' 'HI']


In [5]:
# Write the filtered facility listing to CSV: header row included,
# pandas index omitted.
us_facilities_df.to_csv(
    output_facility_data_file,
    header=True,
    index=False,
)

## Data Gathering
### * US Population (from Census Bureau)

In [6]:
# US Census Bureau population estimates by state.
us_population_file = "../data/rawdata/PopulationEstimatesByState.csv"

pop_df = (
    pd.read_csv(us_population_file)
    # Rename the abbreviation column to "state" so this frame can be merged
    # with the other data sets on that key.
    .rename(columns={"Abbv": "state", "States": "State Name"})
    # Population arrives as text with thousands separators; remove the commas
    # and convert the column to a numeric dtype.
    .assign(
        Population=lambda frame: pd.to_numeric(
            frame["Population"].str.replace(',', '')
        )
    )
)
print(pop_df.head())

# Write the cleaned table to CSV: header row included, pandas index omitted.
pop_df.to_csv(output_population_data_file, index=False, header=True)

  state  State Name  Population
0    AL     Alabama     4887871
1    AK      Alaska      737438
2    AZ     Arizona     7171646
3    AR    Arkansas     3013825
4    CA  California    39557045


In [1]:
## Data Gathering
### * US Expenditure

In [None]:
# Mental Health Expenditure 2004-2013 in the US
#
# Cleaning steps performed in this cell:
#   * drop the rows whose state code is 'US' or 'PR'
#   * turn 'NR' entries in the FY2011 / FY2013 per-capita columns into NaN
#   * convert those two columns to numeric values
#----------------------------------------------------------------------------
expenditure_data_file = "../data/rawdata/StateMentalHealthServiceExpenditures2004_2013.csv"
expenditure_raw = pd.read_csv(expenditure_data_file)

# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of the raw data (avoids SettingWithCopyWarning).
expenditure_df = expenditure_raw[
    (expenditure_raw.state != 'US') & (expenditure_raw.state != 'PR')
].copy()

# Columns that contain 'NR' markers and therefore need conversion.
per_capita_columns = [
    'FY2011__SMHA Expenditures Per Capita',
    'FY2013__SMHA Expenditures Per Capita',
]

for column in per_capita_columns:
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    without_nr = expenditure_df[column].replace('NR', np.nan, regex=True)
    expenditure_df[column] = pd.to_numeric(without_nr)

expenditure_df

In [None]:
# Write the cleaned expenditure table to CSV: header row included,
# pandas index omitted.
expenditure_df.to_csv(
    output_expenditure_data_file,
    header=True,
    index=False,
)