In [1]:
import os
import json
import boto3
import shutil
import pandas as pd

# Name of the S3 bucket this notebook downloads from and uploads to.
# 'ch10-data' is only an example placeholder, NOT your bucket:
# replace it with the name of your own bucket before running.
BUCKET_NAME = 'ch10-data'

In [2]:
# 1. download data from S3 bucket
csv_name = 'New_York_City_Leading_Causes_of_Death.csv'
tmp_dir = 'tmp'
local_csv_path = os.path.join(tmp_dir, csv_name)

# Create the download directory up front instead of waiting for the download
# to fail; exist_ok=True keeps this cell safe to re-run.
os.makedirs(tmp_dir, exist_ok=True)

s3_resource = boto3.resource('s3')
# Keyword arguments make the (Key, Filename) order explicit; note that
# upload_file takes them in the opposite order.
s3_resource.Bucket(BUCKET_NAME).download_file(
    Key=csv_name,
    Filename=local_csv_path)

# read data
df_data = pd.read_csv(local_csv_path)
df_data.head()

Unnamed: 0,Year,Leading Cause,Sex,Race Ethnicity,Deaths,Death Rate,Age Adjusted Death Rate
0,2010,Influenza (Flu) and Pneumonia (J09-J18),F,Hispanic,228,18.7,23.1
1,2008,"Accidents Except Drug Posioning (V01-X39, X43,...",F,Hispanic,68,5.8,6.6
2,2013,"Accidents Except Drug Posioning (V01-X39, X43,...",M,White Non-Hispanic,271,20.1,17.9
3,2010,Cerebrovascular Disease (Stroke: I60-I69),M,Hispanic,140,12.3,21.4
4,2009,"Assault (Homicide: Y87.1, X85-Y09)",M,Black Non-Hispanic,255,30.0,30.0


In [3]:
# 2. swap the "." placeholder for 0, then cast the Deaths column to float
#    (only Deaths is cast here; every other column keeps its dtype)
df_data_cleaned = (
    df_data
    .replace(to_replace='.', value=0)
    .astype({'Deaths': float})
)

# show the resulting dtype of every column
df_data_cleaned.dtypes

Year                         int64
Leading Cause               object
Sex                         object
Race Ethnicity              object
Deaths                     float64
Death Rate                  object
Age Adjusted Death Rate     object
dtype: object

In [4]:
# 3. get top 3 death causes for each ethnicity
top_causes = {}

# Group by the plain column label rather than a one-element list: with a
# list, pandas >= 2.0 yields 1-tuple keys such as ('Hispanic',) when iterating,
# and json.dump cannot serialize tuples as dictionary keys.
for ethnicity, df_g in df_data_cleaned.groupby('Race Ethnicity'):
    # total deaths per cause within this ethnicity, largest first
    df_top_3_causes = (
        df_g.groupby('Leading Cause')[['Deaths']]
        .sum()
        .sort_values('Deaths', ascending=False)
        .head(3)
    )
    # the index holds the cause names, already ordered by total deaths
    top_3_causes = df_top_3_causes.index.tolist()
    top_causes[ethnicity] = top_3_causes

top_causes

{'Asian and Pacific Islander': ['Malignant Neoplasms (Cancer: C00-C97)',
  'Diseases of Heart (I00-I09, I11, I13, I20-I51)',
  'All Other Causes'],
 'Black Non-Hispanic': ['Diseases of Heart (I00-I09, I11, I13, I20-I51)',
  'Malignant Neoplasms (Cancer: C00-C97)',
  'All Other Causes'],
 'Hispanic': ['Diseases of Heart (I00-I09, I11, I13, I20-I51)',
  'Malignant Neoplasms (Cancer: C00-C97)',
  'All Other Causes'],
 'Not Stated/Unknown': ['Diseases of Heart (I00-I09, I11, I13, I20-I51)',
  'All Other Causes',
  'Malignant Neoplasms (Cancer: C00-C97)'],
 'Other Race/ Ethnicity': ['Diseases of Heart (I00-I09, I11, I13, I20-I51)',
  'Malignant Neoplasms (Cancer: C00-C97)',
  'All Other Causes'],
 'White Non-Hispanic': ['Diseases of Heart (I00-I09, I11, I13, I20-I51)',
  'Malignant Neoplasms (Cancer: C00-C97)',
  'All Other Causes']}

In [5]:
# 4. serialize the per-ethnicity results and write them to a JSON file
with open('tmp/top_causes_per_ethnicity.json', 'w') as json_file:
    json_file.write(json.dumps(top_causes))


In [6]:
# 5. push the JSON results file to the S3 bucket
results_bucket = s3_resource.Bucket(BUCKET_NAME)
results_bucket.upload_file(
    Filename='tmp/top_causes_per_ethnicity.json',
    Key='top_causes_per_ethnicity.json')

# remove the local tmp directory and everything in it
shutil.rmtree('./tmp')
