In [10]:
import pandas as pd
from mcod_prep.utils.mcause_io import get_mcause_data
from importlib import import_module

In [5]:
# Cache flags forwarded to get_mcause_data as keyword arguments.
BLOCK_RERUN = {'block_rerun': True, 'force_rerun': False}
# Source whose data is loaded below.
LOCS = "TWN_MOH"

# Load the 'format_map' phase data for LOCS from the "x59" sub-directory.
# The log printed below reports 10 nid-extracts appended into 186741 rows.
df = get_mcause_data(
phase='format_map', source=LOCS, sub_dirs="x59",
data_type_id=9, assert_all_available=True,
verbose=True, **BLOCK_RERUN)

[2019-12-16 20:38:28.557748] Getting datasets to read
[2019-12-16 20:38:28.725927] Reading /ihme/cod/prep/agesak/process_data/mcod/db_cache/nid_locyears.csv
[2019-12-16 20:38:28.889512] Reading /ihme/cod/prep/agesak/process_data/mcod/db_cache/nid_metadata.csv
[2019-12-16 20:38:29.215905] Reading /ihme/cod/prep/agesak/process_data/mcod/db_cache/location_hierarchy_history_v420.csv
[2019-12-16 20:38:29.232764] Got 10 datasets
[2019-12-16 20:38:29.234317] Checking which datasets have available files
[2019-12-16 20:38:29.379923] Found 10 files to read data for.
[2019-12-16 20:38:29.380061] Reading and appending format_map data for 10 nid-extracts
[2019-12-16 20:38:29.973303] Constructed a dataset of 186741 rows


In [6]:
df.head()

Unnamed: 0,age_group_id,cause_id,cause_x59,code_id,deaths,drop_rows,extract_type_id,location_id,nid,pII_in_ncodes,pII_ncodes,pII_x59,pattern,sex_id,x59,year_id
0,4,302,no_int_cause,103584,1,0,1,8,356743,,,0,,1,0,2008
1,4,317,no_int_cause,94,1,0,1,8,356743,,,0,,1,0,2008
2,4,327,no_int_cause,12846,1,0,1,8,356743,,,0,,1,0,2008
3,4,327,no_int_cause,12893,1,0,1,8,356743,,,0,,1,0,2008
4,4,337,no_int_cause,680,1,0,1,8,356743,,,0,,1,0,2008


In [7]:
len(df)

186741

In [8]:
def get_formatting_method(source, data_type_id, year, drop_p2):
    """Return the cleaning function for a source and the arguments to call it with.

    Hospital data (data_type_id 3) all use the function ``clean_hospital_data``,
    called with ``[source, year]``. Every other source uses ``clean_<source>``
    (source lower-cased), called with ``[year, drop_p2]``. The function is looked
    up in the module of the same name under ``mcod_prep.datasets``.

    Inputs
    source (string): source name, e.g. "TWN_MOH"
    data_type_id (int): 3 selects the shared hospital cleaning function
    year (int): year to clean
    drop_p2 (bool): forwarded to non-hospital cleaning functions
    Returns
    (formatting_method, args): the callable and its positional arguments
    Raises
    AttributeError: the module exists but has no function of the same name
    """
    if data_type_id == 3:
        clean_source = 'clean_hospital_data'
        args = [source, year]
    else:
        clean_source = 'clean_' + source.lower()
        args = [year, drop_p2]
    module = import_module(f"mcod_prep.datasets.{clean_source}")
    try:
        formatting_method = getattr(module, clean_source)
    except AttributeError as err:
        # Fail with the real cause; previously this only printed and then crashed
        # on the return statement with an unrelated UnboundLocalError.
        raise AttributeError(
            f"No formatting method found! Check module & main function are named {clean_source}"
        ) from err
    return formatting_method, args

In [186]:
# Look up the TWN_MOH cleaning function and run it for 2008.
# NOTE(review): this rebinds `df`, replacing the format_map frame loaded earlier.
formatting_method, args = get_formatting_method(source="TWN_MOH", data_type_id=9, year=2008, drop_p2=True)
df = formatting_method(*args)

Reading 2008


In [187]:
df.head()

Unnamed: 0,year_id,sex_id,age_group_id,nid,extract_type_id,location_id,code_system_id,deaths,multiple_cause_8,multiple_cause_19,multiple_cause_15,multiple_cause_5,multiple_cause_18,multiple_cause_1,multiple_cause_13,multiple_cause_4,multiple_cause_2,multiple_cause_9,multiple_cause_10,multiple_cause_12,multiple_cause_3,multiple_cause_17,multiple_cause_7,multiple_cause_14,multiple_cause_11,multiple_cause_20,multiple_cause_6,multiple_cause_16,cause
0,2008,2,31,356743,1,8,1,1,0,0,0,E46,0,R788,0,N19,T857,0,0,0,Y848,0,0,0,0,0,F0300,0,Y848
1,2008,2,31,356743,1,8,1,1,0,0,0,0000,0,A419,0,0000,R092,0,0,0,I219,0,0,0,0,0,0000,0,I219
2,2008,2,235,356743,1,8,1,1,0,0,0,0000,0,I64,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,I64
3,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,R570,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,R092
4,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,I38,0,0000,C541,0,0,0,0000,0,0,0,0,0,0000,0,I38


In [20]:
from __future__ import print_function
from builtins import zip
import pandas as pd
import numpy as np
import re
from cod_prep.utils import (
    print_log_message, report_duplicates, clean_icd_codes, report_if_merge_fail
)
from cod_prep.downloaders import get_cause_map, add_code_metadata
from cod_prep.claude.configurator import Configurator


In [147]:

    def get_code_columns(df):
        """Return the names of the raw ICD-code columns.

        These are all columns whose name contains "multiple_cause" but not "pII",
        in frame order, followed by 'cause' (appended unconditionally).
        """
        code_cols = []
        for name in df.columns:
            is_chain_col = "multiple_cause" in name
            if is_chain_col and "pII" not in name:
                code_cols.append(name)
        code_cols.append('cause')
        return code_cols


    def _get_cause_num(mcod_col):
        """Get the sort key for a cause column.

        Any column starting with 'cause' (the underlying cause) sorts as '0'.
        Chain columns must look like multiple_cause_x and sort on the text after
        the second underscore. The key is a string, so ordering is lexicographic
        ('10' sorts before '2'); that is fine as long as the same key is applied
        to both lists being paired.
        """
        if mcod_col.startswith('cause'):
            return '0'
        else:
            assert re.match(r"^multiple_cause_[a-z]*[0-9]*", mcod_col), \
                f"column {mcod_col} does not match expected format: multiple_cause_x"
            return mcod_col.split('_')[2]


    def prep_raw_mapped_cause_dictionary(raw_cols, mapped_cols):
        """Create dictionary of raw cause columns to mapped cause columns.

        Both lists are sorted with the same key and paired position by position,
        so that e.g. "multiple_cause_2" is the key for "multiple_cause_2_mapped".

        Raises
        ValueError: the lists differ in length, or a pair does not share the same
            sort key. Either would otherwise silently produce a wrong or
            truncated pairing.
        """
        raw_cols = sorted(raw_cols, key=_get_cause_num)
        mapped_cols = sorted(mapped_cols, key=_get_cause_num)
        if len(raw_cols) != len(mapped_cols):
            raise ValueError(
                f"Got {len(raw_cols)} raw columns but {len(mapped_cols)} mapped columns"
            )
        mismatched = [
            (raw, mapped) for raw, mapped in zip(raw_cols, mapped_cols)
            if _get_cause_num(raw) != _get_cause_num(mapped)
        ]
        if mismatched:
            raise ValueError(f"Raw and mapped columns do not line up: {mismatched}")
        return dict(zip(raw_cols, mapped_cols))


    def fix_icd_codes(df, codes, code_system_id):
        """Adjust raw ICD9/ICD10 cause codes before mapping.

        For code_system_id 6, underlying-cause codes ('cause' column) that start
        with 8 or 9 get an 'E' prefix. Chain columns are never changed, and no
        adjustment is made for any other code system.

        Inputs
        df (pd dataframe): data with a string 'cause' column; modified in place
        codes (list): raw code column names; currently unused, kept so existing
            callers keep working
        code_system_id (int): code system of the data
        Returns
        df (pd dataframe): the same frame that was passed in
        """
        if code_system_id == 6:
            # according to Mohsen, codes between 800 to 900 need an E if underlying
            # assume 800, 900 codes are N codes if in the chain, don't add any prefix
            # na=False: a missing cause never matches, so the mask stays boolean
            # (a mask containing NaN cannot be used with .loc).
            needs_prefix = df['cause'].str.contains('^[89]', na=False)
            df.loc[needs_prefix, 'cause'] = 'E' + df.loc[needs_prefix, 'cause']
        # As of 11/19/2019 the ICD10 checks (dropping rows with an S/T underlying
        # cause, blanking V/Y codes in the chain to '0000') are intentionally not
        # applied: don't drop/zero out; let it be mapped as garbage.
        return df


    def prep_cause_package_map(cause_package_map):
        """Turn a cause-package map into a {value: map_id} dictionary.

        Only the package name or cause_id (map_id) matters downstream, not the
        individual ICD code level. Before converting, the distinct
        (map_id, map_type) pairs are passed to report_duplicates on 'map_id'.
        """
        id_type_pairs = cause_package_map.loc[:, ['map_id', 'map_type']].drop_duplicates()
        report_duplicates(id_type_pairs, 'map_id')
        map_id_by_value = cause_package_map.set_index('value')['map_id']
        return map_id_by_value.to_dict()


    def prep_cause_map(cause_map):
        """Clean up the cause map and return it as a {value: code_id} dictionary.

        The ICD codes in 'value' are cleaned with clean_icd_codes (decimals
        removed), duplicates on (code_system_id, value) are dropped keeping the
        first occurrence, and code_id is cast to int.

        The input frame is not modified: all work happens on a copy, which also
        avoids pandas' SettingWithCopyWarning on the de-duplicated frame.
        """
        cause_map = cause_map.copy()
        cause_map['value'] = clean_icd_codes(cause_map['value'], remove_decimal=True)
        # duplicates are a result of weird _gc, the duplicates dropped all
        # have the higher sort_order (999999)
        cause_map = cause_map.drop_duplicates(['code_system_id', 'value']).copy()
        cause_map['code_id'] = cause_map['code_id'].astype(int)
        return cause_map.set_index('value')['code_id'].to_dict()

    def map_cause_codes(df, coi_map, coi, cols_to_map=None):
        """Map raw ICD code columns through a cause-of-interest map.

        Inputs
        df (pd dataframe): incoming, unmapped data with ICD codes
        coi_map (dict): special map designed just for one cause of interest,
            applied with Series.map
        coi (string): cause of interest, used as the suffix of the new columns
        cols_to_map (list): columns to map; when empty or None, the columns
            returned by get_code_columns are used
        Returns
        df (pd dataframe): copy of the input where each mapped column has its
            nulls replaced by '0000', plus one "<column>_<coi>" column per
            mapped column
        """
        mapped = df.copy()
        target_cols = cols_to_map if cols_to_map else get_code_columns(mapped)
        # map chain causes using cause of interest map
        for raw_col in target_cols:
            mapped[raw_col] = mapped[raw_col].fillna('0000').astype(object)
            mapped[raw_col + '_' + coi] = mapped[raw_col].map(coi_map)
        return mapped


    def trim_and_remap(df, code_dict, cause_map, code_system_id):
        """Retry unmapped ICD codes on shorter prefixes: 5, then 4, then 3 characters.

        Works on a copy of df. For each raw/mapped column pair in code_dict, rows
        whose mapped value is null are looked up again in cause_map using the raw
        code truncated to the current length. Rows whose raw code is '0000' get
        the mapped value '0000' up front, so they are never retried.

        Inputs
        df (pd dataframe): data holding the raw and mapped columns
        code_dict (dict): raw column name -> mapped column name
        cause_map: mapping from code to mapped value, passed to Series.map
        code_system_id: not used in this function
        Returns
        df (pd dataframe): copy of the input with mapped columns filled where a
            truncated code was found in cause_map
        """
        df = df.copy()
        # before trimming, map "null" chain causes to '0000'
        for code, mapped_code in list(code_dict.items()):
            df.loc[df[code] == '0000', mapped_code] = '0000'

        # trim and re map null mappings
        # n takes the values 5, 4, 3
        for n in reversed(range(3, 6)):
            for code, mapped_code in list(code_dict.items()):
                # scratch column: the raw code, truncated only where still unmapped
                temp_code = 'temp_' + code
                df[temp_code] = df[code].copy()
                try:
                    # apply runs over the whole column, so a single value that
                    # cannot be sliced raises TypeError even if its row is mapped
                    df.loc[df[mapped_code].isnull(), temp_code] = df[temp_code].apply(
                        lambda x: x[0:n])
                except TypeError:
                    # was getting a type error for some unicode issues?
                    # NOTE(review): for any column other than 'cause_mapped' this
                    # overwrites the ENTIRE mapped column with '0000', including
                    # rows that were already mapped -- confirm this is intended.
                    if mapped_code != 'cause_mapped':
                        df[mapped_code] = '0000'
                    else:
                        # NOTE(review): prints the message concatenated with the
                        # whole raw column, not just the offending value.
                        print("problem code here..." + df[code])
                # fill only the rows that are still unmapped
                df.loc[df[mapped_code].isnull(), mapped_code] = df[temp_code].map(cause_map)
                df = df.drop(temp_code, axis=1)
        return df


In [24]:
# Names of the raw ICD-code columns: every multiple_cause_* column plus 'cause'.
raw_cause_cols = get_code_columns(df)

In [114]:
raw_cause_cols

['multiple_cause_8',
 'multiple_cause_19',
 'multiple_cause_15',
 'multiple_cause_5',
 'multiple_cause_18',
 'multiple_cause_1',
 'multiple_cause_13',
 'multiple_cause_4',
 'multiple_cause_2',
 'multiple_cause_9',
 'multiple_cause_10',
 'multiple_cause_12',
 'multiple_cause_3',
 'multiple_cause_17',
 'multiple_cause_7',
 'multiple_cause_14',
 'multiple_cause_11',
 'multiple_cause_20',
 'multiple_cause_6',
 'multiple_cause_16',
 'cause']

In [100]:
# Cache flags passed as keyword arguments to the downloader calls in later cells.
cache_options = {'force_rerun': False, 'block_rerun': True}
# prep_int_cause_map translates 1 to 'icd10' (and 6 to 'icd9').
code_system_id = 1
# Package description(s) that identify the intermediate cause in the map.
full_cause_name = ["unspecified external factor x59"]
# Short name: suffix of the mapped columns and name of the final flag column.
int_cause = "x59"
conf = Configurator()

In [188]:
# With code_system_id == 1 this leaves the codes unchanged: the only active
# adjustment in fix_icd_codes applies to code system 6.
df = fix_icd_codes(df, raw_cause_cols, code_system_id)

In [29]:
from cod_prep.downloaders import get_map_version

In [33]:
# Fetch the cause map for the version returned by get_map_version(1, 'YLL', 'best').
# NOTE(review): the literal 1 is presumably the code system id -- consider
# passing code_system_id instead; confirm against get_map_version's signature.
cause_map = get_cause_map(
    code_map_version_id=get_map_version(1, 'YLL', 'best'), **cache_options)

In [35]:
cause_map.head()

Unnamed: 0,code_system_id,code_system,source_label,value,code_name,code_id,cause_id
0,1,ICD10,,A00,Cholera,1,303
1,1,ICD10,,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol...",2,303
2,1,ICD10,,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor",3,303
3,1,ICD10,,A00.9,"Cholera, unspecified",4,303
4,1,ICD10,,A01,Typhoid and paratyphoid fevers,5,743


In [36]:
# Collapse the cause map into a {decimal-free ICD code: code_id} dictionary.
code_map = prep_cause_map(cause_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [37]:
code_map

{'A00': 1,
 'A000': 2,
 'A001': 3,
 'A009': 4,
 'A01': 5,
 'A010': 6,
 'A0100': 7,
 'A0101': 8,
 'A0102': 9,
 'A0103': 10,
 'A0104': 11,
 'A0105': 12,
 'A0109': 13,
 'A011': 14,
 'A012': 15,
 'A013': 16,
 'A014': 17,
 'A02': 18,
 'A020': 19,
 'A021': 20,
 'A022': 21,
 'A0220': 22,
 'A0221': 23,
 'A0222': 24,
 'A0223': 25,
 'A0224': 26,
 'A0225': 27,
 'A0229': 28,
 'A028': 29,
 'A029': 30,
 'A03': 31,
 'A030': 32,
 'A031': 33,
 'A032': 34,
 'A033': 35,
 'A038': 36,
 'A039': 37,
 'A04': 38,
 'A040': 39,
 'A041': 40,
 'A042': 41,
 'A043': 42,
 'A044': 43,
 'A045': 44,
 'A046': 45,
 'A047': 46,
 'A048': 47,
 'A049': 48,
 'A05': 49,
 'A050': 50,
 'A051': 51,
 'A052': 52,
 'A053': 53,
 'A054': 54,
 'A055': 55,
 'A058': 56,
 'A059': 57,
 'A06': 58,
 'A060': 59,
 'A061': 60,
 'A062': 61,
 'A063': 62,
 'A064': 63,
 'A065': 64,
 'A066': 65,
 'A067': 66,
 'A068': 67,
 'A0681': 68,
 'A0682': 69,
 'A0689': 70,
 'A069': 71,
 'A07': 72,
 'A070': 73,
 'A071': 74,
 'A072': 75,
 'A073': 76,
 'A074': 77,

In [189]:
# First pass: exact lookup of the underlying cause code; codes missing from
# code_map come back as NaN.
df['cause_mapped'] = df['cause'].map(code_map)

In [190]:
df.head()

Unnamed: 0,year_id,sex_id,age_group_id,nid,extract_type_id,location_id,code_system_id,deaths,multiple_cause_8,multiple_cause_19,multiple_cause_15,multiple_cause_5,multiple_cause_18,multiple_cause_1,multiple_cause_13,multiple_cause_4,multiple_cause_2,multiple_cause_9,multiple_cause_10,multiple_cause_12,multiple_cause_3,multiple_cause_17,multiple_cause_7,multiple_cause_14,multiple_cause_11,multiple_cause_20,multiple_cause_6,multiple_cause_16,cause,cause_mapped
0,2008,2,31,356743,1,8,1,1,0,0,0,E46,0,R788,0,N19,T857,0,0,0,Y848,0,0,0,0,0,F0300,0,Y848,39552
1,2008,2,31,356743,1,8,1,1,0,0,0,0000,0,A419,0,0000,R092,0,0,0,I219,0,0,0,0,0,0000,0,I219,11214
2,2008,2,235,356743,1,8,1,1,0,0,0,0000,0,I64,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,I64,11655
3,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,R570,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,R092,26098
4,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,I38,0,0000,C541,0,0,0,0000,0,0,0,0,0,0000,0,I38,11367


In [135]:
# Second pass for causes with no exact match: retry on the code truncated to
# 5, 4, then 3 characters.
df = trim_and_remap(df, {'cause': 'cause_mapped'}, code_map, code_system_id)

In [191]:
# presumably fails loudly if any 'cause' is still unmapped -- confirm against
# cod_prep.utils.report_if_merge_fail
report_if_merge_fail(df, 'cause_mapped', 'cause')

In [192]:
# The mapped value is a code_id. The int cast requires that no NaN remain.
df = df.rename(columns={'cause_mapped': 'code_id'})
df['code_id'] = df['code_id'].astype(int)

In [193]:
# Attach cause_id for each code_id (it appears as the new last column in the
# preview below).
df = add_code_metadata(df, 'cause_id', code_map_version_id=get_map_version(1, 'YLL', 'best'),
                   **cache_options)

In [194]:
df.head()

Unnamed: 0,year_id,sex_id,age_group_id,nid,extract_type_id,location_id,code_system_id,deaths,multiple_cause_8,multiple_cause_19,multiple_cause_15,multiple_cause_5,multiple_cause_18,multiple_cause_1,multiple_cause_13,multiple_cause_4,multiple_cause_2,multiple_cause_9,multiple_cause_10,multiple_cause_12,multiple_cause_3,multiple_cause_17,multiple_cause_7,multiple_cause_14,multiple_cause_11,multiple_cause_20,multiple_cause_6,multiple_cause_16,cause,code_id,cause_id
0,2008,2,31,356743,1,8,1,1,0,0,0,E46,0,R788,0,N19,T857,0,0,0,Y848,0,0,0,0,0,F0300,0,Y848,39552,708
1,2008,2,31,356743,1,8,1,1,0,0,0,0000,0,A419,0,0000,R092,0,0,0,I219,0,0,0,0,0,0000,0,I219,11214,493
2,2008,2,235,356743,1,8,1,1,0,0,0,0000,0,I64,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,I64,11655,743
3,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,R570,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,R092,26098,743
4,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,I38,0,0000,C541,0,0,0,0000,0,0,0,0,0,0000,0,I38,11367,503


In [140]:
full_cause_name

['unspecified external factor x59']

In [195]:
def prep_int_cause_map():
    """Build the {icd_code: package_description} lookup for the intermediate cause.

    Reads mcause_map.xlsx from the 'process_inputs' directory and keeps the rows
    whose lower-cased package_description is in the module-level full_cause_name
    and whose lower-cased code_system matches the module-level code_system_id
    (1 -> 'icd10', 6 -> 'icd9'). Asserts that at least one mapping remains.
    """
    map_dir = conf.get_directory('process_inputs')
    code_system_name = {1: 'icd10', 6: 'icd9'}[code_system_id]
    wanted_cols = ['icd_code', 'package_description', 'code_system']
    df = pd.read_excel(f"{map_dir}/mcause_map.xlsx", dtype={'icd_code': object})
    df = df[wanted_cols].drop_duplicates()

    # normalise case so the comparisons below are case-insensitive
    for text_col in ('package_description', 'code_system'):
        df[text_col] = df[text_col].str.lower()

    # restrict to the package(s) describing this intermediate cause
    is_wanted_package = df['package_description'].isin(full_cause_name)
    df = df.loc[is_wanted_package]

    # a code may belong to only one intermediate cause per code system
    report_duplicates(df, ['icd_code', 'code_system'])

    # keep only the code system currently being processed
    df = df.loc[df['code_system'] == code_system_name]

    assert len(df) > 0, \
        f"There are no mappings for {code_system_name}, {full_cause_name}"

    return dict(zip(df['icd_code'], df['package_description']))

In [142]:
df.head()

Unnamed: 0,year_id,sex_id,age_group_id,nid,extract_type_id,location_id,code_system_id,deaths,multiple_cause_8,multiple_cause_19,multiple_cause_15,multiple_cause_5,multiple_cause_18,multiple_cause_1,multiple_cause_13,multiple_cause_4,multiple_cause_2,multiple_cause_9,multiple_cause_10,multiple_cause_12,multiple_cause_3,multiple_cause_17,multiple_cause_7,multiple_cause_14,multiple_cause_11,multiple_cause_20,multiple_cause_6,multiple_cause_16,cause,code_id,cause_id
0,2008,2,31,356743,1,8,1,1,0,0,0,E46,0,R788,0,N19,T857,0,0,0,Y848,0,0,0,0,0,F0300,0,Y848,39552,708
1,2008,2,31,356743,1,8,1,1,0,0,0,0000,0,A419,0,0000,R092,0,0,0,I219,0,0,0,0,0,0000,0,I219,11214,493
2,2008,2,235,356743,1,8,1,1,0,0,0,0000,0,I64,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,I64,11655,743
3,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,R570,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,R092,26098,743
4,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,I38,0,0000,C541,0,0,0,0000,0,0,0,0,0,0000,0,I38,11367,503


In [196]:
int_cause_map = prep_int_cause_map()
# For every raw code column this adds a "<column>_x59" column holding the
# package description where the ICD code is in int_cause_map (NaN otherwise).
df = map_cause_codes(df, int_cause_map, int_cause)

In [198]:
# List the mapped columns just created: one per raw code column.
[x for x in list(df) if x.endswith("x59")]

['multiple_cause_8_x59',
 'multiple_cause_19_x59',
 'multiple_cause_15_x59',
 'multiple_cause_5_x59',
 'multiple_cause_18_x59',
 'multiple_cause_1_x59',
 'multiple_cause_13_x59',
 'multiple_cause_4_x59',
 'multiple_cause_2_x59',
 'multiple_cause_9_x59',
 'multiple_cause_10_x59',
 'multiple_cause_12_x59',
 'multiple_cause_3_x59',
 'multiple_cause_17_x59',
 'multiple_cause_7_x59',
 'multiple_cause_14_x59',
 'multiple_cause_11_x59',
 'multiple_cause_20_x59',
 'multiple_cause_6_x59',
 'multiple_cause_16_x59',
 'cause_x59']

In [199]:
print_log_message("Trimming ICD codes and remapping chain causes")
# Columns whose name contains the intermediate-cause name, i.e. the mapped columns.
int_cause_cols = [x for x in df.columns if int_cause in x]
# Pair each raw column with its mapped counterpart.
int_cause_col_dict = prep_raw_mapped_cause_dictionary(
    raw_cause_cols, int_cause_cols)

[2019-12-16 22:28:44.065518] Trimming ICD codes and remapping chain causes


In [200]:
# Retry unmapped codes on 5-, 4- and 3-character truncations against the
# intermediate-cause map.
df = trim_and_remap(df, int_cause_col_dict, int_cause_map, code_system_id)

In [151]:
def capture_int_cause(df, int_cause_cols):
    """Add a 0/1 column flagging deaths that involve the intermediate cause.

    The flag column is named after the module-level int_cause. A row gets 1 when
    any column in int_cause_cols holds a value listed in the module-level
    full_cause_name, otherwise 0. Nulls in int_cause_cols are replaced by
    "other". df is modified in place and also returned.
    """
    df[int_cause] = None

    # if self.int_cause in self.inj_causes:
    #     df = self.capture_injuries_pattern(df, int_cause_cols)

    for mapped_col in int_cause_cols:
        filled = df[mapped_col].fillna("other")
        df[mapped_col] = filled
        is_match = filled.isin(full_cause_name)
        df.loc[is_match, int_cause] = 1
    df[int_cause] = df[int_cause].fillna(0)

    assert not df[int_cause].isnull().values.any()

    return df

In [201]:
# Add the x59 flag: 1 where any mapped column holds the intermediate cause, else 0.
df = capture_int_cause(df, int_cause_cols)

In [202]:
df.head()

Unnamed: 0,year_id,sex_id,age_group_id,nid,extract_type_id,location_id,code_system_id,deaths,multiple_cause_8,multiple_cause_19,multiple_cause_15,multiple_cause_5,multiple_cause_18,multiple_cause_1,multiple_cause_13,multiple_cause_4,multiple_cause_2,multiple_cause_9,multiple_cause_10,multiple_cause_12,multiple_cause_3,multiple_cause_17,multiple_cause_7,multiple_cause_14,multiple_cause_11,multiple_cause_20,multiple_cause_6,multiple_cause_16,cause,code_id,cause_id,multiple_cause_8_x59,multiple_cause_19_x59,multiple_cause_15_x59,multiple_cause_5_x59,multiple_cause_18_x59,multiple_cause_1_x59,multiple_cause_13_x59,multiple_cause_4_x59,multiple_cause_2_x59,multiple_cause_9_x59,multiple_cause_10_x59,multiple_cause_12_x59,multiple_cause_3_x59,multiple_cause_17_x59,multiple_cause_7_x59,multiple_cause_14_x59,multiple_cause_11_x59,multiple_cause_20_x59,multiple_cause_6_x59,multiple_cause_16_x59,cause_x59,x59
0,2008,2,31,356743,1,8,1,1,0,0,0,E46,0,R788,0,N19,T857,0,0,0,Y848,0,0,0,0,0,F0300,0,Y848,39552,708,0,0,0,other,0,other,0,other,other,0,0,0,other,0,0,0,0,0,other,0,other,0
1,2008,2,31,356743,1,8,1,1,0,0,0,0000,0,A419,0,0000,R092,0,0,0,I219,0,0,0,0,0,0000,0,I219,11214,493,0,0,0,0000,0,other,0,0000,other,0,0,0,other,0,0,0,0,0,0000,0,other,0
2,2008,2,235,356743,1,8,1,1,0,0,0,0000,0,I64,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,I64,11655,743,0,0,0,0000,0,other,0,0000,other,0,0,0,0000,0,0,0,0,0,0000,0,other,0
3,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,R570,0,0000,R092,0,0,0,0000,0,0,0,0,0,0000,0,R092,26098,743,0,0,0,0000,0,other,0,0000,other,0,0,0,0000,0,0,0,0,0,0000,0,other,0
4,2008,2,20,356743,1,8,1,1,0,0,0,0000,0,I38,0,0000,C541,0,0,0,0000,0,0,0,0,0,0000,0,I38,11367,503,0,0,0,0000,0,other,0,0000,other,0,0,0,0000,0,0,0,0,0,0000,0,other,0


In [208]:
# Drop the per-column *_x59 helper columns, keeping only the overall x59 flag.
# .copy() makes the result an independent frame, so later column assignments
# do not act on a slice of the previous frame (SettingWithCopyWarning).
df = df[[x for x in list(df) if not x.endswith("x59")] + ["x59"]].copy()

In [209]:
# Turn the '0000' placeholder back into missing values in every multiple_cause
# column. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
chain_cols = [x for x in list(df) if "multiple_cause" in x]
df[chain_cols] = df[chain_cols].replace("0000", np.nan)

In [210]:
df.head()

Unnamed: 0,year_id,sex_id,age_group_id,nid,extract_type_id,location_id,code_system_id,deaths,multiple_cause_8,multiple_cause_19,multiple_cause_15,multiple_cause_5,multiple_cause_18,multiple_cause_1,multiple_cause_13,multiple_cause_4,multiple_cause_2,multiple_cause_9,multiple_cause_10,multiple_cause_12,multiple_cause_3,multiple_cause_17,multiple_cause_7,multiple_cause_14,multiple_cause_11,multiple_cause_20,multiple_cause_6,multiple_cause_16,cause,code_id,cause_id,x59
0,2008,2,31,356743,1,8,1,1,,,,E46,,R788,,N19,T857,,,,Y848,,,,,,F0300,,Y848,39552,708,0
1,2008,2,31,356743,1,8,1,1,,,,,,A419,,,R092,,,,I219,,,,,,,,I219,11214,493,0
2,2008,2,235,356743,1,8,1,1,,,,,,I64,,,R092,,,,,,,,,,,,I64,11655,743,0
3,2008,2,20,356743,1,8,1,1,,,,,,R570,,,R092,,,,,,,,,,,,R092,26098,743,0
4,2008,2,20,356743,1,8,1,1,,,,,,I38,,,C541,,,,,,,,,,,,I38,11367,503,0


In [171]:
from cod_prep.downloaders import (
    add_cause_metadata, get_best_cause_hierarchy_version,
    get_current_cause_hierarchy, get_cod_ages, get_age_weights, add_population
)

# Configurator used by get_most_detailed_inj_causes to locate resource files.
# NOTE(review): an earlier cell already creates `conf = Configurator()` without
# arguments -- confirm whether two separate configurators are needed.
CONF = Configurator('standard')

In [175]:
# subset to injuries as UC
def get_most_detailed_inj_causes(int_cause, cause_set_version_id=None,
                                 cause_set_id=3, gbd_round_id=None,
                                 **cache_kwargs):
    """Return the cause_ids of the most detailed injury causes.

    Starts from the current cause hierarchy, keeps causes that are most
    detailed, not YLD-only and not secret, and whose acause contains 'inj'.
    For int_cause "x59" the causes listed in the
    'x59_redistribution_restrictions' resource are removed as well. Causes 729
    (inj_disaster) and 945 (inj_war_warterror) are always removed.

    Inputs
    int_cause (string): intermediate cause; only "x59" triggers the extra filter
    cause_set_version_id, cause_set_id, gbd_round_id: forwarded to
        get_current_cause_hierarchy
    **cache_kwargs: forwarded to get_current_cause_hierarchy
    Returns
    dinj (list): cause_ids, in hierarchy order
    """
    # gbd_round_id is forwarded as given; it used to be replaced by a literal
    # None, which silently ignored the caller's value.
    cause_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        cause_set_id=cause_set_id, gbd_round_id=gbd_round_id, **cache_kwargs)
    cause_df = cause_df.loc[(cause_df['yld_only'] != 1) & (
        cause_df['most_detailed'] == 1) & (cause_df["secret_cause"] != 1)]
    # this only works at the most detailed level
    dinj = list(
        cause_df.loc[cause_df['acause'].str.contains('inj')].cause_id.unique())

    # x59 only unintentional
    if int_cause == "x59":
        # the restrictions file is only needed (and only read) for x59
        restrict_df = pd.read_csv(CONF.get_resource(
            'x59_redistribution_restrictions'))
        # computed once instead of once per candidate cause
        restricted = set(restrict_df.cause_id.unique())
        dinj = [x for x in dinj if x not in restricted]

    # 3/10/2019: Mohsen said drop 729: inj_disaster and
    # 945: inj_war_warterror as targets
    dinj = [x for x in dinj if x not in (729, 945)]

    return dinj


In [176]:
# Most detailed injury cause_ids for x59, taken from cause set 4.
causes = get_most_detailed_inj_causes("x59", cause_set_id=4)

  f"Supplied {entity_type}_set_version_id "


In [178]:
df.head()

Unnamed: 0,year_id,sex_id,age_group_id,nid,extract_type_id,location_id,code_system_id,deaths,multiple_cause_8,multiple_cause_19,multiple_cause_15,multiple_cause_5,multiple_cause_18,multiple_cause_1,multiple_cause_13,multiple_cause_4,multiple_cause_2,multiple_cause_9,multiple_cause_10,multiple_cause_12,multiple_cause_3,multiple_cause_17,multiple_cause_7,multiple_cause_14,multiple_cause_11,multiple_cause_20,multiple_cause_6,multiple_cause_16,cause,code_id,cause_id
0,2008,2,31,356743,1,8,1,1,,,,E46,,R788,,N19,T857,,,,Y848,,,,,,F0300,,Y848,39552,708
1,2008,2,31,356743,1,8,1,1,,,,,,A419,,,R092,,,,I219,,,,,,,,I219,11214,493
2,2008,2,235,356743,1,8,1,1,,,,,,I64,,,R092,,,,,,,,,,,,I64,11655,743
3,2008,2,20,356743,1,8,1,1,,,,,,R570,,,R092,,,,,,,,,,,,R092,26098,743
4,2008,2,20,356743,1,8,1,1,,,,,,I38,,,C541,,,,,,,,,,,,I38,11367,503


In [179]:
len(df)

142283

In [211]:
# Rows with an injury underlying cause or with the x59 flag set. The model input
# will probably look similar to this, possibly with the multiple-cause
# information combined into a single column.
df.loc[(df.cause_id.isin(causes)) | (df.x59==1)]

Unnamed: 0,year_id,sex_id,age_group_id,nid,extract_type_id,location_id,code_system_id,deaths,multiple_cause_8,multiple_cause_19,multiple_cause_15,multiple_cause_5,multiple_cause_18,multiple_cause_1,multiple_cause_13,multiple_cause_4,multiple_cause_2,multiple_cause_9,multiple_cause_10,multiple_cause_12,multiple_cause_3,multiple_cause_17,multiple_cause_7,multiple_cause_14,multiple_cause_11,multiple_cause_20,multiple_cause_6,multiple_cause_16,cause,code_id,cause_id,x59
0,2008,2,31,356743,1,8,1,1,,,,E46,,R788,,N19,T857,,,,Y848,,,,,,F0300,,Y848,39552,708,0
8,2008,1,15,356743,1,8,1,1,,,,,,T794,,,S068,,,,V234,,,,,,,,V234,34825,692,0
18,2008,1,14,356743,1,8,1,1,,,,,,T818,,,Y832,,,,I219,,,,,,,,Y832,39536,708,0
100,2008,1,235,356743,1,8,1,1,,,,,,S065,,,S223,,,,X59,,,,,,,,X59,37920,743,1
107,2008,1,16,356743,1,8,1,1,,,,,,T71,,,T751,,,,W74,,,,,,,,W74,37070,698,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142017,2008,1,14,356743,1,8,1,1,,,,,,T71,,,S280,,,,W20,,,,,,,,W20,36394,707,0
142219,2008,1,11,356743,1,8,1,1,,,,,,S062,,W17,T794,,,,S029,,,,,,,,W17,36344,697,0
142234,2008,1,16,356743,1,8,1,1,,,,,,T794,,,S360,,,,W19,,,,,,,,W19,36383,697,0
142249,2008,1,10,356743,1,8,1,1,,,,,,S068,,,X59,,,,D469,,,,,,,,X59,37920,743,1
