**Objective:** assess racial-ethnic mix in adolescent suicides and contrast with overall suicides.

Other research questions: evolution of the racial-ethnic distribution of adolescent suicides over time (+ relative to the proportion of the population in that age group)

In parallel: evolution of the racial-ethnic distribution of overall suicides and adult suicides (20+) over time (+ relative to the proportion of the US population -- overall or 20+)

For now, we can focus on the national level.

However, we could also assess whether differences among racial-ethnic subgroups are more pronounced in certain HHS regions and/or states.

Along these lines, health journalists at CNN, US News, and NBC were most interested in the racial-ethnic mix in the 5 states with a statistically significant increase in both the absolute number of suicides and the proportion outcome, as well as in California (statistically significant increase in the proportion outcome only).

**Data extraction:**

[CDC Wonder](https://wonder.cdc.gov/mcd.html), Provisional Mortality Statistics, 2018 through Last Month Request & Current Final Multiple Cause of Death Data

Group by: Residence HHS Regions, Residence State, Year, Single Race 6, Hispanic Origin

Cause of death: Intentional self-harm

4 files:
*  All years "Overall"
*  10-19 years "10-19"
*  20-64 years "20-64"
*  20+ years "20plus"

Years after 2010

# Import packages

In [225]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import datetime
from tqdm import tqdm

# Data loader

In [297]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

from typing import Dict, List, Any

# Ten base colours tiled ten times (100 entries); relabel_fig hands out one
# entry per distinct trace name.
color_ = [
    hex_code
    for _ in range(10)
    for hex_code in (
        "#636EFA",
        "#EF553B",
        "#00CC96",
        "#AB63FA",
        "#FFA15A",
        "#19D3F3",
        "#FF6692",
        "#B6E880",
        "#FF97FF",
        "#FECB52",
    )
]


def relabel_fig(fig):
    """Give every trace name one colour and a single legend entry.

    The first trace seen with a given name takes the next colour from the
    module-level ``color_`` palette and keeps its legend entry. Later traces
    with the same name reuse that colour and are hidden from the legend.
    The palette is cycled, so a figure with more distinct names than palette
    entries reuses colours instead of raising ``IndexError``.

    :param fig: object exposing ``for_each_trace``; every trace must provide
        ``name``, ``line.color`` and ``update`` (e.g. a plotly figure made of
        scatter traces).
    """
    # Function-scope import keeps the block self-contained.
    from itertools import cycle

    palette = cycle(color_)  # iterates without mutating the shared list
    colour_by_name = {}

    def assign(trace):
        first_occurrence = trace.name not in colour_by_name
        if first_occurrence:
            colour_by_name[trace.name] = next(palette)
        trace.line.color = colour_by_name[trace.name]
        if not first_occurrence:
            # Only one legend entry per name.
            trace.update(showlegend=False)

    fig.for_each_trace(assign)


class SuicideData:
    """Load CDC Wonder suicide exports and plot aggregates by subgroup.

    Columns the pipeline produces or relies on: ``hhs``, ``year``, ``race``,
    ``ethnicity``, ``ethno_race``, ``age_strat``, ``deaths`` and
    ``population``; :meth:`merge` adds ``suicide_per_100k``.

    The data pipeline works as follows:

    1. :meth:`load_data` lists ``data_folder``, keeps the file names that
       contain the identifier and groups them by age stratum (the last
       whitespace-separated token of the name, extension removed).
    2. :meth:`file_to_dataframe` parses each tab-separated export up to its
       ``"---"`` line and renames the columns.
    3. :meth:`processor` normalises labels, converts numeric columns, drops
       rows containing a ``reject_list`` value and adds ``ethno_race``.
    4. :meth:`merge` sums deaths and population per group and derives
       ``suicide_per_100k``; :meth:`plot` draws one panel per subgroup.
    """

    def __init__(
        self,
        data_folder: str = "Data",
        indexer_columns: List[str] = [
            "hhs",
            "State",
            "year",
            "race",
            "ethnicity",
            "age_strat",
            "ethno_race",
        ],
        drop_cols: List[str] = [
            "Year Code",
            "State Code",
            "HHS Region",
            "Race Code",
            "Crude Rate",
            "Hispanic Origin Code",
        ],
    ):
        """Load every matching file of ``data_folder`` into ``self.data``.

        :param data_folder: folder holding the CDC Wonder txt exports
        :param indexer_columns: columns used, in this order, as the index
            levels of :meth:`select_data`
        :param drop_cols: columns removed from every loaded frame
        """
        # Copy so the shared default list can never be mutated through us.
        self.indexer_columns = list(indexer_columns)
        # Filled by the first processor() call: column -> "is numeric".
        self.numeric_columns = pd.Series(dtype="object")
        self.convert_cols = None
        self.data_folder = data_folder
        # Rows containing any of these values are dropped (original author's
        # note: every value detected only after 2018).
        self.reject_list = [
            "American Indian or Alaska Native",
            "More than one race",
            "Native Hawaiian or Other Pacific Islander",
            "Not Stated",
        ]
        # (x, by) -> sorted list of the unique values of column ``by``
        self.partitions = dict()

        # {age_strat: DataFrame}
        self.data = self.load_data(drop_cols=list(drop_cols))
        # Cache of aggregated tables filled by merge().
        self.processed_data = dict()

    def file_to_dataframe(
        self,
        data_folder: str,
        file: str,
        rename_mapper: Dict[str, str] = {
            "Single Race 6": "race",
            "Race": "race",
            "Residence HHS Region Code": "hhs",
            "HHS Region Code": "hhs",
            "Population": "population",
            "Year": "year",
            "Deaths": "deaths",
            "Hispanic Origin": "ethnicity",
        },
    ) -> pd.DataFrame:
        """Parse one CDC Wonder tab-separated txt export into a dataframe.

        Reading stops at the first line equal to ``"---"`` or at end of
        file, whichever comes first. All values are kept as strings.

        :param data_folder: folder containing ``file``
        :param file: file name inside ``data_folder``
        :param rename_mapper: raw column label -> internal column name
        :return: one row per data line, columns renamed with ``rename_mapper``
        """
        rows = []
        with open(os.path.join(data_folder, file), "r") as handle:
            # Iterating the handle ends at EOF, so a file without the footer
            # marker can no longer loop forever.
            for raw_line in handle:
                stripped = raw_line.rstrip()
                if stripped == '"---"':
                    break
                # strip() also removes the leading tab of data rows, so they
                # carry one field fewer than the header.
                rows.append(stripped.strip().replace('"', "").split("\t"))
        # Drop the first header cell to realign the header with the data rows.
        return pd.DataFrame(rows[1:], columns=rows[0][1:]).rename(
            columns=rename_mapper
        )

    def processor(
        self,
        x: pd.DataFrame,
        force_numeric: List[str] = ["population", "Crude Rate"],
    ) -> pd.DataFrame:
        """Process a dataframe: convert column dtypes, compute new features.

        The first frame processed decides which columns are numeric and
        provides the column labels applied to every later frame.

        :param x: frame from :meth:`file_to_dataframe`; it is not modified
        :param force_numeric: columns converted to numbers even when some of
            their values are not numeric
        :return: cleaned frame with an extra ``ethno_race`` column
        """
        x = x.copy()  # never mutate the caller's frame

        if self.numeric_columns.empty:
            # A column is numeric when every value parses as a number.
            self.numeric_columns = x.apply(
                lambda s: pd.to_numeric(s.replace(np.nan, 0), errors="coerce")
                .notnull()
                .all()
            )
            self.convert_cols = [
                col
                for col, is_numeric in self.numeric_columns.items()
                if (is_numeric or col in force_numeric) and col != "year"
                # keep year as str to prevent unexpected ticks on plots
            ]

        x.columns = self.numeric_columns.index

        # Keep only the digits of the year label.
        x["year"] = x["year"].str.extract(r"(\d+)", expand=False)
        x = x.replace(
            {
                "Not Hispanic or Latino": "Non-Hispanic",
                "Hispanic or Latino": "Hispanic",
                "Not Applicable": np.nan,
                "Unreliable": np.nan,
                "Asian or Pacific Islander": "API",
                "Asian": "API",
                "Black or African American": "Black",
            }
        )

        x[self.convert_cols] = x[self.convert_cols].apply(
            pd.to_numeric, errors="coerce"
        )

        # Drop every row holding a rejected value in any column.
        x = x.loc[~x.isin(self.reject_list).any(axis=1)]

        return x.assign(ethno_race=lambda frame: frame.race + " " + frame.ethnicity)

    def load_data(
        self, drop_cols: List[str] = [], identifier: str = "Data"
    ) -> Dict[str, pd.DataFrame]:
        """Load all files of ``self.data_folder`` whose name contains identifier.

        This method is specific to the suicide_rate project and should be
        rewritten for other projects.

        :param drop_cols: columns to drop from every concatenated frame
        :param identifier: substring a file name must contain to be loaded
        :return: dictionary mapping each age stratum (last token of the file
            name, extension removed) to the concatenation of its files
        """
        # Sorted so the file that fixes the column layout in processor() is
        # the same on every run and platform.
        raw_data = sorted(
            name for name in os.listdir(self.data_folder) if identifier in name
        )

        # (age stratum, second-to-last name token) -> file name
        data = {
            (entry.split()[-1].split(".")[0], entry.split()[-2]): entry
            for entry in raw_data
        }

        dataframes = {}
        for age_strat in sorted({key[0] for key in data}):
            frames = [
                self.processor(
                    self.file_to_dataframe(self.data_folder, file).assign(
                        age_strat=age_strat
                    )
                )
                for (strat, _), file in data.items()
                if strat == age_strat
            ]
            dataframes[age_strat] = pd.concat(frames, axis=0).drop(
                columns=drop_cols
            )
        return dataframes

    def select_data(
        self,
        user_request: Dict[str, Any] = {
            "hhs": slice("HHS1", "HHS4"),
            "age_strat": "20-64",
        },
    ) -> pd.DataFrame:
        """Concatenate all strata and select rows by indexer-column values.

        Each key of ``user_request`` is a name from ``self.indexer_columns``;
        each value is a label or a label slice passed to ``.loc`` for that
        index level.

        Example request: {
            "hhs": slice("HHS1", "HHS4"),
            "age_strat": "20-64",
        }

        selects the rows between HHS1 and HHS4 of the sorted index for the
        20-64 age stratum.

        :raises ValueError: when a key is not an indexer column
        """
        loc_request = [slice(None)] * len(self.indexer_columns)
        for column, selector in user_request.items():
            try:
                position = self.indexer_columns.index(column)
            except ValueError:
                raise ValueError(
                    "{!r} is not an indexer column; choose from {}".format(
                        column, self.indexer_columns
                    )
                ) from None
            loc_request[position] = selector

        return (
            pd.concat(self.data.values())
            .reset_index(drop=True)
            .set_index(self.indexer_columns)
            .sort_index()
            .loc[tuple(loc_request), :]
            .reset_index()
        )

    def selection(self, subpop, df):
        """Return the rows of ``df`` whose third index level equals ``subpop``.

        The chained ``sort_index()`` / ``reset_index()`` calls build new
        objects, so the result is a new frame.
        """
        return df.loc[(slice(None), slice(None), subpop)].sort_index().reset_index()

    @staticmethod
    def _rate_table(df, group_keys):
        """Sum deaths/population per group and add ``suicide_per_100k``."""
        return (
            df.set_index(list(group_keys))[["deaths", "population"]]
            .groupby(level=[0, 1, 2])
            .sum()
            .assign(
                suicide_per_100k=lambda t: 100000.0 * t.deaths / t.population
            )
        )

    def merge(
        self,
        x: str = "year",
        color: str = "age_strat",
        by: str = "race",
    ) -> tuple:
        """Aggregate deaths and population per ``(color, x, by)`` group.

        When ``"age_strat"`` is one of the three columns, every stratum frame
        is aggregated; otherwise only the ``"Overall"`` frame is used.

        :return: ``(table, by_list)``: the table is indexed by
            ``[color, x, by]`` with columns ``deaths``, ``population`` and
            ``suicide_per_100k``; ``by_list`` is the sorted list of values of
            ``by`` (missing and rejected values excluded)
        """
        by_list = self.partitions.get((x, by))
        if by_list is None:
            # NaN labels are dropped: they cannot be sorted with strings.
            by_list = sorted(
                value
                for value in pd.concat(self.data.values())[by].unique()
                if pd.notna(value) and value not in self.reject_list
            )
            # Cache it for next time
            self.partitions[(x, by)] = by_list

        # ``color`` is part of the key: a table is indexed by it, so a table
        # built for another colour column must not be reused.
        cache_key = (x, color, by)
        if cache_key not in self.processed_data:
            group_keys = [color, x, by]
            if "age_strat" in group_keys:
                sources = list(self.data.values())
            else:
                sources = [self.data["Overall"]]
            self.processed_data[cache_key] = (
                pd.concat([self._rate_table(df, group_keys) for df in sources])
                .reset_index()
                .set_index(group_keys)
            )

        table = self.processed_data[cache_key]
        # Kept for code that still reads processed_data[(x, by)]: it always
        # points at the most recently requested table for that pair.
        self.processed_data[(x, by)] = table
        return table, by_list

    def plot(
        self,
        x: str = "year",
        y: str = "deaths",
        color: str = "age_strat",
        by: str = "race",
        scatter: bool = False,
        rows: int = 2,
    ) -> None:
        """Draw one subplot per value of ``by``, save it as PNG and show it.

        :param x: column on the x axis
        :param y: column on the y axis
        :param color: column whose values become separate traces
        :param by: column whose values become separate subplots
        :param scatter: when true, trace lines get a width of 0
        :param rows: number of subplot rows
        """
        table, by_list = self.merge(x=x, color=color, by=by)

        # ceil(len(by_list) / rows), and at least one column
        cols = max(1, -(-len(by_list) // rows))

        fig = make_subplots(
            rows=rows, cols=cols, subplot_titles=tuple(str(v) for v in by_list)
        )

        for i, subpop in enumerate(by_list):
            sub_df = self.selection(subpop, table)
            sub_fig = px.line(sub_df.sort_values(by=[x]), x=x, y=y, color=color)

            # Copy the plotly express traces onto the subplot grid.
            for d in sub_fig.data:
                fig.add_trace(
                    go.Scatter(x=d["x"], y=d["y"], name=d["name"]),
                    row=1 + i // cols,
                    col=1 + i % cols,
                )
        if scatter:
            fig.update_traces(line=dict(width=0))
        relabel_fig(fig)
        fig.update_layout(
            title_text="{}Evolution of {} by {}".format(
                "Temporal " if x == "year" else "", y, by
            ),
            xaxis_title=x,
            yaxis_title=y,
            # Legend entries are the values of ``color``, not of ``by``.
            legend_title=color,
            height=330 * rows,
            width=400 * cols,
        )
        fig.update_xaxes(tickangle=-45)
        # The output folder may not exist yet.
        os.makedirs("outputs", exist_ok=True)
        fig.write_image("outputs/{}_{}_by_{}_color_{}.png".format(y, x, by, color))
        fig.show()

In [256]:
# NOTE: stale probe. SuicideData sets no `column_cache` attribute, which is
# why this cell ends in the AttributeError shown below.
sd.column_cache

AttributeError: 'SuicideData' object has no attribute 'column_cache'

In [298]:
# Build the loader with its default arguments, then run select_data() with its
# default request (hhs from HHS1 to HHS4, age_strat "20-64").
sd = SuicideData()
sd.select_data()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,deaths,population
hhs,State,year,race,ethnicity,age_strat,ethno_race,Unnamed: 7_level_1,Unnamed: 8_level_1
HHS1,Connecticut,2010,API,Non-Hispanic,20-64,API Non-Hispanic,10,94372.0
HHS1,Connecticut,2010,Black,Non-Hispanic,20-64,Black Non-Hispanic,18,213852.0
HHS1,Connecticut,2010,White,Hispanic,20-64,White Hispanic,27,237991.0
HHS1,Connecticut,2010,White,Non-Hispanic,20-64,White Non-Hispanic,243,1559150.0
HHS1,Connecticut,2011,Black,Non-Hispanic,20-64,Black Non-Hispanic,14,217529.0
...,...,...,...,...,...,...,...,...
HHS4,Tennessee,2020,White,Hispanic,20-64,White Hispanic,27,188389.0
HHS4,Tennessee,2020,White,Non-Hispanic,20-64,White Non-Hispanic,795,2964443.0
HHS4,Tennessee,2021,Black,Non-Hispanic,20-64,Black Non-Hispanic,69,693833.0
HHS4,Tennessee,2021,White,Hispanic,20-64,White Hispanic,30,188389.0


In [254]:
# Inspect the processed frame stored for the "20-64" age stratum.
sd.data['20-64']

Unnamed: 0,hhs,State,year,race,ethnicity,deaths,population,age_strat,ethno_race
0,HHS1,Connecticut,2012,Black,Non-Hispanic,12,220727.0,20-64,Black Non-Hispanic
1,HHS1,Connecticut,2012,White,Hispanic,24,254971.0,20-64,White Hispanic
2,HHS1,Connecticut,2012,White,Non-Hispanic,252,1529148.0,20-64,White Non-Hispanic
3,HHS1,Connecticut,2013,Black,Non-Hispanic,13,225016.0,20-64,Black Non-Hispanic
4,HHS1,Connecticut,2013,White,Hispanic,15,263061.0,20-64,White Hispanic
...,...,...,...,...,...,...,...,...,...
478,HHS10,Washington,2016,White,Non-Hispanic,707,3169816.0,20-64,White Non-Hispanic
480,HHS10,Washington,2017,API,Non-Hispanic,59,495036.0,20-64,API Non-Hispanic
481,HHS10,Washington,2017,Black,Non-Hispanic,26,211023.0,20-64,Black Non-Hispanic
482,HHS10,Washington,2017,White,Hispanic,50,459337.0,20-64,White Hispanic


In [229]:
# NOTE(review): this figure is not used in this cell; the plotting cell that
# follows (In [231]) creates its own.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Aggregated deaths/population, indexed by (age_strat, year, ethnicity).
data, _ = sd.merge(x="year", color="age_strat", by="ethnicity")

# pop_share: each row's population as a percentage of the population summed
# over index levels 0 and 1 (age_strat, year), i.e. across ethnicities.
# The frame is then re-indexed as (ethnicity, year, age_strat), which puts
# age_strat on the third level (the level SuicideData.selection filters on).
data = (
    data.assign(
        pop_share=lambda x: 100.0
        * x.population
        / x.groupby(level=[0, 1]).sum().population
    )
    .reset_index()
    .set_index(["ethnicity", "year", "age_strat"])
)

In [230]:
# Show the whole {age_strat: DataFrame} mapping built by load_data().
sd.data

{'20-64':                                 HHS Region HHS Region Code        State  year  \
 0    HHS Region #1  CT, ME, MA, NH, RI, VT            HHS1  Connecticut  2012   
 1    HHS Region #1  CT, ME, MA, NH, RI, VT            HHS1  Connecticut  2012   
 2    HHS Region #1  CT, ME, MA, NH, RI, VT            HHS1  Connecticut  2012   
 3    HHS Region #1  CT, ME, MA, NH, RI, VT            HHS1  Connecticut  2013   
 4    HHS Region #1  CT, ME, MA, NH, RI, VT            HHS1  Connecticut  2013   
 ..                                     ...             ...          ...   ...   
 478         HHS Region #10  AK, ID, OR, WA           HHS10   Washington  2016   
 480         HHS Region #10  AK, ID, OR, WA           HHS10   Washington  2017   
 481         HHS Region #10  AK, ID, OR, WA           HHS10   Washington  2017   
 482         HHS Region #10  AK, ID, OR, WA           HHS10   Washington  2017   
 483         HHS Region #10  AK, ID, OR, WA           HHS10   Washington  2017   
 
     

In [231]:
# Suicide rate (left axis) versus population share (right axis) over time for
# the 20-64 stratum, one pair of traces per ethnicity. Relies on `data` from
# the previous cell, indexed by (ethnicity, year, age_strat) and carrying the
# `pop_share` column.
df = sd.selection("20-64", data).sort_values("year")

fig = make_subplots(specs=[[{"secondary_y": True}]])

for request in ["Hispanic", "Non-Hispanic"]:
    sub_df = df[df.ethnicity == request]

    # Solid line on the primary axis: suicides per 100k. The trace name
    # carries the ethnicity so the legend can tell the groups apart.
    fig.add_trace(
        go.Scatter(
            x=sub_df.year,
            y=sub_df.suicide_per_100k,
            name="{}: suicides per 100k".format(request),
        ),
        secondary_y=False,
    )

    # Dotted line on the secondary axis: share of the population (%).
    fig.add_trace(
        go.Scatter(
            x=sub_df.year,
            y=sub_df.pop_share,
            name="{}: population share (%)".format(request),
            line=dict(dash="dot"),
        ),
        secondary_y=True,
    )

# Figure title
fig.update_layout(
    title_text="Suicide rate and population share by ethnicity, ages 20-64"
)

# x-axis title
fig.update_xaxes(title_text="Year")

# y-axes titles
fig.update_yaxes(title_text="Suicides per 100,000 population", secondary_y=False)
fig.update_yaxes(title_text="Share of population (%)", secondary_y=True)

fig.show()

In [236]:
# Fresh loader, followed by the catalogue of plot configurations tried so far.
sd = SuicideData()
queue = [
    dict(
        x="year",
        y="suicide_per_100k",
        by="ethno_race",
        color="age_strat",
        scatter=False,
        rows=3,
    ),
    dict(
        x="year",
        y="suicide_per_100k",
        by="race",
        color="age_strat",
        scatter=False,
        rows=3,
    ),
    dict(
        x="year",
        y="population",
        by="race",
        color="age_strat",
        scatter=False,
        rows=1,
    ),
    dict(
        x="year",
        y="population",
        color="race",
        by="age_strat",
        scatter=False,
        rows=1,
    ),
    dict(
        x="age_strat",
        y="suicide_per_100k",
        color="race",
        by="year",
        scatter=True,
        rows=4,
    ),
    dict(
        x="age_strat",
        y="suicide_per_100k",
        color="year",
        by="race",
        scatter=True,
        rows=1,
    ),
    dict(
        x="ethno_race",
        y="suicide_per_100k",
        color="year",
        by="age_strat",
        scatter=True,
        rows=2,
    ),
]
# Only this one configuration is drawn; `queue` is not consumed in this cell.
[
    sd.plot(
        x="year",
        y="suicide_per_100k",
        color="hhs",
        by="race",
        scatter=False,
        rows=1,
    )
]

[None]

In [213]:
# NOTE: stale cell. The class keeps aggregated tables in `processed_data`
# and defines no `merged` attribute.
sd.merged[('year','HHS Region Code')].loc

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,deaths,population,suicide_per_100k
race,year,HHS Region Code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
API,2010,HHS1,22,338733.0,6.494791
API,2010,HHS10,27,363232.0,7.433266
API,2010,HHS2,88,1506065.0,5.843041
API,2010,HHS3,60,775594.0,7.736006
API,2010,HHS4,41,553969.0,7.401136
...,...,...,...,...,...
White,2021,HHS5,4885,42498053.0,11.494644
White,2021,HHS6,4458,32679761.0,13.641471
White,2021,HHS7,1781,12155098.0,14.652288
White,2021,HHS8,1935,10746982.0,18.005055


In [205]:
# Stack the per-stratum frames into a single frame for inspection.
pd.concat(sd.data.values())

Unnamed: 0,HHS Region,HHS Region Code,State,year,race,Race Code,ethnicity,Hispanic Origin Code,deaths,population,Crude Rate,age_strat,ethno_race
0,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,2012,Black,2054-5,Non-Hispanic,2186-2,12,220727.0,,20-64,Black Non-Hispanic
1,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,2012,White,2106-3,Hispanic,2135-2,24,254971.0,9.4,20-64,White Hispanic
2,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,2012,White,2106-3,Non-Hispanic,2186-2,252,1529148.0,16.5,20-64,White Non-Hispanic
3,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,2013,Black,2054-5,Non-Hispanic,2186-2,13,225016.0,,20-64,Black Non-Hispanic
4,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,2013,White,2106-3,Hispanic,2135-2,15,263061.0,,20-64,White Hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,"HHS Region #10 AK, ID, OR, WA",HHS10,Washington,2020,White,2106-3,Non-Hispanic,2186-2,947,5132984.0,18.4,Overall,White Non-Hispanic
745,"HHS Region #10 AK, ID, OR, WA",HHS10,Washington,2021,API,A,Non-Hispanic,2186-2,46,738981.0,6.2,Overall,API Non-Hispanic
746,"HHS Region #10 AK, ID, OR, WA",HHS10,Washington,2021,Black,2054-5,Non-Hispanic,2186-2,40,310512.0,12.9,Overall,Black Non-Hispanic
747,"HHS Region #10 AK, ID, OR, WA",HHS10,Washington,2021,White,2106-3,Hispanic,2135-2,56,859344.0,6.5,Overall,White Hispanic


In [686]:
# Scratch cell: index every stratum frame by (color, x, by), keep the two
# count columns and stack the results.
# NOTE(review): "Population" is the raw export label; the loader's default
# rename map now produces "population", so this cell presumably predates it.
x, y = "year", "suicide_perc"
by, color = "race", "age_strat"
pd.concat(
    [
        frame.assign(ethno_race=lambda row: row.race + row.ethnicity)
        .set_index([color, x, by])
        .loc[:, ["deaths", "Population"]]
        for frame in sd.data.values()
    ]
)

Unnamed: 0,Residence HHS Region,hhs,Residence State,Residence State Code,year,Year Code,race,Single Race 6 Code,ethnicity,Hispanic Origin Code,deaths,Population,Crude Rate,age_strat,ethno_race
0,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,9,2018,2018,Black or African American,2054-5,Non-Hispanic,2186-2,23,269173.0,8.5,20plus,Black or African American Non-Hispanic
1,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,9,2018,2018,White,2106-3,Hispanic,2135-2,32,321427.0,10.0,20plus,White Hispanic
2,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,9,2018,2018,White,2106-3,Non-Hispanic,2186-2,344,1912405.0,18.0,20plus,White Non-Hispanic
3,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,9,2019,2019,Black or African American,2054-5,Non-Hispanic,2186-2,33,274133.0,12.0,20plus,Black or African American Non-Hispanic
4,"HHS Region #1 CT, ME, MA, NH, RI, VT",HHS1,Connecticut,9,2019,2019,White,2106-3,Hispanic,2135-2,31,327771.0,9.5,20plus,White Hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,"HHS Region #10 AK, ID, OR, WA",HHS10,Washington,53,2018,2018,White,2106-3,Non-Hispanic,2186-2,49,519992.0,9.4,10-19,White Non-Hispanic
266,"HHS Region #10 AK, ID, OR, WA",HHS10,Washington,53,2019,2019,White,2106-3,Non-Hispanic,2186-2,42,516509.0,8.1,10-19,White Non-Hispanic
267,"HHS Region #10 AK, ID, OR, WA",HHS10,Washington,53,2020,2020,White,2106-3,Hispanic,2135-2,14,166154.0,,10-19,White Hispanic
268,"HHS Region #10 AK, ID, OR, WA",HHS10,Washington,53,2020,2020,White,2106-3,Non-Hispanic,2186-2,37,512801.0,7.2,10-19,White Non-Hispanic


In [682]:
# Print the column labels of every per-stratum frame to compare their schemas.
for df in sd.data.values():
    print(df.columns)

Index(['Residence HHS Region', 'hhs', 'Residence State',
       'Residence State Code', 'year', 'Year Code', 'race',
       'Single Race 6 Code', 'ethnicity', 'Hispanic Origin Code', 'deaths',
       'Population', 'Crude Rate', 'age_strat'],
      dtype='object')
Index(['Residence HHS Region', 'hhs', 'Residence State',
       'Residence State Code', 'year', 'Year Code', 'race',
       'Single Race 6 Code', 'ethnicity', 'Hispanic Origin Code', 'deaths',
       'Population', 'Crude Rate', 'age_strat'],
      dtype='object')
Index(['Residence HHS Region', 'hhs', 'Residence State',
       'Residence State Code', 'year', 'Year Code', 'race',
       'Single Race 6 Code', 'ethnicity', 'Hispanic Origin Code', 'deaths',
       'Population', 'Crude Rate', 'age_strat'],
      dtype='object')


In [563]:
# Per stratum: sum deaths and population by (Year, Single Race 6) and compute
# deaths / population as a plain ratio (not scaled per 100k).
# NOTE(review): uses the raw export labels ("Year", "Single Race 6",
# "Deaths"), which file_to_dataframe's default rename map turns into
# year/race/deaths; presumably written against an earlier revision.
pd.concat(
    {
        key: df.set_index(["Year", "Single Race 6"])[["Deaths", "Population"]]
        .groupby(level=[0, 1])
        .sum()
        .assign(suicide_rate=lambda x: x.Deaths / x.Population)
        for key, df in sd.data.items()
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Deaths,Population,suicide_rate
Unnamed: 0_level_1,Year,Single Race 6,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20plus,2015,American Indian or Alaska Native,362,1188425.0,0.000305
20plus,2015,Asian or Pacific Islander,1101,13557779.0,0.000081
20plus,2015,Black or African American,2135,29097926.0,0.000073
20plus,2015,White,37673,189118506.0,0.000199
20plus,2016,American Indian or Alaska Native,419,1297492.0,0.000323
...,...,...,...,...,...
10-19,2020,White,1964,27119988.0,0.000072
10-19,2021,American Indian or Alaska Native,41,86362.0,0.000475
10-19,2021,Asian,24,792912.0,0.000030
10-19,2021,Black or African American,141,2702938.0,0.000052


In [380]:
# NOTE: stale cell. The class keeps aggregated tables in `processed_data`
# and defines no `merged` attribute.
sd.merged["hhs"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Deaths,Population,suicide_rate
Unnamed: 0_level_1,Year,hhs,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20plus,2015,HHS1,1606,10803223.0,0.000149
20plus,2015,HHS10,2243,9885835.0,0.000227
20plus,2015,HHS2,2331,20973300.0,0.000111
20plus,2015,HHS3,3834,22690519.0,0.000169
20plus,2015,HHS4,9176,47507129.0,0.000193
...,...,...,...,...,...
10-19,2021,HHS5,264,5201953.0,0.000051
10-19,2021,HHS6,328,5202417.0,0.000063
10-19,2021,HHS7,91,1345176.0,0.000068
10-19,2021,HHS8,135,1175738.0,0.000115


In [383]:
# NOTE(review): merge() stores partitions under (x, by) tuple keys, so the
# plain "hhs" key presumably comes from an earlier revision of the class.
sd.partitions["hhs"]

array(['HHS1', 'HHS2', 'HHS3', 'HHS4', 'HHS5', 'HHS6', 'HHS7', 'HHS8',
       'HHS9', 'HHS10'], dtype=object)

In [3]:
# Build the loader from an explicitly named data folder.
data_folder = "Data"  # where the txt files are
sd = SuicideData(data_folder)

In [4]:
# NOTE: stale call. SuicideData defines no `plot_race_data_deaths` method;
# plot() is the only plotting method of the class.
sd.plot_race_data_deaths()

In [5]:
# NOTE: stale call. SuicideData defines no `plot_race_data_rate` method;
# plot() is the only plotting method of the class.
sd.plot_race_data_rate()

In [6]:
# NOTE: stale call. SuicideData defines no `plot_HHS_data_rate` method;
# plot() is the only plotting method of the class.
sd.plot_HHS_data_rate()

In [296]:
sd = SuicideData()
dd = pd.concat(sd.data.values()).reset_index(drop=True).set_index(sd.indexer_columns).sort_index()

user_request = {"hhs": slice("HHS1", "HHS4"), "age_strat": "20-64"}

loc_request=[slice(None)]*len(sd.indexer_columns)
for k, v in user_request.items():
    loc_request[sd.indexer_columns.index(k)] = v
dd.loc[tuple(loc_request), :]




Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,deaths,population
hhs,State,year,race,ethnicity,age_strat,ethno_race,Unnamed: 7_level_1,Unnamed: 8_level_1
HHS1,Connecticut,2010,API,Non-Hispanic,20-64,API Non-Hispanic,10,94372.0
HHS1,Connecticut,2010,Black,Non-Hispanic,20-64,Black Non-Hispanic,18,213852.0
HHS1,Connecticut,2010,White,Hispanic,20-64,White Hispanic,27,237991.0
HHS1,Connecticut,2010,White,Non-Hispanic,20-64,White Non-Hispanic,243,1559150.0
HHS1,Connecticut,2011,Black,Non-Hispanic,20-64,Black Non-Hispanic,14,217529.0
...,...,...,...,...,...,...,...,...
HHS4,Tennessee,2020,White,Hispanic,20-64,White Hispanic,27,188389.0
HHS4,Tennessee,2020,White,Non-Hispanic,20-64,White Non-Hispanic,795,2964443.0
HHS4,Tennessee,2021,Black,Non-Hispanic,20-64,Black Non-Hispanic,69,693833.0
HHS4,Tennessee,2021,White,Hispanic,20-64,White Hispanic,30,188389.0


In [289]:
# Partial-key lookup on the first three index levels of `dd`.
# NOTE(review): the lexsort warning below suggests `dd` was not sorted when
# this cell ran; confirm against the cell that built it.
dd.loc["HHS1", "Connecticut","2012"]


indexing past lexsort depth may impact performance.



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,deaths,population
race,ethnicity,age_strat,ethno_race,Unnamed: 4_level_1,Unnamed: 5_level_1
Black,Non-Hispanic,20-64,Black Non-Hispanic,12,220727.0
White,Hispanic,20-64,White Hispanic,24,254971.0
White,Non-Hispanic,20-64,White Non-Hispanic,252,1529148.0
Black,Non-Hispanic,20plus,Black Non-Hispanic,12,253922.0
White,Hispanic,20plus,White Hispanic,25,277967.0
White,Non-Hispanic,20plus,White Non-Hispanic,305,1992182.0
White,Non-Hispanic,10-19,White Non-Hispanic,14,317732.0
Black,Non-Hispanic,Overall,Black Non-Hispanic,13,364861.0
White,Hispanic,Overall,White Hispanic,26,429231.0
White,Non-Hispanic,Overall,White Non-Hispanic,319,2551460.0


In [266]:
# NOTE(review): "hhds" is not one of the default indexer columns (presumably a
# typo for "hhs"); list.index raises ValueError for a missing value.
sd.indexer_columns.index("hhds")

TypeError: slice indices must be integers or have an __index__ method

In [269]:
# Scratch cell: rebuild the loader and the multi-indexed frame.
sd = SuicideData()
# The indexed frame is neither assigned nor the last expression of the cell,
# so its value is discarded.
pd.concat(sd.data.values()).reset_index(drop=True).set_index(sd.indexer_columns)

# Example request in the {indexer column: label or slice} form that
# select_data() accepts.
request = {"hhs": slice("HHS1", "HHS4"), "age_strat": "20-64"}




#.drop(columns=["level_1"])#.rename(columns={"level_0": "age_strat"})

In [None]:
x: str = "year",
        color: str = "age_strat",
        by: str = "race",
    ) -> pd.DataFrame:
        # get the list of values by
        by_list = self.partitions.get((x, by), None)
        if by_list is None:
            by_list = pd.concat(self.data.values())[by].unique()
            by_list = [k for k in by_list if k not in self.reject_list]
            by_list.sort()

            # Cache it for next time
            self.partitions[(x, by)] = by_list

            if "age_strat" in [x, color, by]:
                self.merged[(x, by)] = (
                    pd.concat(
                        [
                            df.set_index([color, x, by])[["deaths", "population"]]
                            .groupby(level=[0, 1, 2])
                            .sum()
                            .assign(
                                suicide_per_100k=lambda x: 100000.0
                                * x.deaths
                                / x.population,
                            )
                            for df in self.data.values()
                        ]
                    ).reset_index()
                    .set_index([color, x, by])
                )