In [None]:
#chunk 1: global Variables and ICD Code Mappings

#age binning function
def bin_age(age):
    """
    map an age in years onto one of three coarse age groups.

    The value is first coerced with int(); anything int() rejects with a
    ValueError or TypeError (None, blank or non-numeric text, ...) gives None.
    Otherwise the result is:
      - "0-24"  when the integer age is below 25,
      - "25-64" when it lies between 25 and 64 inclusive,
      - "65+"   when it is 65 or more.
    """
    try:
        years = int(age)
    except (ValueError, TypeError):
        return None

    # walk the thresholds from youngest to oldest; each guard only needs the
    # upper bound because the lower bound was excluded by the previous one
    if years < 25:
        return "0-24"
    if years <= 64:
        return "25-64"
    return "65+"

#education binning function   
def bin_education(education):
    """
    collapse years of schooling into two groups:
        -"HS_or_less" for 12 years or fewer
        -"SomeColl_or_more" for 13 through 17 years

    The value is coerced with int(); input that int() rejects with a
    ValueError or TypeError, and any value above 17, gives None.
    """
    try:
        years = int(education)
    except (ValueError, TypeError):
        return None

    # values above 17 fall outside both groups
    if years > 17:
        return None
    return "HS_or_less" if years <= 12 else "SomeColl_or_more"
    
#race binning function
def bin_race(raceWB,raceHS):
    """
    collapse a race code and a hispanic-origin code into two groups:
        -"white/non-hispanic" when raceWB equals 1 and raceHS equals 0
        -"non-white/hispanic" for every other pair of integer codes

    Both arguments are coerced with int(); if either coercion raises a
    ValueError or TypeError the result is None.
    """
    try:
        white_code, hispanic_code = int(raceWB), int(raceHS)
    except (ValueError, TypeError):
        return None

    is_white_non_hispanic = white_code == 1 and hispanic_code == 0
    return "white/non-hispanic" if is_white_non_hispanic else "non-white/hispanic"
    

#ICD-9 Mappings:
# each group is a list of code strings for the category named by the variable;
# the union of all groups is kept as a set for fast membership tests
ICD9_suicide_codes = [f"95{digit}" for digit in range(10)]
ICD9_undetermined_codes = [f"98{digit}" for digit in range(10)]
ICD9_poisoning_codes = [f"85{digit}" for digit in range(10)]
ICD9_drowning_codes = ["910"]
ICD9_firearm_codes = [f"922{digit}" for digit in range(10)]
ICD9_train_codes = [f"80{digit}" for digit in range(8)] + ["810"]
ICD9_mh_related_codes = {
    *ICD9_suicide_codes,
    *ICD9_undetermined_codes,
    *ICD9_drowning_codes,
    *ICD9_train_codes,
    *ICD9_poisoning_codes,
    *ICD9_firearm_codes,
}

# ICD-10 Mappings
# same structure as above: a letter followed by a two-digit number, plus a few
# explicitly listed longer codes
ICD10_suicide_codes = [f"X{number:02d}" for number in range(60, 85)] + ["Y870", "U03"]
ICD10_undetermined_codes = [f"Y{number:02d}" for number in range(10, 35)] + ["Y872", "Y899"]
ICD10_poisoning_codes = [f"X{number:02d}" for number in range(40, 50)]
ICD10_drowning_codes = [f"W{number:02d}" for number in range(65, 75)]
ICD10_firearm_codes = [f"W{number:02d}" for number in range(32, 35)]
ICD10_train_codes = (
    ["V05", "V15", "V806"]
    + [f"V81{digit}" for digit in range(2, 10)]
    + ["V25", "V35", "V45", "V55", "V65", "V75", "V810", "V811", "V876", "V886"]
)
ICD10_mh_related_codes = {
    *ICD10_suicide_codes,
    *ICD10_undetermined_codes,
    *ICD10_poisoning_codes,
    *ICD10_drowning_codes,
    *ICD10_firearm_codes,
    *ICD10_train_codes,
}


#display the ICD code mappings for verification
print("ICD-9 Mental Health Related Codes:", sorted(ICD9_mh_related_codes))
print("ICD-10 Mental Health Related Codes:", sorted(ICD10_mh_related_codes))



In [3]:
#chunk 2: fixed-width line parsers (one per file-layout period)

def parse_line_1989_1995(line):
    """
    parse one fixed-width mortality record from a 1989-1995 file.

    Only records whose month field is '06', '07' or '08' are kept; for any
    other month the function returns None before reading anything else.

    Column positions are 0-based Python slices of `line`:
      - month:      line[54:56], returned as an int (6, 7 or 8)
      - cause:      line[141:145]; mh_related is 1 when the 4-character field,
                    its last three characters or its first three characters
                    are found in ICD9_mh_related_codes, otherwise 0
      - fips:       line[123:125] + line[125:128] (state part + county part)
      - sex:        'M' when line[58:59] is '1', otherwise 'F'
      - age:        chosen by the indicator character line[63]:
                    '0' -> int(line[64:66]); '1' -> int(line[66:68]) + 100;
                    None for any other indicator or unparsable digits
      - race:       bin_race(line[61:62], line[79:81])
      - education:  bin_education(line[51:53])

    Returns a dict with keys "month", "fips", "mh_related", "sex", "age",
    "age_bin", "race" and "education", or None when the month is filtered out.

    Raises IndexError if a record that passes the month filter has fewer
    than 64 characters (the age indicator is read by index).
    """
    month = line[54:56].strip()
    if month not in ('06', '07', '08'):
        return None
    month = int(month)

    # accounting for possible tape variation: try the full 4-character cause
    # field as well as its trailing and leading 3 characters
    cause_candidates = (
        line[141:145].strip(),
        line[142:145].strip(),
        line[141:144].strip(),
    )
    mh_related = 1 if any(code in ICD9_mh_related_codes for code in cause_candidates) else 0

    fips = line[123:125].strip() + line[125:128].strip()

    # anything other than '1' (including a blank field) is recorded as 'F'
    sex = 'M' if line[58:59].strip() == '1' else 'F'

    age_indicator = line[63]
    if age_indicator == '0':
        age_str, age_offset = line[64:66].strip(), 0
    elif age_indicator == '1':
        # NOTE(review): this branch reads line[66:68], not the line[64:66]
        # used for indicator '0' - confirm against the file layout
        age_str, age_offset = line[66:68].strip(), 100
    else:
        age_str, age_offset = '', 0
    try:
        age = int(age_str) + age_offset
    except ValueError:
        # blank or non-numeric digits (and unknown indicators) -> unknown age
        age = None
    age_bin = bin_age(age)

    race_bin = bin_race(line[61:62].strip(), line[79:81].strip())
    education_bin = bin_education(line[51:53].strip())

    return {
        "month": month,
        "fips": fips,
        "mh_related": mh_related,
        "sex": sex,
        "age": age,
        "age_bin": age_bin,
        "race": race_bin,
        "education": education_bin
    }

def parse_line_1996_1998(line):
    """
    parse one fixed-width mortality record from a 1996-1998 file.

    Only records whose month field is '06', '07' or '08' are kept; for any
    other month the function returns None before reading anything else.

    Column positions are 0-based Python slices of `line`:
      - month:      line[54:56], returned as an int (6, 7 or 8)
      - cause:      line[141:145]; mh_related is 1 when the 4-character field,
                    its last three characters or its first three characters
                    are found in ICD9_mh_related_codes, otherwise 0
      - fips:       line[123:125] + line[125:128] (state part + county part)
      - sex:        'M' when line[58:59] is '1', otherwise 'F'
      - age:        chosen by the indicator character line[63]:
                    '0' -> int(line[64:66]); '1' -> int(line[66:68]) + 100;
                    None for any other indicator or unparsable digits
      - race:       bin_race(line[61:62], line[79:81])
      - education:  bin_education(line[51:53])

    Returns a dict with keys "month", "fips", "mh_related", "sex", "age",
    "age_bin", "race" and "education", or None when the month is filtered out.

    Raises IndexError if a record that passes the month filter has fewer
    than 64 characters (the age indicator is read by index).
    """
    month = line[54:56].strip()
    if month not in ('06', '07', '08'):
        return None
    month = int(month)

    # account for icd tape variation: try the full 4-character cause field as
    # well as its trailing and leading 3 characters
    cause_candidates = (
        line[141:145].strip(),
        line[142:145].strip(),
        line[141:144].strip(),
    )
    mh_related = 1 if any(code in ICD9_mh_related_codes for code in cause_candidates) else 0

    fips = line[123:125].strip() + line[125:128].strip()

    # anything other than '1' (including a blank field) is recorded as 'F'
    sex = 'M' if line[58:59].strip() == '1' else 'F'

    age_indicator = line[63]
    if age_indicator == '0':
        age_str, age_offset = line[64:66].strip(), 0
    elif age_indicator == '1':
        # NOTE(review): this branch reads line[66:68], not the line[64:66]
        # used for indicator '0' - confirm against the file layout
        age_str, age_offset = line[66:68].strip(), 100
    else:
        age_str, age_offset = '', 0
    try:
        age = int(age_str) + age_offset
    except ValueError:
        # blank or non-numeric digits (and unknown indicators) -> unknown age
        age = None
    age_bin = bin_age(age)

    race_bin = bin_race(line[61:62].strip(), line[79:81].strip())
    education_bin = bin_education(line[51:53].strip())

    return {
        "month": month,
        "fips": fips,
        "mh_related": mh_related,
        "sex": sex,
        "age": age,
        "age_bin": age_bin,
        "race": race_bin,
        "education": education_bin
    }

def parse_line_1999_2002(line):
    """
    parse one fixed-width mortality record from a 1999-2002 file.

    Only records whose month field is '06', '07' or '08' are kept; for any
    other month the function returns None before reading anything else.

    Column positions are 0-based Python slices of `line`:
      - month:      line[54:56], returned as an int (6, 7 or 8)
      - cause:      line[141:145]; mh_related is 1 when the 4-character field,
                    its last three characters or its first three characters
                    are found in ICD10_mh_related_codes, otherwise 0
      - fips:       line[123:125] + line[125:128] (state part + county part)
      - sex:        'M' when line[58:59] is '1', otherwise 'F'
      - age:        chosen by the indicator character line[63]:
                    '0' -> int(line[64:66]); '1' -> int(line[66:68]) + 100;
                    None for any other indicator or unparsable digits
      - race:       bin_race(line[61:62], line[79:81])
      - education:  bin_education(line[51:53])

    Returns a dict with keys "month", "fips", "mh_related", "sex", "age",
    "age_bin", "race" and "education", or None when the month is filtered out.

    Raises IndexError if a record that passes the month filter has fewer
    than 64 characters (the age indicator is read by index).
    """
    month = line[54:56].strip()
    if month not in ('06', '07', '08'):
        return None
    month = int(month)

    # account for tape location variation: try the full 4-character cause
    # field as well as its trailing and leading 3 characters
    cause_candidates = (
        line[141:145].strip(),
        line[142:145].strip(),
        line[141:144].strip(),
    )
    mh_related = 1 if any(code in ICD10_mh_related_codes for code in cause_candidates) else 0

    fips = line[123:125].strip() + line[125:128].strip()

    # anything other than '1' (including a blank field) is recorded as 'F'
    sex = 'M' if line[58:59].strip() == '1' else 'F'

    age_indicator = line[63]
    if age_indicator == '0':
        age_str, age_offset = line[64:66].strip(), 0
    elif age_indicator == '1':
        # NOTE(review): this branch reads line[66:68], not the line[64:66]
        # used for indicator '0' - confirm against the file layout
        age_str, age_offset = line[66:68].strip(), 100
    else:
        age_str, age_offset = '', 0
    try:
        age = int(age_str) + age_offset
    except ValueError:
        # blank or non-numeric digits (and unknown indicators) -> unknown age
        age = None
    age_bin = bin_age(age)

    race_bin = bin_race(line[61:62].strip(), line[79:81].strip())
    education_bin = bin_education(line[51:53].strip())

    return {
        "month": month,
        "fips": fips,
        "mh_related": mh_related,
        "sex": sex,
        "age": age,
        "age_bin": age_bin,
        "race": race_bin,
        "education": education_bin
    }

def parse_line_2003_2019(line):
    """
    parse one fixed-width mortality record from a 2003-2019 file.

    Only records whose month field is '06', '07' or '08' are kept; for any
    other month the function returns None before reading anything else.

    Column positions are 0-based Python slices of `line`:
      - month:      line[64:66], returned as an int (6, 7 or 8)
      - cause:      line[145:149]; mh_related is 1 when the 4-character field,
                    its last three characters or its first three characters
                    are found in ICD10_mh_related_codes, otherwise 0
      - fips:       line[20:22] + line[22:25] (state part + county part)
      - sex:        'M' when line[68:69] is "M", otherwise 'F'
      - age:        chosen by the indicator character line[69]:
                    '1' -> int(line[70:73]); '9' -> record dropped;
                    any other indicator -> age 0
      - race:       bin_race(line[448:449], hispanic flag), where the flag is
                    0 when int(line[483:486]) lies in 100-199 and 1 otherwise;
                    None when the hispanic field is blank or non-numeric
      - education:  bin_education(line[60:62])

    Returns a dict with keys "month", "fips", "mh_related", "sex", "age",
    "age_bin", "race" and "education". Returns None when the month is
    filtered out, when the age indicator is '9', or when the indicator is '1'
    but the age digits cannot be parsed.

    Raises IndexError if a record that passes the month filter has fewer
    than 70 characters (the age indicator is read by index).
    """
    month = line[64:66].strip()
    if month not in ('06', '07', '08'):
        return None
    month = int(month)

    # account for tape location variation: try the full 4-character cause
    # field as well as its trailing and leading 3 characters
    cause_candidates = (
        line[145:149].strip(),
        line[146:149].strip(),
        line[145:148].strip(),
    )
    mh_related = 1 if any(code in ICD10_mh_related_codes for code in cause_candidates) else 0

    fips = line[20:22].strip() + line[22:25].strip()

    # anything other than "M" (including a blank field) is recorded as 'F'
    sex = 'M' if line[68:69].strip() == "M" else 'F'

    age_indicator = line[69]
    if age_indicator == '1':
        try:
            age = int(line[70:73].strip())
        except ValueError:
            return None
    elif age_indicator != '9':
        # presumably the remaining indicators are sub-year units (months,
        # days, ...) - verify against the file layout; all are stored as 0
        age = 0
    else:
        return None
    age_bin = bin_age(age)

    raceWB = line[448:449].strip()
    try:
        hispanic_code = int(line[483:486].strip())
    except ValueError:
        # a blank/non-numeric hispanic field must not abort the whole run;
        # leave the race group unknown for this record instead
        race_bin = None
    else:
        # codes 100-199 are treated as non-hispanic (flag 0), all others as
        # hispanic (flag 1)
        raceHS = 0 if 100 <= hispanic_code <= 199 else 1
        race_bin = bin_race(raceWB, raceHS)

    education_bin = bin_education(line[60:62].strip())

    return {
        "month": month,
        "fips": fips,
        "mh_related": mh_related,
        "sex": sex,
        "age": age,
        "age_bin": age_bin,
        "race": race_bin,
        "education": education_bin
    }

In [None]:
#chunk 3: aggregation
import os
import pandas as pd
from pathlib import Path
# all locations are derived from the directory the notebook is run from
BASE_DIR = Path.cwd().resolve()

#paths relative to the base directory (adjust as needed)
mortality_folder = (BASE_DIR / ".." / "Data" / "Mortality" / "Mortality_Unnested").resolve()

output_folder = (BASE_DIR / ".."  / "Analysis" / "Analysis_Data").resolve()

urban_csv = (BASE_DIR / ".." / "Data" / "Additional Data" / "Urbanization.csv").resolve()

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order and the progress log reproducible between runs
files = sorted(os.listdir(mortality_folder))
all_records = []

for filename in files:
    filepath = mortality_folder / filename
    if not filepath.is_file():
        # sub-directories cannot be opened as text files - skip them
        continue

    # the year is taken from the first four digits found in the file name
    try:
        year = int(''.join(filter(str.isdigit, filename))[:4])
    except ValueError as e:
        print(f"Could not extract year from {filename}: {e}")
        continue
    file_year = year

    # pick the parser matching the record layout used in that year
    if year <= 1995:
        parser = parse_line_1989_1995
    elif year <= 1998:
        parser = parse_line_1996_1998
    elif year <= 2002:
        parser = parse_line_1999_2002
    else:
        parser = parse_line_2003_2019

    with open(filepath, 'r', encoding='latin1') as f:
        for line in f:
            record = parser(line)
            # parsers return None for records they filter out
            if record is not None:
                record["year"] = file_year
                all_records.append(record)
    print(f"Processed file {filename} (year: {file_year})")

df = pd.DataFrame(all_records)


urban_df = pd.read_csv(urban_csv)
# left-pad the codes with zeros to 5 characters so they can be compared with
# the 5-character string FIPS codes built from the mortality records
urban_df['FIPS code'] = urban_df['FIPS code'].astype(str).str.zfill(5)


#lookup table: two-letter state abbreviation -> two-digit state FIPS code
state_abbrev_to_fips = {
    "AL": "01", "AK": "02", "AZ": "04", "AR": "05",
    "CA": "06", "CO": "08", "CT": "09", "DE": "10",
    "DC": "11", "FL": "12", "GA": "13", "HI": "15",
    "ID": "16", "IL": "17", "IN": "18", "IA": "19",
    "KS": "20", "KY": "21", "LA": "22", "ME": "23",
    "MD": "24", "MA": "25", "MI": "26", "MN": "27",
    "MS": "28", "MO": "29", "MT": "30", "NE": "31",
    "NV": "32", "NH": "33", "NJ": "34", "NM": "35",
    "NY": "36", "NC": "37", "ND": "38", "OH": "39",
    "OK": "40", "OR": "41", "PA": "42", "RI": "44",
    "SC": "45", "SD": "46", "TN": "47", "TX": "48",
    "UT": "49", "VT": "50", "VA": "51", "WA": "53",
    "WV": "54", "WI": "55", "WY": "56",
}

def convert_abbrev_fips(abbrev_fips):
    """
    turn a code such as "AL001" (state abbreviation + county digits) into the
    all-numeric 5-character form "01001".

    The first two characters are looked up in state_abbrev_to_fips and the
    remainder is left-padded with zeros to three characters. When the prefix
    is not in the table the input is returned unchanged.
    """
    numeric_state = state_abbrev_to_fips.get(abbrev_fips[:2])
    if numeric_state is None:
        return abbrev_fips  # unknown prefix: hand the value back untouched
    return numeric_state + abbrev_fips[2:].zfill(3)

# codes that start with two letters are translated to their numeric form;
# every other code is kept exactly as parsed
df["fips_standard"] = df["fips"].apply(lambda x: convert_abbrev_fips(x) if x[:2].isalpha() else x)
# left join: every mortality record is kept even without a match in urban_df
df= pd.merge(df, urban_df, left_on="fips_standard", right_on="FIPS code", how="left")


df = df.sort_values(by=["fips_standard", "year", "month"])

# NOTE(review): groupby uses dropna=True by default, so rows whose grouping
# key is missing (e.g. no 'Urban' match, or a None age/race/education group)
# are left out of the corresponding table - confirm that this is intended
monthly_overall = df.groupby(['fips_standard', 'Urban','year', 'month'])['mh_related'].sum().reset_index(name='total_deaths')
monthly_sex = df.groupby(['fips_standard', 'Urban','year', 'month', 'sex'])['mh_related'].sum().reset_index(name='deaths_gender')
monthly_age = df.groupby(['fips_standard', 'Urban','year', 'month', 'age_bin'])['mh_related'].sum().reset_index(name='deaths_age')
# the education table leaves out records in the '0-24' age group
monthly_education = df[df['age_bin'] != '0-24'].groupby( ['fips_standard', 'Urban', 'year', 'month', 'education'])['mh_related'].sum().reset_index(name='deaths_education')
monthly_race = df.groupby(['fips_standard', 'Urban','year', 'month', 'race'])['mh_related'].sum().reset_index(name='deaths_race')


# to_csv does not create missing directories, so make sure the target exists
output_folder.mkdir(parents=True, exist_ok=True)

#save the grouped data to CSV files. can edit to change location creation
monthly_overall.to_csv(output_folder / "monthly_overall.csv", index=False)
monthly_sex.to_csv(output_folder / "monthly_sex.csv", index=False)
monthly_age.to_csv(output_folder / "monthly_age.csv", index=False)
monthly_education.to_csv(output_folder / "monthly_education.csv", index=False)
monthly_race.to_csv(output_folder / "monthly_race.csv", index=False)
