# Generate Individuals
We use census data and other sources to generate synthetic ("statistical") individuals, which are used to train and test the risk assessment tool.

In [83]:
import pandas as pd
from pathlib import Path
import numpy as np
import random

# Ignore warnings
import warnings
warnings.simplefilter(action='ignore')

In [10]:
# Import shooter DataFrame and get the columns as reference
# (`cols` is reused below to give the general-population table the same columns)
csv_shooter = Path('model_blackbox_shooters.csv')
shooter = pd.read_csv(csv_shooter)
cols = shooter.columns
cols

Index(['Age', 'Gender', 'Race', 'Immigrant', 'Education', 'RelStatus',
       'Employed', 'Work', 'MilService', 'Arrested', 'ParentDivorce', 'SES',
       'MentalIllness', 'MentalIllnessHistory', 'Autism', 'HealthIssues',
       'Classification'],
      dtype='object')

In [14]:
# Create an empty DataFrame to store the info about the general population sample.
# It gets the same columns as the shooter DataFrame (`cols`), created in one call
# instead of column by column.
genpop_df = pd.DataFrame(columns=cols)

genpop_df

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification


In [28]:
# Load the statistics for the US population
csv = Path('model_blackbox_stats.csv')
stats_df = pd.read_csv(csv)

# Select one state to get the statistics from
SELECT_STATE = 'CA'
sel_stats_df = stats_df.loc[stats_df['State'] == SELECT_STATE, :]

# Fail early with a clear message: the generator functions below read the first
# row with `.values[0]`, which raises a cryptic IndexError on an empty selection
if sel_stats_df.empty:
    raise ValueError(f"No statistics found for state {SELECT_STATE!r}")

sel_stats_df

Unnamed: 0,State,Employed_Employed,Employed_Unemployed,Employed_Military,Employed_NotLabour,EmplType Private wage and salary workers,EmplType Government workers,EmplType Self-employed in own not incorporated business workers,EmplType Unpaid family workers,"SES less than $10,000",...,RelStatus Male Widowed,RelStatus Male Divorced,RelStatus Female Never married,"RelStatus Female Now married, except separated",RelStatus Female Separated,RelStatus Female Widowed,RelStatus Femaleale Divorced,Mental_Illness_Rate,Arrest_Rate,Autism_Rate
4,CA,58.9,4.2,0.4,36.5,78.3,13.4,8.1,0.2,5.1,...,2.2,7.5,33.9,45.3,2.4,7.7,10.8,20.49,2.7633,2.36


In [32]:
# List the statistics (columns) available for the selected state
sel_stats_df.columns

Index(['State', 'Employed_Employed', 'Employed_Unemployed',
       'Employed_Military', 'Employed_NotLabour',
       'EmplType Private wage and salary workers',
       'EmplType Government workers',
       'EmplType Self-employed in own not incorporated business workers',
       'EmplType Unpaid family workers', 'SES less than $10,000',
       'SES between $10,000 to $14,999', 'SES between $15,000 to $24,999',
       'SES between $25,000 to $34,999', 'SES between $35,000 to $49,999',
       'SES between $50,000 to $74,999', 'SES between $75,000 to $99,999',
       'SES between $100,000 to $149,999', 'SES between $150,000 to $199,999',
       'SES $200,000 or more', 'Families households', 'Nonfamily households',
       'SES Below poverty line', 'Age Under 5 years', 'Age 5 to 9 years',
       'Age 10 to 14 years', 'Age 15 to 19 years', 'Age 20 to 24 years',
       'Age 25 to 34 years', 'Age 35 to 44 years', 'Age 45 to 54 years',
       'Age 55 to 59 years', 'Age 60 to 64 years', 'Age 65 

### Age probability distribution

In [84]:
def get_random_age(sel_stats_df):
    """Draw a random age in whole years from the age distribution in the statistics.

    An age bracket is sampled according to the bracket percentages found in the
    first row of ``sel_stats_df``; the age is then picked uniformly among the
    whole years of that bracket.

    Parameters
    ----------
    sel_stats_df : pandas.DataFrame
        Statistics table containing the ``'Age ...'`` percentage columns listed
        below. Only the first row is read.

    Returns
    -------
    int
        An age between 0 and 94 (inclusive).

    Raises
    ------
    ValueError
        If the bracket percentages do not add up to a positive, finite number.
    """
    # Statistics column -> inclusive (youngest, oldest) age of the bracket.
    # The open-ended '85 years and over' bracket is capped at 94.
    age_brackets = {
        'Age Under 5 years': (0, 4),
        'Age 5 to 9 years': (5, 9),
        'Age 10 to 14 years': (10, 14),
        'Age 15 to 19 years': (15, 19),
        'Age 20 to 24 years': (20, 24),
        'Age 25 to 34 years': (25, 34),
        'Age 35 to 44 years': (35, 44),
        'Age 45 to 54 years': (45, 54),
        'Age 55 to 59 years': (55, 59),
        'Age 60 to 64 years': (60, 64),
        'Age 65 to 74 years': (65, 74),
        'Age 75 to 84 years': (75, 84),
        'Age 85 years and over': (85, 94),
    }

    # Bracket percentages from the statistical data
    weights = np.array(
        [sel_stats_df[column].values[0] for column in age_brackets],
        dtype=float,
    )
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError(f"Age percentages must sum to a positive number, got {total}")

    # The percentages may not sum to exactly 100 (e.g. because of rounding) and
    # np.random.choice rejects probabilities that do not sum to 1, so normalise
    probabilities = weights / total

    # Get age bracket based on age probability distribution
    bounds = list(age_brackets.values())
    youngest, oldest = bounds[np.random.choice(len(bounds), p=probabilities)]

    # All ages within the bracket have the same probability (both ends included)
    return random.randint(youngest, oldest)

### Gender probability distribution

In [133]:
def get_random_gender(sel_stats_df):
    """Draw a random gender: 'Male', 'Female' or 'Other'.

    The male/female split comes from the 'Gender ratio (males per 100 females)'
    column (first row of ``sel_stats_df``). A fixed share is reserved for
    'Other' and taken in equal parts from the male and female shares.
    """
    labels = ('Male', 'Female', 'Other')

    # Share of non-binary individuals
    # Source: https://www.pewresearch.org/short-reads/2022/06/07/about-5-of-young-adults-in-the-u-s-say-their-gender-is-different-from-their-sex-assigned-at-birth/
    share_other = 0.016

    # Males per 100 females -> share of males before the non-binary adjustment
    males_per_100_females = sel_stats_df['Gender ratio (males per 100 females)'].values[0]
    unadjusted_male = males_per_100_females / (males_per_100_females + 100)

    # Half of the non-binary share comes out of the male share; the female
    # share is whatever remains so that the three shares sum to 1
    share_male = unadjusted_male - share_other / 2
    share_female = 1 - share_male - share_other

    weights = [share_male, share_female, share_other]
    picked = np.random.choice(len(labels), p=weights)
    return labels[picked]

### Race probability distribution

In [205]:
def get_random_race(sel_stats_df):
    """Draw a random race label from the race distribution in the statistics.

    Parameters
    ----------
    sel_stats_df : pandas.DataFrame
        Statistics table containing the ``'Race ...'`` percentage columns listed
        below. Only the first row is read.

    Returns
    -------
    str
        One of 'Latinx', 'White', 'Black', 'Asian' or 'Other'.

    Raises
    ------
    ValueError
        If the race percentages do not add up to a positive, finite number.
    """
    # Statistics column -> race label returned for that column.
    # Columns without a dedicated label are grouped under 'Other'.
    race_by_column = {
        'Race Hispanic or Latino (of any race)': 'Latinx',
        'Race Not Hispanic or Latino!!White alone': 'White',
        'Race Not Hispanic or Latino!!Black or African American alone': 'Black',
        'Race Not Hispanic or Latino!!American Indian and Alaska Native alone': 'Other',
        'Race Not Hispanic or Latino!!Asian alone': 'Asian',
        'Race Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone': 'Other',
        'Race Not Hispanic or Latino!!Some other race alone': 'Other',
        'Race Not Hispanic or Latino!!Two or more races': 'Other',
    }

    # Percentages from the statistical data
    weights = np.array(
        [sel_stats_df[column].values[0] for column in race_by_column],
        dtype=float,
    )
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError(f"Race percentages must sum to a positive number, got {total}")

    # The percentages may not sum to exactly 100 (e.g. because of rounding) and
    # np.random.choice rejects probabilities that do not sum to 1, so normalise
    probabilities = weights / total

    # Get race category based on race probability distribution
    labels = list(race_by_column.values())
    return labels[np.random.choice(len(labels), p=probabilities)]

### Immigrant probability distribution

In [235]:
def get_random_immi(sel_stats_df):
    """Draw a random immigration status: 'yes' (foreign born) or 'no'.

    The probability of 'yes' is the 'Immigrant Foreign born' percentage found in
    the first row of ``sel_stats_df``.
    """
    # Possible outcomes, in the same order as the weights below
    outcomes = ('yes', 'no')

    # Percentage of foreign-born individuals, converted to a probability
    foreign_born_share = sel_stats_df['Immigrant Foreign born'].values[0] / 100
    weights = [foreign_born_share, 1 - foreign_born_share]

    picked = np.random.choice(np.arange(0, 2), p=weights)
    return outcomes[picked]

### Education probability distribution
**Less than high school:**
- Education Less than 9th grade
- Education 9th to 12th grade, no diploma

In [263]:
def get_random_edu(sel_stats_df):
    """Draw a random education level from the education distribution in the statistics.

    The statistics columns are grouped into the education categories of the
    shooter database. Whatever share is left after summing all columns is
    assigned to 'Unknown'.

    Parameters
    ----------
    sel_stats_df : pandas.DataFrame
        Statistics table containing the ``'Education ...'`` percentage columns
        listed below. Only the first row is read.

    Returns
    -------
    str
        One of 'Less than high school', 'High school/GED',
        'Some college/trade school', "Bachelor's degree",
        'Graduate school/advanced degree' or 'Unknown'.

    Raises
    ------
    ValueError
        If the resulting weights do not add up to a positive, finite number.
    """
    # Statistics column -> category based on shooter database.
    # Both 'below high school' columns are merged. Associate's degree has no
    # category of its own, so it is grouped with 'Some college/trade school'
    # (NOTE(review): closest available category - confirm against the shooter data).
    category_by_column = {
        "Education Less than 9th grade": 'Less than high school',
        "Education 9th to 12th grade, no diploma": 'Less than high school',
        "Education High school graduate (includes equivalency)": 'High school/GED',
        "Education Some college, no degree": 'Some college/trade school',
        "Education Associate's degree": 'Some college/trade school',
        "Education Bachelor's degree": "Bachelor's degree",
        "Education Graduate or professional degree": 'Graduate school/advanced degree',
    }

    # Sum the share of every column into its category (insertion order is kept)
    category_share = {}
    for column, category in category_by_column.items():
        share = sel_stats_df[column].values[0] / 100
        category_share[category] = category_share.get(category, 0.0) + share

    # Add probability for 'Unknown' to get a total probability = 1. Clamp at 0:
    # shares summing slightly above 1 would otherwise give a negative probability
    category_share['Unknown'] = max(0.0, 1 - sum(category_share.values()))

    edu_categories = list(category_share)
    weights = np.array(list(category_share.values()), dtype=float)
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError(f"Education percentages must sum to a positive number, got {total}")

    # Get education category based on education probability distribution;
    # normalise because np.random.choice needs probabilities that sum to 1
    return edu_categories[np.random.choice(len(edu_categories), p=weights / total)]

### Relationship status probability distribution
- Individuals younger than 18 are assumed to be single
- Of the individuals who never married, 38% are assumed to be single and the rest are assumed to be in a non-married relationship (source: https://www.pewresearch.org/social-trends/2021/10/05/rising-share-of-u-s-adults-are-living-without-a-spouse-or-partner/)

In [301]:
def get_random_relstatus(sel_stats_df, gender, age):
    """Draw a random relationship status for an individual.

    Parameters
    ----------
    sel_stats_df : pandas.DataFrame
        Statistics table containing the ``'RelStatus Male ...'`` and
        ``'RelStatus Female ...'`` percentage columns listed below. Only the
        first row is read, and only for 'Male'/'Female' adults.
    gender : str
        'Male', 'Female' or 'Other'.
    age : int
        Age of the individual in years.

    Returns
    -------
    str
        One of 'Single', 'Boyfriend/girlfriend', 'Married' or
        'Divorced/separated/widowed'.

    Raises
    ------
    ValueError
        If ``gender`` is not one of the supported values (adults only), or if
        the percentages do not add up to a positive, finite number.
    """
    # Probabilities for non-married individuals
    prob_single = 0.38
    prob_relationship = 1 - prob_single

    # Individuals younger than 18 are assumed to be single
    if age < 18:
        return 'Single'

    # Because no statistical data are available for non-binary, they are assumed
    # to be either single or in a non-married relationship
    if gender == 'Other':
        other_status = ['Single', 'Boyfriend/girlfriend']
        other_prob = [prob_single, prob_relationship]
        return other_status[np.random.choice(len(other_status), p=other_prob)]

    # Male and female individuals can make use of full statistical data.
    # Order: never married, married, separated, widowed, divorced.
    # 'Femaleale' is the spelling of that column in the statistics table.
    columns_by_gender = {
        'Male': [
            'RelStatus Male Never married',
            'RelStatus Male Now married, except separated',
            'RelStatus Male Separated',
            'RelStatus Male Widowed',
            'RelStatus Male Divorced',
        ],
        'Female': [
            'RelStatus Female Never married',
            'RelStatus Female Now married, except separated',
            'RelStatus Female Separated',
            'RelStatus Female Widowed',
            'RelStatus Femaleale Divorced',
        ],
    }
    if gender not in columns_by_gender:
        raise ValueError(
            f"Unsupported gender {gender!r}; expected 'Male', 'Female' or 'Other'"
        )

    # Get the probability distribution from the statistical data
    never_married, married, separated, widowed, divorced = (
        sel_stats_df[column].values[0] / 100
        for column in columns_by_gender[gender]
    )

    # Possible status, in the same order as the weights below
    relationship_status = ['Boyfriend/girlfriend', 'Married', 'Divorced/separated/widowed', 'Single']
    weights = np.array(
        [
            # Never married but in a non-married relationship
            never_married * prob_relationship,
            married,
            # Merge probability related to Divorced/separated/widowed
            separated + widowed + divorced,
            # Never married and single
            never_married * prob_single,
        ],
        dtype=float,
    )

    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError(f"Relationship percentages must sum to a positive number, got {total}")

    # Adjust probabilities to sum to 1. Scaling (rather than subtracting an
    # equal amount from every entry) can never turn a probability negative.
    probabilities = weights / total

    return relationship_status[np.random.choice(len(relationship_status), p=probabilities)]
    

### Employed probability distribution

In [413]:
def get_random_employed(sel_stats_df, age):
    """Draw a random employment status: 'Working' or 'Not working'.

    Parameters
    ----------
    sel_stats_df : pandas.DataFrame
        Statistics table containing the four ``'Employed_...'`` percentage
        columns listed below. Only the first row is read, and only for
        individuals aged 16 or older.
    age : int
        Age of the individual in years.

    Returns
    -------
    str
        'Working' or 'Not working'. The same spelling is used for every age so
        that the column holds a single 'not working' category.
        NOTE(review): casing assumed to match the shooter data - confirm.

    Raises
    ------
    ValueError
        If the percentages do not add up to a positive, finite number.
    """
    # Individuals younger than 16 are assumed to not be working
    if age < 16:
        return 'Not working'

    # Get the probability distribution from the statistical data
    employed, unemployed, military, not_in_labour_force = (
        sel_stats_df[column].values[0] / 100
        for column in (
            'Employed_Employed',
            'Employed_Unemployed',
            'Employed_Military',
            'Employed_NotLabour',
        )
    )

    # Group 'Employed' and 'Military' as employed. Other as unemployed
    weights = np.array(
        [employed + military, unemployed + not_in_labour_force],
        dtype=float,
    )
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError(f"Employment percentages must sum to a positive number, got {total}")

    # Possible status, in the same order as the weights above
    employed_status = ['Working', 'Not working']

    # Get status based on probability; normalise because np.random.choice
    # needs probabilities that sum to 1
    return employed_status[np.random.choice(len(employed_status), p=weights / total)]


### Military service probability distribution

In [433]:
def get_random_military(sel_stats_df, age):
    """Draw a random military status: 'Yes' or 'No'.

    Individuals aged 16 or younger always get 'No'. For older individuals the
    probability of 'Yes' is the 'Employed_Military' percentage found in the
    first row of ``sel_stats_df``.
    """
    # Only individuals older than 16 can get a 'Yes'
    if not age > 16:
        return 'No'

    # Possible outcomes, in the same order as the weights below
    outcomes = ('Yes', 'No')

    military_share = sel_stats_df['Employed_Military'].values[0] / 100
    weights = [military_share, 1 - military_share]

    picked = np.random.choice(np.arange(0, len(weights)), p=weights)
    return outcomes[picked]

### Arrest probability

In [470]:
def get_random_arrest(sel_stats_df):
    """Draw a random arrest status: 'Yes' or 'No'.

    The probability of 'Yes' is the 'Arrest_Rate' percentage found in the first
    row of ``sel_stats_df``.
    """
    # Possible outcomes, in the same order as the weights below
    outcomes = ('Yes', 'No')

    arrest_share = sel_stats_df['Arrest_Rate'].values[0] / 100
    weights = [arrest_share, 1 - arrest_share]

    picked = np.random.choice(np.arange(0, len(weights)), p=weights)
    return outcomes[picked]

### Parent divorce probability

In [None]:
# Value of the female 'Divorced' column for the selected state, converted from a percentage to a fraction
# ('Femaleale' is the spelling of the column in the statistics table)
# NOTE(review): presumably meant as a proxy for the probability of parental divorce - confirm
divorce_prob = sel_stats_df['RelStatus Femaleale Divorced'].values[0]/100
divorce_prob

In [None]:
# Scratch list of statistics columns. The original cell contained a bare ','
# line, which is a syntax error, so the names are now kept in a valid tuple.
# NOTE(review): presumably the columns for the attributes that still need a
# generator function - confirm, and add the entry that was left blank.
PENDING_STAT_COLUMNS = (
    'RelStatus Femaleale Divorced',
    'Mental_Illness_Rate',
    'Autism_Rate',
)
PENDING_STAT_COLUMNS

In [277]:
# Display the general-population table
genpop_df

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification


In [467]:
# Count how many shooter records have each value of the 'Arrested' column
shooter['Arrested'].value_counts()

Yes    102
No      91
Name: Arrested, dtype: int64

In [546]:
# Draw the attributes of one individual from the selected state's statistics.
# Age and gender come first because relationship status, employment and
# military service take them as inputs.
age = get_random_age(sel_stats_df)
gender = get_random_gender(sel_stats_df)
race = get_random_race(sel_stats_df)
immigrant = get_random_immi(sel_stats_df)
education = get_random_edu(sel_stats_df)
relsatus = get_random_relstatus(sel_stats_df, gender, age)
employed = get_random_employed(sel_stats_df, age)
military = get_random_military(sel_stats_df, age)
arrested = get_random_arrest(sel_stats_df)

# Show the generated individual as a dictionary
dict(
    age=age,
    gender=gender,
    race=race,
    immigrant=immigrant,
    education=education,
    relsatus=relsatus,
    employed=employed,
    military=military,
    arrested=arrested,
)

{'age': 43,
 'gender': 'Female',
 'race': 'Latinx',
 'immigrant': 'no',
 'education': 'Some college/trade school',
 'relsatus': 'Divorced/separated/widowed',
 'employed': 'Working',
 'military': 'No',
 'arrested': 'No'}

In [67]:
# Scratch check of random.randint: both endpoints are included, so this returns 0, 1, 2, 3 or 4
import random
random.randint(0, 4)

1