# Generate Individuals
We use census data and other sources to generate statistical individuals to be used to train and test the risk assessment tool

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import random

# Ignnore warning
import warnings
warnings.simplefilter(action='ignore')

In [2]:
def normalise(distribution):    
    # Return a unit vector
    return [p/np.sum(distribution) for p in distribution]

### Age probability distribution

In [3]:
def get_random_age(sel_stats_df):

    # Columns related to the age probability distribution
    age_columns = [
        'Age Under 5 years',
        'Age 5 to 9 years',
        'Age 10 to 14 years',
        'Age 15 to 19 years',
        'Age 20 to 24 years',
        'Age 25 to 34 years',
        'Age 35 to 44 years',
        'Age 45 to 54 years',
        'Age 55 to 59 years',
        'Age 60 to 64 years',
        'Age 65 to 74 years',
        'Age 75 to 84 years',
        'Age 85 years and over'
        ]

    # List holding the probability distribution
    age_prob_distr = []

    # Get the probability distribution from the statistical data
    for p in age_columns:
        age_prob_distr.append(sel_stats_df[p].values[0]/100)

    age_prob_distr = normalise(age_prob_distr)

    # Get age bracket based on age probability distribution
    age_bracket = age_columns[np.random.choice(np.arange(0, len(age_prob_distr)), p=age_prob_distr)]

    # Get age by selecting age within the bracket at random (all ages within the bracket have the same probability)
    age = 0

    if age_bracket == 'Age Under 5 years':
        age = random.randint(0, 4)
    elif age_bracket == 'Age 5 to 9 years':
        age = random.randint(5, 9)
    elif age_bracket == 'Age 10 to 14 years':
        age = random.randint(10, 14)
    elif age_bracket == 'Age 15 to 19 years':
        age = random.randint(15, 19)
    elif age_bracket == 'Age 20 to 24 years':
        age = random.randint(20, 24)
    elif age_bracket == 'Age 25 to 34 years':
        age = random.randint(25, 34)
    elif age_bracket == 'Age 35 to 44 years':
        age = random.randint(35, 44)
    elif age_bracket == 'Age 45 to 54 years':
        age = random.randint(45, 54)
    elif age_bracket == 'Age 55 to 59 years':
        age = random.randint(55, 59)
    elif age_bracket == 'Age 60 to 64 years':
        age = random.randint(60, 64)
    elif age_bracket == 'Age 65 to 74 years':
        age = random.randint(65, 74)
    elif age_bracket == 'Age 75 to 84 years':
        age = random.randint(75, 84)
    elif age_bracket == 'Age 85 years and over':
        age = random.randint(85, 94)

    return age

### Gender probability distribution

In [4]:
def get_random_gender(sel_stats_df):
    # Number of male for 100 female
    m_p_100f = sel_stats_df['Gender ratio (males per 100 females)'].values[0]

    # Probability of individual being a man
    prob_male = m_p_100f/(m_p_100f + 100)

    # Adjust probability for non-binary
    # Source: https://www.pewresearch.org/short-reads/2022/06/07/about-5-of-young-adults-in-the-u-s-say-their-gender-is-different-from-their-sex-assigned-at-birth/
    prob_other = 0.016

    prob_male = prob_male-prob_other/2
    prob_female = 1-prob_male-prob_other

    gender_options = ['Male', 'Female', 'Other']
    gender_prob_distr = [prob_male, prob_female, prob_other]

    gender_prob_distr = normalise(gender_prob_distr)

    gender = np.random.choice(np.arange(0, len(gender_prob_distr)), p=gender_prob_distr)
    
    return gender_options[gender]

### Race probability distribution

In [5]:
def get_random_race(sel_stats_df):
    race_columns = [
        'Race Hispanic or Latino (of any race)',
        'Race Not Hispanic or Latino!!White alone',
        'Race Not Hispanic or Latino!!Black or African American alone',
        'Race Not Hispanic or Latino!!American Indian and Alaska Native alone',
        'Race Not Hispanic or Latino!!Asian alone',
        'Race Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone',
        'Race Not Hispanic or Latino!!Some other race alone',
        'Race Not Hispanic or Latino!!Two or more races',
    ]

    # List holding the probability distribution
    race_prob_distr = []

    # Get the probability distribution from the statistical data
    for p in race_columns:
        race_prob_distr.append(sel_stats_df[p].values[0]/100)

    race_prob_distr = normalise(race_prob_distr)

    # Get race category based on age probability distribution
    race_category = race_columns[np.random.choice(np.arange(0, len(race_prob_distr)), p=race_prob_distr)]

    race = 'Other'

    if race_category == 'Race Not Hispanic or Latino!!White alone':
        race = 'White'
    elif race_category == 'Race Not Hispanic or Latino!!Black or African American alone':
        race = 'Black'
    elif race_category == 'Race Not Hispanic or Latino!!Asian alone':
        race = 'Asian'
    elif race_category == 'Race Hispanic or Latino (of any race)':
        race = 'Latinx'
    else: 
        race = 'Other'

    return race

### Immigrant probability distribution

In [6]:
def get_random_immi(sel_stats_df):
    # Probability of immmigrant
    prob_immi = sel_stats_df['Immigrant Foreign born'].values[0]/100

    # Immigration satus
    immigrant_status = ['Yes', 'No']

    # List holding the probability distribution
    immigrant = immigrant_status[np.random.choice(np.arange(0, 2), p=[prob_immi, 1-prob_immi])]
    return immigrant

### Education probability distribution
**Less than high school:**
- Education Less than 9th grade
- Education 9th to 12th grade, no diploma

**Work type**

Whether an individual classify as 'blue collar', 'white collar' or 'both' is assumed to be fully dependent on education.

In [7]:
def get_work_type(education):
     if (education == 'Less than high school') | (education == 'High school/GED'):
          work = 'Blue collar'
     elif education == 'Some college/trade school':
          work = 'In between'
     elif (education == "Bachelor's degree") | (education == 'Graduate school/advanced degree'):
          work = 'White collar'
     else:
          work = 'Unknown'

     return work

def get_random_edu(sel_stats_df):
     edu_columns = [
          "Education Less than 9th grade",
          "Education 9th to 12th grade, no diploma",
          "Education High school graduate (includes equivalency)",
          "Education Some college, no degree",
          "Education Associate's degree",
          "Education Bachelor's degree",
          "Education Graduate or professional degree",
     ]

     # Categories based on shooter database
     edu_categories = [
          'Less than high school',
          'High school/GED',
          'Some college/trade school',
          "Bachelor's degree",
          'Graduate school/advanced degree'
     ]

     # List holding the probability distribution
     edu_prob_distr = []

     # Get the probability distribution from the statistical data
     for p in edu_columns:
          edu_prob_distr.append(sel_stats_df[p].values[0]/100)

     # Merge probabilities corresponding to 'below high school'
     edu_prob_distr = [
          edu_prob_distr[0] + edu_prob_distr[1],
          edu_prob_distr[2],
          edu_prob_distr[3] + edu_prob_distr[4],
          edu_prob_distr[5],
          edu_prob_distr[6]
     ]

     edu_prob_distr = normalise(edu_prob_distr)

     # Get education category based on age probability distribution
     education = edu_categories[np.random.choice(np.arange(0, len(edu_prob_distr)), p=edu_prob_distr)]

     return education

### Relationship status probability distribution
- Individuals of less than 16 -> single
- Never married, assume that 38% are single (source: https://www.pewresearch.org/social-trends/2021/10/05/rising-share-of-u-s-adults-are-living-without-a-spouse-or-partner/)

In [8]:
def get_random_relstatus(sel_stats_df, gender, age):

    # Probabilities for non-married individuals
    prob_single = 0.38
    prob_relationship = 1-prob_single

    # Individuals younger than 18 are assumed to be single
    if age < 18:
        status = 'Single'

    # Statistical data can be used for older individuals
    else:
        # Because no statistical data are available for non-binary, they are assumed to be either single or in a non-married relationship
        if gender == 'Other':
            other_status = ['Single', 'Boyfriend/girlfriend']
            other_prob = [prob_single, prob_relationship]
            status = other_status[np.random.choice(np.arange(0, len(other_prob)), p=other_prob)]
                
        # Male and female individuals can make use of full statistical data
        else:

            male_relstatus_columns = [
                'RelStatus Male Never married',
                'RelStatus Male Now married, except separated',
                'RelStatus Male Separated',
                'RelStatus Male Widowed',
                'RelStatus Male Divorced'
            ]

            female_relstatus_columns = [
                'RelStatus Female Never married',
                'RelStatus Female Now married, except separated',
                'RelStatus Female Separated',
                'RelStatus Female Widowed',
                'RelStatus Femaleale Divorced'
            ]

            # Select probability based on gender
            if gender == 'Male':
                relstatus_columns = male_relstatus_columns
            elif gender == 'Female':
                relstatus_columns = female_relstatus_columns

            # List holding the probability distribution
            relstatus_prob_distr = []

            # Get the probability distribution from the statistical data
            for p in relstatus_columns:
                relstatus_prob_distr.append(sel_stats_df[p].values[0]/100)

            # Merge probability related to Divorced/separated/widowed
            relstatus_prob_distr[-3] = relstatus_prob_distr[-3] + relstatus_prob_distr[-2] + relstatus_prob_distr[-1]
            relstatus_prob_distr = relstatus_prob_distr[:-2]

            # Calculate probability of being single or in a non-married relationship for people that were never married
            relstatus_prob_distr.append(relstatus_prob_distr[0]*prob_single)
            relstatus_prob_distr[0] = relstatus_prob_distr[0]*prob_relationship

            # Adjust probabilities to sum to 1
            relstatus_prob_distr = normalise(relstatus_prob_distr)

            # Possible status
            relationship_status = ['Boyfriend/girlfriend', 'Married', 'Divorced/separated/widowed', 'Single']

            status = relationship_status[np.random.choice(np.arange(0, len(relstatus_prob_distr)), p=relstatus_prob_distr)]

    return status
    

### Employed probability distribution

In [9]:
def get_random_employed(sel_stats_df, age):
    employed_columns = [
        'Employed_Employed',
        'Employed_Unemployed',
        'Employed_Military',
        'Employed_NotLabour']

    # List holding the probability distribution
    employed_prob_distr = []

    # Get the probability distribution from the statistical data
    for p in employed_columns:
        employed_prob_distr.append(sel_stats_df[p].values[0]/100)

    # Indivduals younger than 16 are assumed to not be working
    if age < 16:
        employed = 'Not working'
    else:
        # Group 'Employed' and 'Military' as employed. Other as unemployed
        employed_prob_distr = [
            employed_prob_distr[0]+employed_prob_distr[2],
            employed_prob_distr[1]+employed_prob_distr[3]
        ]

        employed_prob_distr = normalise(employed_prob_distr)

        # Possible status
        employed_status = ['Working', 'Not Working']

        # Get status based on probability
        employed = employed_status[np.random.choice(np.arange(0, len(employed_prob_distr)), p=employed_prob_distr)]
    
    return employed


### Military service probability distribution

In [10]:
def get_random_military(sel_stats_df, age):
    if age > 16:
        mil_prob = sel_stats_df['Employed_Military'].values[0]/100
        civ_prob = 1-mil_prob

        mil_prob_distr = [mil_prob, civ_prob]

        mil_status = ['Yes', 'No']

        military = mil_status[np.random.choice(np.arange(0, len(mil_prob_distr)), p=mil_prob_distr)]
    else:
        military = 'No'

    return military

### Arrest probability

In [11]:
def get_random_arrest(sel_stats_df):
    arrest_prob = sel_stats_df['Arrest_Rate'].values[0]/100
    arrest_prob_distr = [arrest_prob, 1-arrest_prob]
    arrest_status = ['Yes', 'No']
    arrested = arrest_status[np.random.choice(np.arange(0, len(arrest_prob_distr)), p=arrest_prob_distr)]
    return arrested

### Parent divorce probability

In [12]:
def get_random_divorce(sel_stats_df):
    # Proabability of a female getting a divorce
    divorce_prob = sel_stats_df['RelStatus Femaleale Divorced'].values[0]/100
    divorce_prob_distr = [divorce_prob, 1-divorce_prob]
    divorce_status = ['Yes', 'No evidence']
    divorce = divorce_status[np.random.choice(np.arange(0, len(divorce_prob_distr)), p=divorce_prob_distr)]
    return divorce

### Socioeconomic status (SES) probability distribution
- Source: https://en.wikipedia.org/wiki/Social_class_in_the_United_States#/media/File:Class_US.svg

In [13]:
def get_random_ses(sel_stats_df):
    ses_columns = [
        'SES less than $10,000',
        'SES between $10,000 to $14,999',
        'SES between $15,000 to $24,999',
        'SES between $25,000 to $34,999',
        'SES between $35,000 to $49,999',
        'SES between $50,000 to $74,999',
        'SES between $75,000 to $99,999',
        'SES between $100,000 to $149,999',
        'SES between $150,000 to $199,999',
        'SES $200,000 or more'
    ]

    # List holding the probability distribution
    ses_prob_distr = []

    # Get the probability distribution from the statistical data
    for p in ses_columns:
        ses_prob_distr.append(sel_stats_df[p].values[0]/100)

    # Make sure the probability is not greater than one
    ses_prob_distr = normalise(ses_prob_distr)
        

    # Merge probabilities based on shooter categories (3 categories)
    ses_prob_distr = [
        np.sum(ses_prob_distr[:4]),
        np.sum(ses_prob_distr[4:7]),
        np.sum(ses_prob_distr[7:])
    ]

    ses_class = ['Lower class', 'Middle class', 'Upper class']

    ses = ses_class[np.random.choice(np.arange(0, len(ses_prob_distr)), p=ses_prob_distr)]
    return ses

### Mental Illness Probability
- Probability is assumed to be the same for parents and children
- Parent and children mental illness probability are assumed independent

In [14]:
def get_random_mentalillness(sel_stats_df):
    mental_prob = sel_stats_df['Mental_Illness_Rate'].values[0]/100
    mental_prob_distr = [mental_prob, 1-mental_prob]
    mental_status = ['Yes', 'No evidence']
    mental_illness = mental_status[np.random.choice(np.arange(0, len(mental_prob_distr)), p=mental_prob_distr)]
    return mental_illness

### Autism probability

In [15]:
def get_random_autism(sel_stats_df):
    autism_prob = sel_stats_df['Autism_Rate'].values[0]/100
    autism_prob_distr = [autism_prob, 1-autism_prob]
    autism_status = ['Diagnosed or extremely likely', 'No evidence']
    autism = autism_status[np.random.choice(np.arange(0, len(autism_prob_distr)), p=autism_prob_distr)]
    return autism

### Health issue probability

In [16]:
def get_random_healthissue(sel_stats_df):
    health_prob = sel_stats_df['HealthIssues Disability Percent'].values[0]/100
    health_prob_distr = [health_prob, 1-health_prob]
    health_status = ['Yes', 'No evidence']
    healthissue = health_status[np.random.choice(np.arange(0, len(health_prob_distr)), p=health_prob_distr)]
    return healthissue

## Generate individuals representative of general population

### Function to create a random individual based on statistics

In [17]:
def random_individual(sel_stats_df):

    age = get_random_age(sel_stats_df)
    gender = get_random_gender(sel_stats_df)
    education = get_random_edu(sel_stats_df)

    return {
        'Age': age,
        'Gender': gender,
        'Race': get_random_race(sel_stats_df),
        'Immigrant': get_random_immi(sel_stats_df),
        'Education': education,
        'RelStatus': get_random_relstatus(sel_stats_df, gender, age),
        'Employed': get_random_employed(sel_stats_df, age),
        'Work': get_work_type(education),
        'MilService': get_random_military(sel_stats_df, age),
        'Arrested': get_random_arrest(sel_stats_df),
        'ParentDivorce': get_random_divorce(sel_stats_df),
        'SES': get_random_ses(sel_stats_df),
        'MentalIllness': get_random_mentalillness(sel_stats_df),
        'MentalIllnessHistory': get_random_mentalillness(sel_stats_df),
        'Autism': get_random_autism(sel_stats_df),
        'HealthIssues': get_random_healthissue(sel_stats_df),
        'Classification': 0
        }

### Generate a DataFrame of 10000 individuals

In [21]:
# Define number of individuals
N = 200

In [22]:
# Load the statistics for the US population
csv = Path('model_blackbox_stats.csv')
stats_df = pd.read_csv(csv)

# Get all the states in a list
all_states = stats_df['State'].tolist()

# Declare empty general population list
gen_population = []

for i in range(N):

    # Select state to get the statistics from
    state = all_states[random.randrange(0,len(all_states))]

    # Generate individual and add to population list
    gen_population.append(random_individual(stats_df.loc[stats_df['State'] == state,:]))

genpop_df = pd.DataFrame(gen_population)
genpop_df


Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification
0,35,Female,White,No,Less than high school,Single,Working,Blue collar,No,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,0
1,5,Male,White,No,High school/GED,Single,Not working,Blue collar,No,No,Yes,Middle class,No evidence,No evidence,No evidence,No evidence,0
2,24,Male,White,No,Some college/trade school,Married,Not Working,In between,No,No,Yes,Lower class,No evidence,Yes,No evidence,No evidence,0
3,38,Female,Black,No,Less than high school,Divorced/separated/widowed,Not Working,Blue collar,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence,0
4,65,Male,White,No,Bachelor's degree,Boyfriend/girlfriend,Working,White collar,No,No,No evidence,Upper class,No evidence,No evidence,No evidence,No evidence,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,61,Female,Latinx,No,Bachelor's degree,Single,Working,White collar,No,No,No evidence,Upper class,No evidence,No evidence,No evidence,No evidence,0
196,55,Female,White,No,Bachelor's degree,Married,Not Working,White collar,No,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,0
197,37,Female,White,No,Some college/trade school,Boyfriend/girlfriend,Working,In between,No,Yes,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,0
198,31,Male,Black,No,Graduate school/advanced degree,Married,Not Working,White collar,No,No,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,0


### Save the DataFrame with random individuals to a CSV

In [24]:
csv_out = Path('model_blackbox_demo_200.csv')
genpop_df.to_csv(csv_out, index=False)