### Second Amendment
A **well regulated** Militia, being necessary to the security of a free State, the right of the people to keep and bear Arms, shall not be infringed.

## Import dependencies

In [41]:
import pandas as pd
from pathlib import Path
from data_exploration import dataset_info

# Local modules
from codes import code_column_names

## Load Data

In [42]:
csv = Path('full_database.csv')
attacks_df = pd.read_csv(csv)

In [44]:
info_df = dataset_info(attacks_df)
info_df

Unnamed: 0,columns,dtypes,elements,missing,unique
0,Case #,int64,193,0,193
1,Shooter Last Name,object,193,0,181
2,Shooter First Name,object,193,0,145
3,Full Date,object,193,0,189
4,Day of Week,object,193,0,8
...,...,...,...,...,...
148,On-Scene Outcome,int64,193,0,5
149,Who Killed Shooter On Scene,int64,193,0,4
150,Attempt to Flee,int64,193,0,3
151,Insanity Defense,int64,193,0,4


## Data types
Making sure the right type is used for each column

In [45]:
print(f"{len(info_df.loc[info_df['dtypes'] == 'int64',:])} columns are of type 'int64'")
print(f"{len(info_df.loc[info_df['dtypes'] == 'float64',:])} columns are of type 'float64'")
print(f"{len(info_df.loc[info_df['dtypes'] == 'object',:])} columns are of type 'object'")

info_df.loc[info_df['dtypes'] == 'object',:]

72 columns are of type 'int64'
44 columns are of type 'float64'
37 columns are of type 'object'


Unnamed: 0,columns,dtypes,elements,missing,unique
1,Shooter Last Name,object,193,0,181
2,Shooter First Name,object,193,0,145
3,Full Date,object,193,0,189
4,Day of Week,object,193,0,8
5,Day,object,193,0,32
8,Street Number,object,179,14,165
9,Street Name,object,187,6,178
10,City,object,193,0,173
11,State,object,193,0,42
12,County,object,193,0,141


In [46]:
attacks_df['Race'].value_counts().sort_index()

0.0    103
1.0     40
2.0     17
3.0     13
4.0      9
5.0      3
Name: Race, dtype: int64

## Detailed columns
Several columns have the word 'specify' in their name, meaning they contain detailed information that are not essential to trend analysis based on elements that can be common between several shoorters. These columns are isolated and removed from the primary database.

In [47]:
def find_keyword_in_list(keyword, llist):

    finds = []

    for c in llist:
        if keyword in c:
            finds.append(c)

    return finds

In [48]:
keyword_1 = 'Specify'
keyword_2 = 'Specified'
keyword_3 = 'Expanded'
keyword_4 = 'Medication Specified'

detailed_column = find_keyword_in_list(keyword_1, attacks_df.columns)
detailed_column += find_keyword_in_list(keyword_2, attacks_df.columns)
detailed_column += find_keyword_in_list(keyword_3, attacks_df.columns)
detailed_column += find_keyword_in_list(keyword_4, attacks_df.columns)


detailed_column

['Specify Armed Person',
 'Health Issues - Specify',
 'Specify Relationship to Other Shooting(s)',
 'Specify Pop Culture Connection',
 'Specify Other Weapons or Gear',
 'School Performance Specified',
 'Community Involvement Specified',
 'Domestic Abuse Specified',
 'Psychiatric Medication Specified',
 'Signs of Crisis Expanded',
 'Psychiatric Medication Specified']

In [49]:
# Create a new DataFrame from the attacks dataframe
reduced_attacks_df = attacks_df[:]

# Drop all columns containing detailed information
reduced_attacks_df = reduced_attacks_df.drop(columns=detailed_column)

In [50]:
reduced_info_df = dataset_info(reduced_attacks_df)

print(f"{len(reduced_info_df.loc[reduced_info_df['dtypes'] == 'int64',:])} columns are of type 'int64'")
print(f"{len(reduced_info_df.loc[reduced_info_df['dtypes'] == 'float64',:])} columns are of type 'float64'")
print(f"{len(reduced_info_df.loc[reduced_info_df['dtypes'] == 'object',:])} columns are of type 'object'")
print("---")
print(f"{len(reduced_attacks_df.columns)} TOTAL columns")

reduced_info_df.loc[reduced_info_df['dtypes'] == 'float64',:]

71 columns are of type 'int64'
44 columns are of type 'float64'
28 columns are of type 'object'
---
143 TOTAL columns


Unnamed: 0,columns,dtypes,elements,missing,unique
14,Latitude,float64,193,0,190
15,Longitude,float64,193,0,190
33,Race,float64,185,8,6
34,Height,float64,58,135,15
35,Weight,float64,46,147,28
36,Immigrant,float64,192,1,2
37,Sexual Orientation,float64,189,4,2
38,Religion,float64,94,99,5
39,Education,float64,140,53,5
40,School Performance,float64,89,104,3


In [87]:
# Find exact formatting and unconventional characters
col_to_drop = []

for c in reduced_attacks_df.columns:
    if 'Motive' in c:
        col_to_drop.append(c)

col_to_drop

['Motive: Racism/Xenophobia',
 'Motive: Religious Hate',
 'Motive: Misogyny',
 'Motive: Homophobia',
 'Motive: Employment Issue',
 'Motive: Economic Issue',
 'Motive: Legal Issue',
 'Motive: Relationship Issue',
 'Motive: Interpersonal Conflict\xa0',
 'Motive: Fame-Seeking',
 'Motive: Other\xa0',
 'Motive: Unknown']

In [88]:
# Find exact formatting and unconventional characters
col_to_drop = []

for c in reduced_attacks_df.columns:
    if 'Leak' in c:
        col_to_drop.append(c)

col_to_drop

['Leakage\xa0',
 'Leakage How',
 'Leakage Who\xa0',
 'Leakage Specific/Nonspecific\xa0']

In [96]:
non_shooter_columns = ['Full Date',
                       'Day of Week',
                       'Day',
                       'Month',
                       'Year',
                       'Street Number',
                       'Street Name',
                       'City',
                       'State',
                       'County',
                       'Zip Code',
                       'Latitude',
                       'Longitude',
                       'State Code',
                       'Region',
                       'Urban/Suburban/Rural',
                       'Metro/Micro Statistical Area Type',
                       'Location',
                       'Insider or Outsider',
                       'Workplace Shooting',
                       'Multiple Locations',
                       'Other Location',
                       'Armed Person on Scene',
                       'Number Killed',
                       'Number Injured',
                       'Family Member Victim',
                       'Romantic Partner Victim',
                       'Kidnapping or Hostage Situation',
                       'Motive: Racism/Xenophobia',
                       'Motive: Religious Hate',
                       'Motive: Misogyny',
                       'Motive: Homophobia',
                       'Motive: Employment Issue',
                       'Motive: Interpersonal Conflict\xa0',
                       'Motive: Other\xa0',
                       'Motive: Economic Issue',
                       'Motive: Legal Issue',
                       'Motive: Relationship Issue',
                       'Motive: Fame-Seeking',
                       'Motive: Unknown',
                       'Role of Psychosis in the Shooting',
                       'Leakage\xa0',
                       'Leakage How',
                       'Leakage Who\xa0',
                       'Leakage Specific/Nonspecific\xa0',
                       'Interest in Past Mass Violence',
                       'Relationship with Other Shooting(s)',
                       'Legacy Token',
                       'Pop Culture Connection',
                       'Planning',
                       'Performance',
                       'On-Scene Outcome',
                       'Who Killed Shooter On Scene',
                       'Attempt to Flee',
                       'Insanity Defense',
                       'Criminal Sentence']

In [97]:
non_shooter_columns

['Full Date',
 'Day of Week',
 'Day',
 'Month',
 'Year',
 'Street Number',
 'Street Name',
 'City',
 'State',
 'County',
 'Zip Code',
 'Latitude',
 'Longitude',
 'State Code',
 'Region',
 'Urban/Suburban/Rural',
 'Metro/Micro Statistical Area Type',
 'Location',
 'Insider or Outsider',
 'Workplace Shooting',
 'Multiple Locations',
 'Other Location',
 'Armed Person on Scene',
 'Number Killed',
 'Number Injured',
 'Family Member Victim',
 'Romantic Partner Victim',
 'Kidnapping or Hostage Situation',
 'Motive: Racism/Xenophobia',
 'Motive: Religious Hate',
 'Motive: Misogyny',
 'Motive: Homophobia',
 'Motive: Employment Issue',
 'Motive: Interpersonal Conflict\xa0',
 'Motive: Other\xa0',
 'Motive: Economic Issue',
 'Motive: Legal Issue',
 'Motive: Relationship Issue',
 'Motive: Fame-Seeking',
 'Motive: Unknown',
 'Role of Psychosis in the Shooting',
 'Leakage\xa0',
 'Leakage How',
 'Leakage Who\xa0',
 'Leakage Specific/Nonspecific\xa0',
 'Interest in Past Mass Violence',
 'Relationship w

In [98]:
shooters_df = reduced_attacks_df.drop(columns=non_shooter_columns)
shooters_df

Unnamed: 0,Case #,Shooter Last Name,Shooter First Name,Age,Gender,Race,Height,Weight,Immigrant,Sexual Orientation,...,Autism Spectrum,Substance Use,Health Issues,Head Injury / Possible TBI,Known Prejudices,Social Media Use,Interest in Firearms,Firearm Proficiency,Total Firearms Brought to the Scene,Other Weapons or Gear
0,1,Whitman,Charles,25,0,0.0,72.0,200.0,0.0,0.0,...,0,3,1,0,0,2,1.0,3.0,7,1
1,2,Smith,Robert,18,0,0.0,,,0.0,0.0,...,0,0,0,0,"2, 4",2,0.0,1.0,1,1
2,3,Held,Leo,39,0,0.0,72.0,200.0,0.0,0.0,...,0,0,0,0,0,2,1.0,3.0,2,1
3,4,Pearson,Eric,56,0,0.0,,,0.0,0.0,...,0,1,0,0,0,2,0.0,0.0,1,0
4,5,Lambright,Donald,31,0,1.0,,,0.0,0.0,...,0,0,0,0,1,2,0.0,3.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,191,Tran,Huu Can,72,0,3.0,70.0,150.0,1.0,0.0,...,0,0,0,0,0,0,1.0,1.0,2,1
189,192,Zhao,Chunli,66,0,3.0,,,1.0,0.0,...,0,0,0,0,0,0,1.0,1.0,1,0
190,193,Hale,Audrey,28,4,0.0,,,0.0,,...,1,0,0,0,0,1,1.0,1.0,3,0
191,194,Sturgeon,Connor,25,0,0.0,76.0,,0.0,,...,0,0,1,1,0,1,0.0,0.0,1,0


In [104]:
print(f"Shooters are represented by {len(shooters_df.columns)} parameters:")
shooters_df.columns

Shooters are represented by 87 parameters:


Index(['Case #', 'Shooter Last Name', 'Shooter First Name', 'Age', 'Gender',
       'Race', 'Height', 'Weight', 'Immigrant', 'Sexual Orientation',
       'Religion', 'Education', 'School Performance', 'Birth Order',
       'Number of Siblings', 'Older Siblings', 'Younger Siblings',
       'Relationship Status', 'Children', 'Employment Status',
       'Employment Type ', 'Military Service', 'Military Branch',
       'Community Involvement', 'Known to Police or FBI', 'Criminal Record',
       'Part I Crimes', 'Part II Crimes',
       'Highest Level of Justice System Involvement',
       'History of Physical Altercations', 'History of Animal Abuse',
       'History of Domestic Abuse', 'History of Sexual Offenses',
       'Gang Affiliation', 'Terror Group Affiliation',
       'Known Hate Group or Chat Room Affiliation', 'Violent Video Games',
       'Bully', 'Bullied', 'Raised by Single Parent',
       'Parental Divorce / Separation', 'Parental Death in Childhood',
       'Parental Suicide

## Dealing with NaN

### Information about the shooyters DataFrame

In [124]:
shooters_info_df = dataset_info(shooters_df)

print(f"{len(shooters_info_df.loc[shooters_info_df['dtypes'] == 'int64',:])} columns are of type 'int64'")
print(f"{len(shooters_info_df.loc[shooters_info_df['dtypes'] == 'float64',:])} columns are of type 'float64'")
print(f"{len(shooters_info_df.loc[shooters_info_df['dtypes'] == 'object',:])} columns are of type 'object'")
print("---")
print(f"{len(shooters_info_df.columns)} TOTAL columns")

shooters_info_df.sort_values('missing', ascending=False)

45 columns are of type 'int64'
28 columns are of type 'float64'
14 columns are of type 'object'
---
5 TOTAL columns


Unnamed: 0,columns,dtypes,elements,missing,unique
72,Medication Category,object,44,149,23
7,Weight,float64,46,147,28
22,Military Branch,float64,49,144,6
6,Height,float64,58,135,15
12,School Performance,float64,89,104,3
...,...,...,...,...,...
50,Parental Substance Abuse,int64,193,0,2
51,Parent Criminal Record,int64,193,0,2
52,Family Member Incarcerated,int64,193,0,2
53,Adult Trauma,object,193,0,8


In [127]:
shooters_info_df.loc[shooters_info_df['dtypes'] == 'object',:]

Unnamed: 0,columns,dtypes,elements,missing,unique
1,Shooter Last Name,object,193,0,181
2,Shooter First Name,object,193,0,145
19,Employment Status,object,182,11,3
26,Part I Crimes,object,193,0,21
27,Part II Crimes,object,193,0,40
35,Known Hate Group or Chat Room Affiliation,object,192,1,6
53,Adult Trauma,object,193,0,8
54,Recent or Ongoing Stressor,object,193,0,23
70,Voluntary or Mandatory Counseling,object,193,0,4
72,Medication Category,object,44,149,23


### Finding columns in the shooters DataFrame with NaN

In [118]:
shooters_df.columns[shooters_df.isna().any()].tolist()

['Race',
 'Height',
 'Weight',
 'Immigrant',
 'Sexual Orientation',
 'Religion',
 'Education',
 'School Performance',
 'Birth Order',
 'Number of Siblings',
 'Older Siblings',
 'Younger Siblings',
 'Relationship Status',
 'Children',
 'Employment Status',
 'Employment Type\xa0',
 'Military Branch',
 'Community Involvement',
 'Known Hate Group or Chat Room Affiliation',
 'Violent Video Games',
 'Bully',
 'Bullied',
 'Physically Abused',
 'Childhood SES',
 'Timeline of Signs of Crisis',
 'Prior Counseling',
 'Psychiatric Medication',
 'Medication Category',
 'FASD (Fetal Alcohol Spectrum Disorder)',
 'Substance Use',
 'Known Prejudices\xa0',
 'Interest in Firearms',
 'Firearm Proficiency']

In [117]:
shooters_df.loc[shooters_df['Race'].isna()]

Unnamed: 0,Case #,Shooter Last Name,Shooter First Name,Age,Gender,Race,Height,Weight,Immigrant,Sexual Orientation,...,Autism Spectrum,Substance Use,Health Issues,Head Injury / Possible TBI,Known Prejudices,Social Media Use,Interest in Firearms,Firearm Proficiency,Total Firearms Brought to the Scene,Other Weapons or Gear
23,24,Leacock,Alban,41,0,,,,0.0,0.0,...,0,0,0,0,0,2,0.0,0.0,1,0
30,31,Hammett,Mansel,39,0,,,,0.0,0.0,...,0,0,0,0,0,2,0.0,1.0,1,0
42,43,Daigneau,Robert,45,0,,,,0.0,0.0,...,0,0,1,0,0,2,0.0,3.0,1,0
46,47,Miller,John,50,0,,,,0.0,0.0,...,0,1,0,0,0,2,0.0,3.0,1,0
56,57,Simpson,James,28,0,,,,0.0,0.0,...,0,0,0,0,0,2,0.0,0.0,2,0
84,85,Lockey,William,54,0,,,,0.0,0.0,...,0,0,0,0,0,0,0.0,1.0,2,0
89,90,Brown,Elijah,21,0,,,,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,2,0
95,96,Cranshaw,Freddy,54,0,,,,0.0,0.0,...,0,0,1,0,0,0,0.0,1.0,2,0


### Filling the gaps
Some search is made on the different individuals: https://www.reddit.com/r/masskillers/comments/14ikako/american_mass_shooters_that_i_cant_find_any/

These individuals seem to be hard to find information about...


In [119]:
shooters_valid_df = shooters_df.dropna(how='any')

### Dropping all NaN is simply not an option...

In [122]:
shooters_valid_df

Unnamed: 0,Case #,Shooter Last Name,Shooter First Name,Age,Gender,Race,Height,Weight,Immigrant,Sexual Orientation,...,Autism Spectrum,Substance Use,Health Issues,Head Injury / Possible TBI,Known Prejudices,Social Media Use,Interest in Firearms,Firearm Proficiency,Total Firearms Brought to the Scene,Other Weapons or Gear
8,9,Essex,Mark,23,0,1.0,64.0,135.0,0.0,0.0,...,0,2,0,0,"1, 4",2,0.0,3.0,1,1
105,106,Kazmierczak,Steven,27,0,0.0,72.0,300.0,0.0,1.0,...,0,3,0,0,1,1,0.0,2.0,4,1
152,155,Kelley,Devin Patrick,26,0,0.0,69.0,165.0,0.0,0.0,...,0,3,1,1,0,1,1.0,3.0,3,1


In [11]:
reduced_attacks_df['Recent or Ongoing Stressor'].value_counts()

0          55
2          47
1          14
1, 2       14
6          12
4           6
5           6
3           6
2, 5        6
2, 3        5
1, 3        4
4, 5        4
1, 2, 4     2
1, 3, 5     2
2, 4        2
3, 5, 6     1
3, 4        1
1, 5        1
1, 2, 5     1
3, 5        1
2, 6        1
1, 2, 3     1
2,3,6       1
Name: Recent or Ongoing Stressor, dtype: int64

In [12]:
reduced_attacks_df['Voluntary or Mandatory Counseling'].value_counts()

0       139
1        35
2        15
1, 2      4
Name: Voluntary or Mandatory Counseling, dtype: int64

In [13]:
reduced_attacks_df['Mental Illness'].value_counts()

0       60
4       46
2       31
1       30
1, 2    17
3        9
Name: Mental Illness, dtype: int64

In [14]:
reduced_attacks_df['Known Family Mental Health History'].value_counts()

0       161
1        21
2        10
1, 2      1
Name: Known Family Mental Health History, dtype: int64

# Questions

## Question 1: what are the states with the most and fewest shootings

In [16]:
attacks_per_state_df = attacks_df.groupby('State')['Case #'].count().sort_values(ascending=False)

Nt = 7

print(f"The Top {Nt} states for mass shootings are:")
display(attacks_per_state_df.head(Nt))

print(f"The Bottom {Nt} states for mass shootings are:")
display(attacks_per_state_df.tail(Nt))

The Top 7 states for mass shootings are:


State
CA    30
TX    21
FL    13
NY    10
PA     9
CO     9
WA     8
Name: Case #, dtype: int64

The Bottom 7 states for mass shootings are:


State
RI    1
NH    1
DC    1
NE    1
UT    1
MA    1
MD    1
Name: Case #, dtype: int64

### Follow up questions
- More people in a state = more shootings?
- Political orientation
- Views on gun in the state
- Background checks

## Question 2: What are some common traits (background) of shooters?

### Most common race and gender

In [17]:
Codes = {
    'background': {
        'gender': {
            '0': 'Male',
            '1': 'Female',
            '3': 'Non-Binary',
            '4': 'Transgender'
        },
        'race': {
            '0': 'White',
            '1': 'Black',
            '2': 'Latinx',
            '3': 'Asian',
            '4': 'Middle Eastern',
            '5': 'Native American'
        }
    }
}

In [33]:
attacks_df['Race'].value_counts()

0           102
1            40
2            17
3            13
4             8
5             3
Moroccan      1
Bosnian       1
Name: Race, dtype: int64

In [36]:
per_gender_df = attacks_df['Gender'].value_counts()
per_gender_df.index

Float64Index([0.0, 1.0, 3.0, 4.0], dtype='float64')

In [27]:
gender = 2

Codes['background']['gender'][str(gender)]

KeyError: '2'